Example #1
def data_initialization(train_file, dev_file, test_file, emb_file):
    data = Data()
    data.number_normalized = True
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)
    data.fix_alphabet()
    data.generate_instance(train_file, 'train')
    data.generate_instance(dev_file, 'dev')
    data.generate_instance(test_file, 'test')
    data.build_word_pretrain_emb(emb_file)
    return data
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", default=False, action='store_true')
    parser.add_argument("--do_eval", default=False, action='store_true')
    parser.add_argument('--do_predict', default=False, action='store_true')
    parser.add_argument('--markup', default='bmeso', type=str)
    parser.add_argument("--arch", default='transformer', type=str)
    parser.add_argument('--learning_rate', default=0.001, type=float)
    parser.add_argument('--seed', default=1234, type=int)
    parser.add_argument('--gpu', default='0', type=str)
    parser.add_argument('--epochs', default=100, type=int)
    parser.add_argument('--batch_size', default=16, type=int)
    # parser.add_argument('--hidden_size', default=512, type=int)
    parser.add_argument("--grad_norm",
                        default=5.0,
                        type=float,
                        help="Max gradient norm.")

    args = parser.parse_args()

    args.data_dir = config.data_dir
    args.output_dir = config.output_dir / '{}-on-lstm'.format(config.dataset)
    if not args.output_dir.exists():
        args.output_dir.mkdir()
    init_logger(log_file=str(args.output_dir / '{}.log'.format(args.arch)))
    seed_everything(args.seed)

    # select the visible GPU(s)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    if os.path.exists(config.save_data_name):
        print('Loading processed data')
        with open(config.save_data_name, 'rb') as fp:
            data = pickle.load(fp)
    else:
        data = Data()
        data_initialization(data, config.gaz_file, config.train_path,
                            config.dev_path, config.test_path)
        data.generate_instance_with_gaz(config.train_path, 'train')
        data.generate_instance_with_gaz(config.dev_path, 'dev')
        data.generate_instance_with_gaz(config.test_path, 'test')
        data.build_word_pretrain_emb(config.char_emb)
        data.build_biword_pretrain_emb(config.bichar_emb)
        data.build_gaz_pretrain_emb(config.gaz_file)
        print('Dumping data')
        with open(config.save_data_name, 'wb') as f:
            pickle.dump(data, f)
    if args.do_train:
        train(args, data, NERModel)

    if args.do_predict:
        predict(args, data, NERModel, 'test')
Example #3
def data_initialization(train_file, dev_file, test_file, emb_file):
    data = Data()
    data.number_normalized = True
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)
    data.fix_alphabet()
    data.generate_instance(train_file, 'train')
    data.generate_instance(dev_file, 'dev')
    data.generate_instance(test_file, 'test')
    word_emb_norm = False
    data.build_word_pretrain_emb(emb_file, word_emb_norm)
    data.show_data_summary()
    sys.stdout.flush()
    return data
Example #4
    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 10
        data.use_bigram = False
        data.gaz_dropout = 0.5
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)

        data.generate_instance_with_gaz(train_file, 'train')
        data.generate_instance_with_gaz(dev_file, 'dev')
        data.generate_instance_with_gaz(test_file, 'test')

        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
        train(data, save_model_dir, dset_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance_with_gaz(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
Example #5
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 1
        data.use_bigram = False
        data.gaz_dropout = 0.5
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)
        data.generate_instance_with_gaz(train_file, 'train')
        data.generate_instance_with_gaz(dev_file, 'dev')
        data.generate_instance_with_gaz(test_file, 'test')
        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
        train(data, save_model_dir, seg)
    elif status == 'test':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(dev_file, 'dev')
        load_model_decode(model_dir, data, 'dev', gpu, seg)
        data.generate_instance_with_gaz(test_file, 'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(raw_file, 'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
Example #6
class Train_Model:
    def __init__(self):

        self.gaz_file = 'D:\\mygit\\NER_MODEL\\data\\data\\ctb.50d.vec'
        self.char_emb = 'D:\\mygit\\NER_MODEL\\data\\data\\gigaword_chn.all.a2b.uni.ite50.vec'
        self.train_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.train.char'
        self.dev_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.dev.char'
        self.test_file = 'D:\\mygit\\NER_MODEL\\data\\data\\demo.test.char'
        self.model_save_path = 'D:\\mygit\\NER_MODEL\\models\\ckpt'

        self.batch_size = 64
        self.max_char_len = 100
        self.emb_size = 50
        self.max_lexicon_words_num = 5
        self.num_units = 128
        self.num_tags = 18
        self.learning_rate = 0.005
        self.optimizer = 'adam'
        self.epoch = 0
        self.bichar_emb = None
        self.data = Data()
        self.load_data_and_embedding()
        self.model = Model_Lattice(self.max_char_len, self.emb_size,
                                   self.max_lexicon_words_num, self.num_units,
                                   self.num_tags, self.learning_rate)
        self.saver = tf.train.Saver()

    def train(self, epochs=10):
        init = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.8
        with tf.Session(config=config) as sess:
            sess.run(init)
            for iter in range(epochs):
                loss = []
                print('iter: ', iter)
                random.shuffle(self.data.train_Ids)
                train_num = len(self.data.train_Ids)
                total_batch = train_num // self.batch_size
                for batch_id in range(total_batch):
                    start = batch_id * self.batch_size
                    end = (batch_id + 1) * self.batch_size

                    if end > train_num:
                        end = train_num

                    instance = self.data.train_Ids[start:end]
                    if not instance:
                        continue

                    self.epoch += 1
                    _, char_ids, lexicon_word_ids, word_length_tensor, _, labels = self.batch_with_label(
                        instance)

                    # run the model
                    feed_dict = {
                        self.model.placeholders["char_ids"]: char_ids,
                        self.model.placeholders["lexicon_word_ids"]:
                        lexicon_word_ids,
                        self.model.placeholders["word_length_tensor"]:
                        word_length_tensor,
                        self.model.placeholders["labels"]: labels,
                    }

                    _, losses, step = sess.run([
                        self.model.train_op, self.model.loss,
                        self.model.global_step
                    ],
                                               feed_dict=feed_dict)
                    loss.append(losses)
                    # print(loss)
                    self.ls = sum(loss) / len(loss)
                if self.epoch % 1 == 0:
                    print('*' * 100)
                    print(self.epoch, 'loss', self.ls)

                    # self.evaluate(sess, data)
                    self.evaluate_line(sess, [
                        '习', '近', '平', '在', '北', '京', '中', '南', '海', '呼', '吁',
                        '美', '国', '加', '强', '合', '作', '共', '创', '美', '好', '生',
                        '活'
                    ])
                self.saver.save(sess,
                                os.path.join(self.model_save_path, "ner.dat"))

    def batch_with_label(self, input_batch_list, is_train=True):
        """
        input: list of words, chars and labels, various length.
            [[words,biwords,chars,gaz,labels], [words,biwords,chars,gaz,labels],...]
            words: word ids for one sentence. (batch_size, sent_len)
            chars: char ids for one sentences, various length. (batch_size, sent_len, each_word_length)
        output:
            char_ids: (batch_size, )
            lexicon_word_ids: (batch_size, )
            word_length_tensor: (batch_size, )
            labels: (batch_size, )
        """
        # batch_size = len(input_batch_list)
        lengths = [
            len(sent[0][0:self.max_char_len]) for sent in input_batch_list
        ]
        chars_ids = [sent[0][0:self.max_char_len] for sent in input_batch_list]
        biwords = [sent[1][0:self.max_char_len] for sent in input_batch_list]
        # chars_ids_split = [sent[2][0:self.max_char_len] for sent in input_batch_list]
        # lexicon_words = [sent[3][0:self.max_char_len] for sent in input_batch_list]

        if is_train:
            target = [
                sent[4][0:self.max_char_len] for sent in input_batch_list
            ]

        chars_ids = list(
            map(lambda l: l + [0] * (self.max_char_len - len(l)), chars_ids))
        # biwords = list(map(lambda l: l + [0] * (self.max_char_len - len(l)), biwords))

        if is_train:
            labels = list(
                map(lambda l: l + [0] * (self.max_char_len - len(l)), target))

        lexicon_word_ids = []
        word_length_tensor = []
        for sent in input_batch_list:
            lexicon_word_ids_sent = []
            word_length_tensor_sent = []

            for word_lexicon in sent[3][0:self.max_char_len]:
                word_lexicon_pad = list(
                    map(
                        lambda l: l + [0] *
                        (self.max_lexicon_words_num - len(l)), word_lexicon))
                lexicon_word_ids_sent.append(
                    word_lexicon_pad[0][0:self.max_lexicon_words_num])  # id
                word_length_tensor_sent.append(
                    word_lexicon_pad[1]
                    [0:self.max_lexicon_words_num])  # length

            lexicon_word_ids.append(lexicon_word_ids_sent)
            word_length_tensor.append(word_length_tensor_sent)

        lexicon_word_ids = list(
            map(
                lambda l: l + [[0] * self.max_lexicon_words_num] *
                (self.max_char_len - len(l)), lexicon_word_ids))
        word_length_tensor = list(
            map(
                lambda l: l + [[0] * self.max_lexicon_words_num] *
                (self.max_char_len - len(l)), word_length_tensor))

        if is_train:
            return lengths, chars_ids, lexicon_word_ids, word_length_tensor, target, labels

        return lengths, chars_ids, lexicon_word_ids, word_length_tensor

    def evaluate_line(
        self,
        sess,
        sentence,
    ):
        '''
        Because the LatticeLSTM graph is built for a fixed batch_size, the single
        sentence is replicated into a full batch before being fed to the model.
        :param sess: TensorFlow session
        :param sentence: text to be tagged, given as a list of characters
            (self.data supplies the alphabets, lexicon and embeddings)
        :return: predicted tag sequence
        '''
        _, Ids = self.data.generate_sentence_instance_with_gaz(sentence)
        lengths, char_ids, lexicon_word_ids, word_length_tensor = self.batch_with_label(
            Ids, False)

        lengths = lengths * self.batch_size
        char_ids = char_ids * self.batch_size
        lexicon_word_ids = lexicon_word_ids * self.batch_size
        word_length_tensor = word_length_tensor * self.batch_size

        # run the model
        feed_dict = {
            self.model.placeholders["char_ids"]: char_ids,
            self.model.placeholders["lexicon_word_ids"]: lexicon_word_ids,
            self.model.placeholders["word_length_tensor"]: word_length_tensor,
        }

        logits = sess.run(self.model.logits, feed_dict=feed_dict)
        paths = self.decode(logits, lengths,
                            self.model.trans.eval(session=sess))
        tags = [self.data.label_alphabet.get_instance(idx) for idx in paths[0]]
        print("tags: ", tags)

        return tags

    def decode(self, logits, lengths, transition_matrix):
        """
        :param logits: [batch_size, num_steps, num_tags]float32, logits
        :param lengths: [batch_size]int32, real length of each sequence
        :param transition_matrix: transaction matrix for inference
        :return:
        """
        # inference final labels usa viterbi Algorithm
        paths = []
        small = -1000.0
        start = np.asarray([[small] * self.num_tags + [0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])

            padded = np.concatenate([score, pad], axis=1)
            padded = np.concatenate([start, padded], axis=0)
            path, _ = tf.contrib.crf.viterbi_decode(padded, transition_matrix)

            paths.append(path[1:])

        return paths

    def load_data_and_embedding(self):
        self.data.HP_use_char = False
        self.data.HP_batch_size = 1
        self.data.use_bigram = False
        self.data.gaz_dropout = 0.5
        self.data.norm_gaz_emb = False
        self.data.HP_fix_gaz_emb = False

        self.data_initialization()

        self.data.generate_instance_with_gaz(self.train_file, 'train')
        self.data.generate_instance_with_gaz(self.dev_file, 'dev')
        self.data.generate_instance_with_gaz(self.test_file, 'test')

        self.data.build_word_pretrain_emb(self.char_emb)
        self.data.build_biword_pretrain_emb(self.bichar_emb)
        self.data.build_gaz_pretrain_emb(self.gaz_file)

    def data_initialization(self):
        self.data.build_alphabet(self.train_file)
        self.data.build_alphabet(self.dev_file)
        self.data.build_alphabet(self.test_file)

        self.data.build_gaz_file(self.gaz_file)

        self.data.build_gaz_alphabet(self.train_file)
        self.data.build_gaz_alphabet(self.dev_file)
        self.data.build_gaz_alphabet(self.test_file)
        self.data.fix_alphabet()
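
A note on the padding above: the nested map calls in batch_with_label can be hard to follow. The sketch below reproduces the same idea on toy data with made-up sizes (max_char_len=6, max_lexicon_words_num=3, batch_size=4 are illustrative assumptions, not values from the repository), and it also shows the list replication that evaluate_line uses to fill a fixed-size batch from a single sentence.

# Standalone sketch of the padding in batch_with_label (toy sizes, illustrative only).
max_char_len = 6
max_lexicon_words_num = 3
batch_size = 4

# One sentence: char ids plus, per character, the ids of matched lexicon words.
char_ids = [5, 8, 2]                         # length 3 < max_char_len
lexicon_word_ids_sent = [[11, 12], [7], []]  # ragged lists of matches

# Pad the character sequence to max_char_len with 0.
char_ids_padded = char_ids + [0] * (max_char_len - len(char_ids))
assert len(char_ids_padded) == max_char_len

# Pad each character's lexicon-word list to max_lexicon_words_num,
# then pad the sentence itself to max_char_len with all-zero rows.
lex_padded = [w + [0] * (max_lexicon_words_num - len(w))
              for w in lexicon_word_ids_sent]
lex_padded += [[0] * max_lexicon_words_num] * (max_char_len - len(lex_padded))
assert len(lex_padded) == max_char_len
assert all(len(row) == max_lexicon_words_num for row in lex_padded)

# evaluate_line-style tiling: repeat the single padded sentence so the batch
# dimension matches the fixed batch_size the graph was built for.
batch_char_ids = [char_ids_padded] * batch_size
batch_lex_ids = [lex_padded] * batch_size
print(len(batch_char_ids), len(batch_char_ids[0]))                           # 4 6
print(len(batch_lex_ids), len(batch_lex_ids[0]), len(batch_lex_ids[0][0]))   # 4 6 3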
Example #7
    char_file = args.char_emb
    word_file = args.word_emb

    if status == 'train':
        if os.path.exists(saved_set_path):
            print('Loading saved data set...')
            with open(saved_set_path, 'rb') as f:
                data = pickle.load(f)
        else:
            data = Data()
            data_initialization(data, word_file, train_file, dev_file, test_file)
            data.generate_instance_with_words(train_file, 'train')
            data.generate_instance_with_words(dev_file, 'dev')
            data.generate_instance_with_words(test_file, 'test')
            data.build_char_pretrain_emb(char_file)
            data.build_word_pretrain_emb(word_file)
            if saved_set_path is not None:
                print('Dumping data...')
                with open(saved_set_path, 'wb') as f:
                    pickle.dump(data, f)
        data.show_data_summary()
        args.word_alphabet_size = data.word_alphabet.size()
        args.char_alphabet_size = data.char_alphabet.size()
        args.label_alphabet_size = data.label_alphabet.size()
        args.char_dim = data.char_emb_dim
        args.word_dim = data.word_emb_dim
        print_args(args)
        train(data, args, saved_model_path)

    elif status == 'test':
        assert test_file is not None
Example #8
# -*- coding: utf-8 -*-
# @Author: Jie
# @Date:   2017-06-15 14:11:08
# @Last Modified by:   Jie Yang,     Contact: [email protected]
# @Last Modified time: 2018-07-06 11:08:27
import time
import sys
import argparse
import random
import copy
import torch
import gc
import pickle
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from utils.metric import get_ner_fmeasure
from model.bilstmcrf import BiLSTM_CRF as SeqModel
from utils.data import Data
seed_num = 100
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    data.build_alphabet(train_file)
    data.build_alphabet(dev_file)
    data.build_alphabet(test_file)
Example #9
class Model_Lattice:
    def __init__(self, max_char_len, emb_size, max_lexicon_words_num,
                 num_units, num_tags, learning_rate):
        self.batch_size = 64
        self.max_char_len = max_char_len
        self.emb_size = emb_size
        self.max_lexicon_words_num = max_lexicon_words_num
        self.num_units = num_units
        self.num_tags = num_tags
        self.learning_rate = learning_rate
        self.optimizer = 'adam'
        self.clip = 5
        self.data = Data()
        self.data.build_word_pretrain_emb(
            'D:\\mygit\\NER_MODEL\\data\\data\\gigaword_chn.all.a2b.uni.ite50.vec'
        )
        self.data.build_gaz_pretrain_emb(
            'D:\\mygit\\NER_MODEL\\data\\data\\ctb.50d.vec')

        self.sess = tf.Session()
        self.placeholders = {}
        self.epoch = 0
        self.global_step = tf.Variable(0, trainable=False)

        self.char_ids = tf.placeholder(tf.int32, [None, self.max_char_len])
        self.lexicon_word_ids = tf.placeholder(
            tf.int32, [None, self.max_char_len, self.max_lexicon_words_num])
        self.word_length_tensor = tf.placeholder(
            tf.float32, [None, self.max_char_len, self.max_lexicon_words_num])
        self.labels = tf.placeholder(tf.int32, [None, self.max_char_len])

        self.lexicon_word_ids_reshape = tf.reshape(
            self.lexicon_word_ids,
            [-1, self.max_char_len * self.max_lexicon_words_num])
        self.seq_length = tf.convert_to_tensor(self.batch_size *
                                               [self.max_char_len],
                                               dtype=tf.int32)
        self.placeholders["char_ids"] = self.char_ids
        self.placeholders["lexicon_word_ids"] = self.lexicon_word_ids
        self.placeholders["word_length_tensor"] = self.word_length_tensor
        self.placeholders["labels"] = self.labels
        self.create_embedding()
        self.create_declare()
        self.create_model()
        self.create_loss()

    def create_embedding(self):
        self.char_embeddings = tf.Variable(self.data.pretrain_word_embedding,
                                           dtype=tf.float32,
                                           name="char_embeddings")
        self.word_embeddings = tf.Variable(self.data.pretrain_gaz_embedding,
                                           dtype=tf.float32,
                                           name="word_embeddings")

        self.char_embed = tf.nn.embedding_lookup(self.char_embeddings,
                                                 self.char_ids)
        self.lexicon_word_embed_reshape = tf.nn.embedding_lookup(
            self.word_embeddings, self.lexicon_word_ids_reshape)
        self.lexicon_word_embed = tf.reshape(
            self.lexicon_word_embed_reshape,
            [-1, self.max_char_len, self.max_lexicon_words_num, self.emb_size])

    def create_declare(self):
        # projection:
        self.W = tf.get_variable(
            "projection_w", [self.num_units, self.num_tags],
            initializer=tf.random_normal_initializer(stddev=0.1))
        self.b = tf.get_variable("projection_b", [self.num_tags])

    def create_model(self):
        lattice_lstm = LatticeLSTMCell(
            self.num_units,
            self.num_units,
            batch_size=self.batch_size,
            seq_len=self.max_char_len,
            max_lexicon_words_num=self.max_lexicon_words_num,
            word_length_tensor=self.word_length_tensor,
            dtype=tf.float32)

        initial_state = lattice_lstm.zero_state(batch_size=self.batch_size,
                                                dtype=tf.float32)
        outputs, state = tf.nn.dynamic_rnn(
            cell=lattice_lstm,
            inputs=[self.char_embed, self.lexicon_word_embed],
            initial_state=initial_state,
            dtype=tf.float32)
        x_reshape = tf.reshape(outputs, [-1, self.num_units])
        projection = tf.matmul(x_reshape, self.W) + self.b

        # -1 to timestep
        self.logits = tf.reshape(projection,
                                 [self.batch_size, -1, self.num_tags])

    def create_loss(self):
        self.loss = self.loss_layer(self.logits, self.seq_length, self.labels)
        with tf.variable_scope("optimizer"):
            self.opt = tf.train.AdamOptimizer(self.learning_rate)

            # apply grad clip to avoid gradient explosion
            grads_vars = self.opt.compute_gradients(self.loss)
            capped_grads_vars = [
                gv if gv[0] is None else
                (tf.clip_by_value(gv[0], -self.clip, self.clip), gv[1])
                for gv in grads_vars
            ]
            self.train_op = self.opt.apply_gradients(capped_grads_vars,
                                                     self.global_step)

    def loss_layer(self, project_logits, lengths, labels, name=None):
        """ calculate crf loss
        :param project_logits: [batch_size, num_steps, num_tags]
        :param lengths: [batch_size, num_steps]
        :param labels: [batch_size, num_steps]
        :return: scalar loss
        """
        with tf.variable_scope("crf_loss" if not name else name):
            small = -1000.0
            # pad logits for crf loss
            start_logits = tf.concat([
                small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                tf.zeros(shape=[self.batch_size, 1, 1])
            ],
                                     axis=-1)

            pad_logits = tf.cast(
                small * tf.ones([self.batch_size, self.max_char_len, 1]),
                tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)

            targets = tf.concat([
                tf.cast(self.num_tags * tf.ones([self.batch_size, 1]),
                        tf.int32), labels
            ],
                                axis=-1)

            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=tf.random_uniform_initializer(0.008,
                                                          0.15,
                                                          seed=1311,
                                                          dtype=tf.float32))

            log_likelihood, self.trans = tf.contrib.crf.crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)

            return tf.reduce_sum(-log_likelihood)
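
At inference time, Train_Model.decode in Example #6 pads the emission scores with an extra start tag and runs Viterbi decoding against the learned (num_tags + 1, num_tags + 1) transition matrix trained in loss_layer, delegating to tf.contrib.crf.viterbi_decode. For readers without TF1 contrib, here is a minimal pure-NumPy sketch of that decoding step; the sizes and random scores are illustrative assumptions, not values from the repository.

import numpy as np

def viterbi_decode(score, transitions):
    """Return the highest-scoring tag path for one sentence.
    score: [seq_len, num_tags_ext] emission scores (already padded, as in decode()).
    transitions: [num_tags_ext, num_tags_ext] transition scores.
    """
    seq_len, num_tags_ext = score.shape
    trellis = np.zeros_like(score)
    backpointers = np.zeros((seq_len, num_tags_ext), dtype=np.int32)
    trellis[0] = score[0]
    for t in range(1, seq_len):
        # v[i, j] = best score of reaching tag j at step t coming from tag i
        v = trellis[t - 1][:, None] + transitions
        trellis[t] = score[t] + v.max(axis=0)
        backpointers[t] = v.argmax(axis=0)
    # Trace back from the best final tag.
    best_path = [int(trellis[-1].argmax())]
    for t in range(seq_len - 1, 0, -1):
        best_path.append(int(backpointers[t, best_path[-1]]))
    best_path.reverse()
    return best_path, float(trellis[-1].max())

# Toy usage with the extended tag set (num_tags real tags + 1 start/pad tag);
# random scores just exercise the function.
num_tags, seq_len = 4, 5
rng = np.random.default_rng(0)
emissions = rng.normal(size=(seq_len + 1, num_tags + 1))   # +1 step for the start row
transitions = rng.normal(size=(num_tags + 1, num_tags + 1))
path, best_score = viterbi_decode(emissions, transitions)
print(path[1:])  # drop the artificial start step, as decode() does with path[1:]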