Example No. 1
    def __init__(self, **kwargs):
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=kwargs['vocab_file'],
            do_lower_case=True)
        self.max_seq_len = 70
        self.ckpt_path = kwargs['model_dir']
        self.init_checkpoint = kwargs['init_checkpoint_file']
        self.bert_config = kwargs['bert_config_dir']

        self.graph = kwargs["graph"]
        with self.graph.as_default():
            self.model = Model(init_checkpoint_file=self.init_checkpoint, bert_config_dir=self.bert_config)
            self.saver = tf.train.Saver()
        config = tf.ConfigProto(log_device_placement=False)
        self.session = tf.Session(graph=self.graph, config=config)
        self.load()
Example No. 2
    def __init__(self):
        self.lstm_dim = 128
        self.batch_size = 1
        self.max_seq_len = 70
        self.clip = 5.0
        self.dropout_keep = 0.5
        self.optimizer = 'adam'
        self.lr = 0.001
        self.tag_schema = 'iob'
        self.ckpt_path = '..\\models'
        self.steps_check = 10
        self.zeros = False
        self.lower = True
        self.max_epoch = 2
        self.num_tags = len(convert_samples.tag_to_id)
        self.model = Model(init_checkpoint_file='D:\\models\\albert_base_zh\\albert_model.ckpt',
                           bert_config_dir='D:\\models\\albert_base_zh\\albert_config_base.json')
        self.saver = tf.train.Saver()

        self.tokenizer = tokenization.FullTokenizer(vocab_file='D:\\models\\albert_base_zh\\vocab.txt',
                                                    do_lower_case=True)
Example No. 3
    def __init__(self, **kwargs):
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=kwargs['vocab_file'],
            # vocab_file='D:\\mygit\\NER_MODEL\\albert_tiny_489k\\vocab.txt',
            do_lower_case=True)
        self.max_seq_len = 70
        self.ckpt_path = kwargs['model_dir']
        # self.ckpt_path = 'D:\\mygit\\NER_MODEL\\models'
        self.init_checkpoint = kwargs['init_checkpoint_file']
        # self.init_checkpoint = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_model.ckpt'
        self.bert_config = kwargs['bert_config_dir']
        # self.bert_config = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_config_tiny.json'

        self.graph = kwargs["graph"]
        with self.graph.as_default():
            self.model = Model(init_checkpoint_file=self.init_checkpoint,
                               bert_config_dir=self.bert_config)
            self.saver = tf.train.Saver()
        # gpu_options is assumed to be defined elsewhere in the module,
        # e.g. gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        self.session = tf.Session(graph=self.graph, config=config)
        self.load()
Example No. 4
class modelTrain:
    def __init__(self):
        self.lstm_dim = 128
        self.batch_size = 8
        self.max_seq_len = 70
        self.clip = 5.0
        self.dropout_keep = 0.5
        self.optimizer = 'adam'
        self.lr = 0.001
        self.tag_schema = 'iob'
        self.ckpt_path = '..\\models'
        self.steps_check = 10
        self.zeros = False
        self.lower = True
        self.max_epoch = 2
        self.num_tags = len(convert_samples.tag_to_id)
        self.model = Model(
            init_checkpoint_file='D:\\迅雷下载\\albert_tiny_489k\\albert_model.ckpt',
            bert_config_dir='D:\\迅雷下载\\albert_tiny_489k\\albert_config_tiny.json'
        )
        self.saver = tf.train.Saver()

        self.tokenizer = tokenization.FullTokenizer(
            vocab_file='D:\\迅雷下载\\albert_tiny_489k\\vocab.txt',
            do_lower_case=True)

    def train(self):
        path = '..\\data\\train.json'
        train_sentences = self.load_sentences(path)
        train_data = self.prepare_dataset(train_sentences, self.max_seq_len,
                                          self.lower)
        train_manager = BatchManager(train_data, self.batch_size)
        init = tf.global_variables_initializer()
        steps_per_epoch = train_manager.len_data
        with tf.Session() as sess:
            loss = []
            sess.run(init)
            for i in range(self.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = self.model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % self.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        print("iteration:{} step:{}/{}, "
                              "NER loss:{:>9.6f}".format(
                                  iteration, step % steps_per_epoch,
                                  steps_per_epoch, np.mean(loss)))
                        loss = []
                    self.save_model(sess,
                                    self.model,
                                    self.ckpt_path,
                                    global_steps=step)

    def load_sentences(self, path):
        """
        Load sentences. A line must contain at least a word and its tag.
        Sentences are separated by empty lines.
        """
        sentences = []
        num = 0
        for j, line in enumerate(codecs.open(path, 'r', 'utf8')):
            sentence = []
            num += 1
            data = json.loads(line)
            list_label = str(data['label']).split(' ')
            for i, value in enumerate(list(data['text'])):
                temp = []
                temp.append(value)
                temp.append(list_label[i])
                sentence.append(temp)
            sentences.append(sentence)
        return sentences
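
For reference, here is a minimal sketch of the JSON-lines format that load_sentences assumes, inferred from the parsing above; the text and tags are purely illustrative, and the real tag set is whatever convert_samples.tag_to_id defines:

import json

# one JSON object per line of train.json; 'label' carries exactly one tag per character of 'text'
sample_line = '{"text": "张三住在北京", "label": "B-PER I-PER O O B-LOC I-LOC"}'
data = json.loads(sample_line)
assert len(data['label'].split(' ')) == len(data['text'])  # tags and characters must align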

    def save_model(self, sess, model, path, global_steps):
        checkpoint_path = os.path.join(path, "ner.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=global_steps)

    def prepare_dataset(self,
                        sentences,
                        max_seq_length,
                        lower=False,
                        train=True):
        """
        Prepare the dataset. Return a list of lists of dictionaries containing:
            - word indexes
            - word char indexes
            - tag indexes
        """
        data = []
        for s in sentences:
            if lower:
                string = [w[0].strip().lower() for w in s]
            else:
                string = [w[0].strip() for w in s]
            char_line = ' '.join(string)
            text = tokenization.convert_to_unicode(char_line)

            if train:
                tags = [w[-1] for w in s]
            else:
                tags = ['O' for _ in string]

            labels = ' '.join(tags)
            labels = tokenization.convert_to_unicode(labels)

            ids, mask, segment_ids, label_ids = self.convert_single_example(
                char_line=text,
                max_seq_length=max_seq_length,
                tokenizer=self.tokenizer,
                label_line=labels)
            data.append([string, segment_ids, ids, mask, label_ids])

        return data
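
Each prepared sample is a plain list rather than a dictionary; a minimal sketch of loading and unpacking one sample, assuming the ALBERT files hard-coded in __init__ exist so that modelTrain() can be constructed:

trainer = modelTrain()  # builds the tokenizer and Model from the hard-coded paths above
train_sentences = trainer.load_sentences('..\\data\\train.json')
train_data = trainer.prepare_dataset(train_sentences, trainer.max_seq_len, trainer.lower)
chars, segment_ids, token_ids, mask, label_ids = train_data[0]  # unpack the first prepared sample
# every id sequence is padded to the fixed maximum length
assert len(token_ids) == len(mask) == len(segment_ids) == len(label_ids) == trainer.max_seq_len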

    def convert_single_example(self, char_line, max_seq_length, tokenizer,
                               label_line):
        """
        将一个样本进行分析,然后将字转化为id, 标签转化为lb
        """
        text_list = char_line.split(' ')
        label_list = label_line.split(' ')

        tokens = []
        labels = []
        for i, word in enumerate(text_list):
            token = tokenizer.tokenize(word)
            tokens.extend(token)
            label_1 = label_list[i]
            for m in range(len(token)):
                if m == 0:
                    labels.append(label_1)
                else:
                    labels.append("X")
        # truncate over-long sequences, leaving room for [CLS] and [SEP]
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")
        segment_ids.append(0)
        # append("O") or append("[CLS]") not sure!
        label_ids.append(convert_samples.tag_to_id["[CLS]"])
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
            label_ids.append(convert_samples.tag_to_id[labels[i]])
        ntokens.append("[SEP]")
        segment_ids.append(0)
        # append("O") or append("[SEP]") not sure!
        label_ids.append(convert_samples.tag_to_id["[SEP]"])
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)

        # padding
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            # labels on padding positions are never used, so 0 is fine here
            label_ids.append(0)
            ntokens.append("**NULL**")

        return input_ids, input_mask, segment_ids, label_ids
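
A minimal usage sketch of convert_single_example on one short sentence; the tags are illustrative and must exist in convert_samples.tag_to_id, and the call assumes the ALBERT files referenced in __init__ are available:

trainer = modelTrain()
ids, mask, segment_ids, label_ids = trainer.convert_single_example(
    char_line='北 京 欢 迎 你',            # characters separated by spaces, as prepare_dataset builds them
    max_seq_length=trainer.max_seq_len,
    tokenizer=trainer.tokenizer,
    label_line='B-LOC I-LOC O O O')        # illustrative tags, one per character
# [CLS]/[SEP] are added, then everything is right-padded with zeros to max_seq_len (70)
assert len(ids) == len(mask) == len(segment_ids) == len(label_ids) == trainer.max_seq_len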
Example No. 5
class bert_predict:
    def __init__(self, **kwargs):
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=kwargs['vocab_file'],
            # vocab_file='D:\\mygit\\NER_MODEL\\albert_tiny_489k\\vocab.txt',
            do_lower_case=True)
        self.max_seq_len = 70
        self.ckpt_path = kwargs['model_dir']
        # self.ckpt_path = 'D:\\mygit\\NER_MODEL\\models'
        self.init_checkpoint = kwargs['init_checkpoint_file']
        # self.init_checkpoint = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_model.ckpt'
        self.bert_config = kwargs['bert_config_dir']
        # self.bert_config = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_config_tiny.json'

        self.graph = kwargs["graph"]
        with self.graph.as_default():
            self.model = Model(init_checkpoint_file=self.init_checkpoint,
                               bert_config_dir=self.bert_config)
            self.saver = tf.train.Saver()
        # gpu_options is assumed to be defined elsewhere in the module,
        # e.g. gpu_options = tf.GPUOptions(allow_growth=True)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        self.session = tf.Session(graph=self.graph, config=config)
        self.load()

    def load(self):
        ckpt = tf.train.get_checkpoint_state(self.ckpt_path)
        if ckpt is not None and ckpt.model_checkpoint_path:
            self.saver.restore(self.session, ckpt.model_checkpoint_path)
        else:
            raise Exception("load model failure...")

    def predict_batch(self, input_text):
        train_sentences = self.load_samples(input_text)
        train_data = convert_samples.prepare_dataset(train_sentences,
                                                     self.max_seq_len, True)
        train_manager = BatchManager(train_data, len(input_text))
        temp = []

        results = self.model.evaluate(self.session, train_manager,
                                      convert_samples.id_to_tag)
        for i, v in enumerate(results):
            a = bio_to_json(input_text[i], v[1:-1])
            temp.append(a)
        s = []
        for j in temp:
            # use a fresh dict per input text so entities found in one text
            # do not leak into the results of the next
            data_items = {}
            if len(j['entities']) > 0:
                for k in range(len(j['entities'])):
                    value = j['entities'][k]['word']
                    data_items[j['entities'][k]['type']] = [value]
            s.append(data_items)
        return s

    def load_samples(self, datas):
        """
        Load sentences. A line must contain at least a word and its tag.
        Sentences are separated by empty lines.
        """
        sentences = []
        num = 0
        for j, data in enumerate(datas):
            sentence = []
            num += 1
            list_label = ['O'] * len(data)
            for i, value in enumerate(data):
                temp = []
                temp.append(value)
                temp.append(list_label[i])
                sentence.append(temp)
            sentences.append(sentence)
        return sentences
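
Finally, a minimal sketch of wiring up bert_predict for inference; every path below is a hypothetical placeholder and must point at a real ALBERT vocab/config/checkpoint and at the directory holding the fine-tuned ner.ckpt files produced by modelTrain:

import tensorflow as tf

graph = tf.Graph()
predictor = bert_predict(
    vocab_file='path/to/albert/vocab.txt',                    # hypothetical path
    model_dir='path/to/finetuned/models',                     # directory containing ner.ckpt-* checkpoints
    init_checkpoint_file='path/to/albert/albert_model.ckpt',  # hypothetical path
    bert_config_dir='path/to/albert/albert_config.json',      # hypothetical path
    graph=graph)
entities = predictor.predict_batch(['张三住在北京'])
# illustrative output; the actual keys depend on the trained tag set
print(entities)  # e.g. [{'PER': ['张三'], 'LOC': ['北京']}]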