def __init__(self):
    self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = \
        config_name, ckpt_name, vocab_file, max_seq_len
    # keep graph and model global so they can be used from django, flask, tornado, etc.
    global graph
    graph = tf.get_default_graph()
    global model
    model = load_trained_model_from_checkpoint(self.config_path,
                                               self.checkpoint_path,
                                               seq_len=self.max_seq_len)
    model.summary(120)
    # if only one layer is chosen, take just that layer's weights
    if len(layer_indexes) == 1:
        encoder_layer = model.get_layer(index=len(model.layers) - 2).output
    # otherwise walk over the requested layers and concatenate their outputs
    # (shape: 768 * number of layers)
    else:
        # layer_indexes must be [1, 2, 3, ..., 12]
        all_layers = [model.get_layer(index=lay).output for lay in layer_indexes]
        encoder_layer = k_keras.concatenate(all_layers, -1)
    output_layer = NonMaskingLayer()(encoder_layer)
    model = Model(model.inputs, output_layer)
    # reader tokenizer
    self.token_dict = {}
    with codecs.open(self.dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.tokenizer = Tokenizer(self.token_dict)
def __init__(self):
    self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = \
        config_name, ckpt_name, vocab_file, max_seq_len
    # keep graph and model global so they can be used from django, flask, tornado, etc.
    global graph
    graph = tf.get_default_graph()
    global model
    model = load_trained_model_from_checkpoint(self.config_path,
                                               self.checkpoint_path,
                                               seq_len=self.max_seq_len)
    print(model.output)
    print(len(model.layers))
    # lay = model.layers
    # 104 Keras layers in total: the first few are the token/position/segment
    # embeddings, then each transformer block contributes eight layers
    # (MultiHeadAttention, Dropout, Add, LayerNormalization for both the
    # attention and feed-forward sub-layers).
    layer_dict = []
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    # no index given: return the model output itself
    if len(layer_indexes) == 0:
        encoder_layer = model.output
    # exactly one index: take that layer's weights;
    # if the index is invalid, fall back to the last layer's output
    elif len(layer_indexes) == 1:
        if layer_indexes[0] in range(1, len(layer_dict) + 1):
            encoder_layer = model.get_layer(
                index=layer_dict[layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    # otherwise walk over the requested layers, collect their outputs and
    # combine them (shape: 768 * number of layers)
    else:
        # layer_indexes must be [1, 2, 3, ..., 12]
        # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
        all_layers = [
            model.get_layer(index=layer_dict[lay - 1]).output
            if lay in range(1, len(layer_dict) + 1)
            else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last layer
            for lay in layer_indexes
        ]
        print(layer_indexes)
        print(all_layers)
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
        print(encoder_layer.shape)
    print("KerasBertEmbedding:")
    print(encoder_layer.shape)
    output_layer = NonMaskingLayer()(encoder_layer)
    model = Model(model.inputs, output_layer)
    # model.summary(120)
    # reader tokenizer
    self.token_dict = {}
    with codecs.open(self.dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.tokenizer = Tokenizer(self.token_dict)
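The hard-coded layer_dict arithmetic above is brittle; a minimal sketch of the same block-output selection keyed off keras-bert's layer names instead of numeric indices (it assumes the 'Encoder-{n}-FeedForward-Norm' naming keras-bert uses for the last sub-layer of each block — confirm via model.summary() before relying on it):

def get_block_outputs(model):
    # each transformer block in keras-bert ends with a layer named
    # 'Encoder-<n>-FeedForward-Norm'; its output is that block's hidden state
    return [layer.output for layer in model.layers
            if layer.name.endswith('-FeedForward-Norm')]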
def make_BERT(self):
    # Location of BERT model
    description = self._get_description('bert-' + self.lang)
    if description is not None:
        self.lang = description["lang"]
        config_file = description["path-config"]
        weight_file = description["path-weights"]
        vocab_file = description["path-vocab"]
        print('init BERT')

        # load the pretrained model
        with self.graph.as_default():
            # with self.session.as_default():
            # with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            self.bert_model = load_trained_model_from_checkpoint(
                config_file, weight_file)
            self.bert_model.summary(line_length=120)
            self.bert_model._make_predict_function()

        # init the tokenizer
        token_dict = {}
        with codecs.open(vocab_file, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        print('token_dict size:', len(token_dict))
        self.bert_tokenizer = Tokenizer(token_dict, cased=True)
def init_bert(self, config):
    bert_config = config['bert']['config_json']
    bert_checkpoint = config['bert']['bert_ckpt']
    bert_vocab = config['bert']['bert_vocab']
    bert_vocabs = load_vocabulary(bert_vocab)
    self.bert_token = Tokenizer(bert_vocabs)
    self.bert = self.load_bert(bert_config, bert_checkpoint)
def test_uncased(self):
    tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]', 'want', '##want',
        '##ed', 'wa', 'un', 'runn', '##ing', ',',
        '\u535A', '\u63A8',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u"UNwant\u00E9d, running \nah\u535A\u63A8zzz\u00AD"
    tokens = tokenizer.tokenize(text)
    expected = [
        '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
        'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z', '[SEP]',
    ]
    self.assertEqual(expected, tokens)
    indices, segments = tokenizer.encode(text)
    expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    expected = [
        'un', '##want', '##ed', ',', 'runn', '##ing',
        '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
    ]
    self.assertEqual(expected, decoded)
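For reference, a minimal self-contained sketch of the same Tokenizer round trip against a real checkpoint vocabulary (the 'vocab.txt' path is a placeholder):

import codecs
from keras_bert import Tokenizer

token_dict = {}
with codecs.open('vocab.txt', 'r', 'utf8') as reader:  # any BERT vocab file
    for line in reader:
        token_dict[line.strip()] = len(token_dict)

tokenizer = Tokenizer(token_dict)
ids, segments = tokenizer.encode(first='hello world', max_len=16)
# ids:      [CLS] hello world [SEP] followed by [PAD] ids up to max_len
# segments: all zeros, since there is no second sentence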
def make_BERT(self):
    # Location of BERT model
    description = self.get_description('bert-base-' + self.lang)
    if description is not None:
        self.lang = description["lang"]
        config_file = description["path-config"]
        weight_file = description["path-weights"]
        vocab_file = description["path-vocab"]
        print('init BERT')

        # load the pretrained model
        with self.graph.as_default():
            # there are different typical pooling strategies for getting BERT features:
            # - concatenation of the 4 last layers (the one from the original BERT paper,
            #   BERT_embed_size is then 3072)
            # - last layer (BERT_embed_size is 768)
            # - average of the 4 last layers (BERT_embed_size is 768)
            # - sum of the 4 last layers (BERT_embed_size is 768)
            self.bert_model = load_trained_model_from_checkpoint(
                config_file, weight_file, output_layer_num=4)
            self.bert_model.summary(line_length=120)
            self.bert_model._make_predict_function()

        # init the tokenizer
        token_dict = {}
        with codecs.open(vocab_file, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        print('token_dict size:', len(token_dict))
        self.bert_tokenizer = Tokenizer(token_dict, cased=True)
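With output_layer_num=4, keras-bert concatenates the last four transformer layers, so each BERT-base token vector is 4 * 768 = 3072-dimensional; the cheaper pooling variants listed in the comments above can be recovered from that output. A hedged sketch (token_ids and segment_ids are assumed to be numpy batches produced by the tokenizer):

features = self.bert_model.predict([token_ids, segment_ids])
assert features.shape[-1] == 4 * 768  # concatenation of the 4 last layers
# split the concatenation back into four 768-dim chunks, then average or sum
per_layer = features.reshape(features.shape[0], features.shape[1], 4, 768)
avg_last4 = per_layer.mean(axis=2)   # average of the 4 last layers (768-dim)
sum_last4 = per_layer.sum(axis=2)    # sum of the 4 last layers (768-dim)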
def _text_process(self, text):
    tokenizer = Tokenizer(self.vocab_dict)
    encoded = [tokenizer.encode(first=doc[0], second=doc[1], max_len=self.max_seq_len)
               for doc in text]
    input_ids = [i[0] for i in encoded]
    input_type = [i[1] for i in encoded]
    # mask out padding positions (token id 0 is [PAD])
    input_mask = [[0 if l == 0 else 1 for l in i] for i in input_ids]
    return (input_ids, input_mask, input_type)
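The mask construction above relies on [PAD] having id 0, as it does in the standard BERT vocab files; a vectorized equivalent, as a sketch:

import numpy as np

ids = np.array(input_ids)
input_mask = (ids != 0).astype('int32')  # 1 for real tokens, 0 for [PAD]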
def load_bert_data(raw_file, train=True):
    config = Config()
    dict_path = './corpus/vocab.txt'
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tags2id, id2tags = load_tags('tags.txt')
    x_ids = list()
    x_segments = list()
    x_label = list()
    # build the tokenizer once, outside the loop
    tokenizer = Tokenizer(token_dict)
    with codecs.open(raw_file, encoding='utf-8') as f:
        for line in f:
            x = json.loads(line)
            input_sent = x['title']
            x_sent_id, x_sent_segment = tokenizer.encode(
                input_sent, max_len=config.max_len_word)
            x_ids.append(x_sent_id)
            x_segments.append(x_sent_segment)
            if train:
                y = load_label(x, tags2id)
                x_label.append(y)
    x_label = np.asarray(x_label)
    return x_ids, x_segments, x_label, id2tags, None
def __init__(self, with_bert=True, fine_tune=True,
             language_backbone='chinese_wwm_base', batch_size=16):
    """
    :param with_bert:
    :param fine_tune:
    :param language_backbone:
    :param batch_size:
    """
    self.language_backbone = language_backbone
    self.with_bert = with_bert
    self.fine_tune = fine_tune
    self.paths = download_pretrained_bert(language_backbone)
    self.bs = batch_size
    token_dict = {}
    with codecs.open(self.paths.vocab, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    self.tokenizer = Tokenizer(token_dict)
    self.dataloader = Dataloader_v1(tokenizer=self.tokenizer,
                                    batch_size=self.bs,
                                    split_rate=0.1)
def __init__(self, label, bs=16, token_dict=None):
    self.batch_size = bs
    self.random = random
    self.ans = label
    self.maxlen_doc = 512
    self.tokenizer = Tokenizer(token_dict)
    self.iter_index = np.arange(len(self.ans))
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(data_dir, 'task2_testX.npy')) or \
            not os.path.exists(os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]
        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        seq_len = 512  # the maximum would be 638, but bert-BASE only supports up to 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.asarray(input_data)
        seg = np.asarray(input_seg)
        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X = np.load(os.path.join(data_dir, 'task2_testX.npy'))
        seg = np.load(os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
def work1(self, text1):
    out = []
    if type(text1) == str:
        text1 = [text1]
    for i in text1:
        resu = i.replace('|', '').replace(' ', '').replace('“', '“').replace('”', '”') \
            .replace('‘', '‘').replace('’', '’').replace('〔', '(').replace('〕', ')').replace('/', '') \
            .replace('·', '·').replace('•', '·').replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")
        resu = re.split(r'\s+', resu)
        dr = re.compile(r'<[^>]+>', re.S)
        dd = dr.sub('', '。'.join(resu))
        line = re.sub(self.restr, '', dd)
        # normalize half-width punctuation to full-width and collapse doubled periods
        eng = [",", "!", "?", ":", ";", "(", ")", "[", "]", "$", "。。"]
        chi = [",", "!", "?", ":", ";", "(", ")", "【", "】", "¥", '。']
        for half, full in zip(eng, chi):
            line = line.replace(half, full)
        out.append(line[:28])
    token_dict = {}
    dict_path = "../chinese_L-12_H-768_A-12/vocab.txt"
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    x1, x2 = [], []
    for text in out:
        indices, segments = tokenizer.encode(first=text, max_len=512)
        x1.append(indices)
        x2.append(segments)
    return x1, x2
def __init__(self):
    # load the word-segmentation dictionary
    with cs.open('../data/segment_dic.txt', 'r', 'utf-8') as fp:
        segment_dic = {}
        for line in fp:
            if line.strip():
                segment_dic[line.strip()] = 0
    self.segment_dic = segment_dic
    self.max_seq_len = 20
    begin = time.time()
    jieba.load_userdict('../data/segment_dic.txt')
    print('loading the user segmentation dictionary took %.2fs' % (time.time() - begin))
    # load the trained named-entity-recognition model
    custom_objects = get_custom_objects()
    self.ner_model = load_model('../data/model/ner_model.h5',
                                custom_objects=custom_objects)
    # load the BERT tokenizer
    dict_path = '../../news_classifer_task/wwm/vocab.txt'
    token_dict = {}
    with cs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    self.tokenizer = Tokenizer(token_dict)
    print('mention extractor loaded')
def test_empty(self):
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u''
    self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
    indices, segments = tokenizer.encode(text)
    self.assertEqual([2, 3], indices)
    self.assertEqual([0, 0], segments)
def init_all(self, config):
    if self.train:
        bert_config = config['bert']['config_json']
        bert_checkpoint = config['bert']['bert_ckpt']
        bert_vocab = config['bert']['bert_vocab']
        bert_vocabs = load_vocabulary(bert_vocab)
        self.bert_token = Tokenizer(bert_vocabs)
        self.bert = self.init_bert(bert_config, bert_checkpoint)
    self.get_sentence(config['train_list'] if self.train else config['eval_list'],
                      training=self.train)
def PreProcessInputData(self, text):
    tokenizer = Tokenizer(self.vocab)
    word_labels = []
    seq_types = []
    for sequence in text:
        code = tokenizer.encode(first=sequence, max_len=self.max_seq_length)
        word_labels.append(code[0])
        seq_types.append(code[1])
    return word_labels, seq_types
def article_preprocess(self):
    tokenizer = Tokenizer(self.token_dict)
    self.text_split = [ele for ele in self.text.split('。') if len(ele) > 0]
    self.sent_num = len(self.text_split)
    tok = [tokenizer.encode(sent)[0] for sent in self.text_split]
    tok_pad = pad_sequences(tok, maxlen=self.seqence_len)
    self.data_in = [
        tok_pad,
        np.zeros(shape=(self.sent_num, self.seqence_len))
    ]
def tokenizer_init(self):
    """Load the vocabulary and build the tokenizer."""
    # reader tokenizer
    token2idx = {}
    path_dict = os.path.join(self.path_dir, "vocab.txt")
    with codecs.open(path_dict, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token2idx[token] = len(token2idx)
    self.tokenizer = Tokenizer(token2idx)
def tokenizer_init(self):
    pretrained_path = 'uncased_L-12_H-768_A-12'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')
    token_dict = load_vocabulary(vocab_path)
    print("Total vocabulary loaded: {}".format(len(token_dict)))
    self.tokenizer = Tokenizer(token_dict)
def load_data(texts):
    tokenizer = Tokenizer(token_dict)
    indices = []
    segments = []
    for text in tqdm(texts):
        # encode returns (token ids, segment ids) for the sentence pair
        ids, segment_ids = tokenizer.encode(text[0], text[1], max_len=SEQ_LEN)
        indices.append(ids)
        segments.append(segment_ids)
    indices = np.array(indices)
    segments = np.array(segments)
    return [indices, segments]
class twitterProcessor():

    def __init__(self, vocab_path, data_dir, SEQ_LEN):
        self.vocab_path = vocab_path
        self.data_dir = data_dir
        self.seq_len = SEQ_LEN

    def get_train_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)
        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
        x_train, y_train = self.create_examples(reader, "train")
        return x_train, y_train

    def create_examples(self, lines, set_type):
        indices, labels = [], []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            split_line = line.strip().split('+++$+++')
            ids, segments = self.tokenizer.encode(split_line[1],
                                                  max_len=self.seq_len)
            sentiment = split_line[0]
            indices.append(ids)
            labels.append(sentiment)
        return [indices, np.zeros_like(indices)], np.array(labels)

    def get_test_examples(self, data_dir):
        token_dict = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)
        with open(data_dir, 'r', encoding='utf-8') as f:
            reader = f.readlines()
        x_test = self.create_test_examples(reader, "test")
        return x_test

    def create_test_examples(self, lines, set_type):
        indices = []
        for index, line in enumerate(lines):
            guid = "%s-%s" % (set_type, index)
            ids, segments = self.tokenizer.encode(line.strip(),
                                                  max_len=self.seq_len)
            indices.append(ids)
        return [indices, np.zeros_like(indices)]
def encode_input_x(self, sentences):
    '''Serialize the input X with BERT's Tokenizer: token ids plus segment ids.
    `sentences` is a list of strings.'''
    tokenizer = Tokenizer(self.vocab)
    sent_token_ids = []
    sent_segment_ids = []
    for sequence in sentences:
        token_ids, segment_ids = tokenizer.encode(
            first=sequence, max_len=self.seq_maxlen)  # the input is a single sentence
        sent_token_ids.append(token_ids)
        sent_segment_ids.append(segment_ids)
    return [sent_token_ids, sent_segment_ids]
def __init__(self, docs, vec):
    self.texts = np.array(docs)
    self.vec = vec
    paths = get_checkpoint_paths(".")
    # load the full pretrained model, then pool its token outputs
    bert = load_trained_model_from_checkpoint(
        config_file=paths.config,
        checkpoint_file=paths.checkpoint,
        seq_len=50)
    outputs = MaskedGlobalMaxPool1D(name='Pooling')(bert.output)
    self.model = Model(inputs=bert.inputs, outputs=outputs)
    self.vocab = load_vocabulary(paths.vocab)
    self.tokenizer = Tokenizer(self.vocab)
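A hypothetical usage sketch for the pooled model built above (keras-bert models take a [token_ids, segment_ids] pair, and max_len must match the seq_len=50 the graph was built with):

import numpy as np

ids, segs = self.tokenizer.encode(first='some document text', max_len=50)
# yields one pooled 768-dim vector per input document
embedding = self.model.predict([np.array([ids]), np.array([segs])])[0]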
def get_test_examples(self, data_dir):
    token_dict = {}
    with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    self.tokenizer = Tokenizer(token_dict)
    with open(data_dir, 'r', encoding='utf-8') as f:
        reader = f.readlines()
    x_test = self.create_test_examples(reader, "test")
    return x_test
def __init__(self, vocab_path, label_file):
    # load vocab
    self.bert_vocab_dict = {}
    self.load_bert_vocab(vocab_path, self.bert_vocab_dict)
    self.tokenizer = Tokenizer(self.bert_vocab_dict)
    # load label
    self.label_2_index = {}
    self.label_table_size = 0
    self.load_label_vocab(label_file, self.label_2_index)
    self.label_table_size = len(self.label_2_index)
def __init__(self):
    self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = \
        config_name, ckpt_name, vocab_file, max_seq_len
    # keep graph and model global so they can be used from django, flask, tornado, etc.
    global graph
    graph = tf.compat.v1.get_default_graph()
    global model
    model = load_trained_model_from_checkpoint(self.config_path,
                                               self.checkpoint_path,
                                               seq_len=self.max_seq_len)
    # Keras layer index of each transformer block's output: the embedding
    # layers come first, then every block adds eight Keras layers
    layer_dict = [7]
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    # no index given: return the model output itself
    if len(layer_indexes) == 0:
        encoder_layer = model.output
    # exactly one index: take that layer's weights;
    # if the index is invalid, fall back to the last layer's output
    elif len(layer_indexes) == 1:
        if layer_indexes[0] in range(1, len(layer_dict) + 1):
            encoder_layer = model.get_layer(
                index=layer_dict[layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    # otherwise walk over the requested layers, collect their outputs and
    # combine them (shape: 768 * number of layers)
    else:
        # layer_indexes must be [1, 2, 3, ..., 13]
        # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
        all_layers = [
            model.get_layer(index=layer_dict[lay - 1]).output
            if lay in range(1, len(layer_dict) + 1)
            else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last layer
            for lay in layer_indexes
        ]
        # note: the output of layer 1 has the wrong format; the next layer's input is a list
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
    output_layer = NonMaskingLayer()(encoder_layer)
    model = Model(model.inputs, output_layer)
    # model.summary(120)
    # reader tokenizer
    self.token_dict = {}
    with codecs.open(self.dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.tokenizer = Tokenizer(self.token_dict)
    print("Embedding generator initialized!")
def get_infer_input(input_file, out_file):
    id_type = pd.read_pickle('../data/id_type.pkl')
    type_index = pd.read_pickle('../data/type_index.pkl')
    entity_id = pd.read_pickle('../data/entity_id.pkl')
    id_text = pd.read_pickle('../data/id_text.pkl')
    token_dict = get_token_dict()
    tokenizer = Tokenizer(token_dict)
    out_file = open(out_file, 'w')
    file_index = 0
    with open(input_file) as f:
        for line in f:
            if file_index % 100 == 0:
                print(file_index)
            file_index += 1
            temDict = json.loads(line)
            text = temDict['text']
            mention_data = temDict['mention_data']
            for men in mention_data:
                mention = men['mention']
                offset = int(men['offset'])
                begin = offset + 1  # +1 accounts for the leading [CLS]
                end = begin + len(mention)
                link_id = get_link_entity_test(mention, entity_id)
                men['link_id'] = link_id
                link_data = {
                    'ids': [],
                    'seg': [],
                    'begin': [],
                    'end': [],
                    'en_type': []
                }
                for id in link_id:
                    kb_text = id_text[id]
                    kb_type = type_index[id_type[id][0]]
                    indice, segment = tokenizer.encode(first=text,
                                                       second=kb_text,
                                                       max_len=256)
                    link_data['ids'].append(indice)
                    link_data['seg'].append(segment)
                    link_data['begin'].append([begin])
                    link_data['end'].append([end])
                    link_data['en_type'].append([kb_type])
                men['link_data'] = link_data
            out_file.write(json.dumps(temDict, ensure_ascii=False))
            out_file.write('\n')
    out_file.close()
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)
    return train_indices, train_segments, train_text
def test_padding(self):
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = '\u535A\u63A8'

    # single
    indices, segments = tokenizer.encode(first=text, max_len=100)
    expected = [2, 1, 1, 3] + [0] * 96
    self.assertEqual(expected, indices)
    expected = [0] * 100
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    self.assertEqual(['[UNK]', '[UNK]'], decoded)
    indices, segments = tokenizer.encode(first=text, max_len=3)
    self.assertEqual([2, 1, 3], indices)
    self.assertEqual([0, 0, 0], segments)

    # paired
    indices, segments = tokenizer.encode(first=text, second=text, max_len=100)
    expected = [2, 1, 1, 3, 1, 1, 3] + [0] * 93
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 1, 1, 1] + [0] * 93
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    self.assertEqual((['[UNK]', '[UNK]'], ['[UNK]', '[UNK]']), decoded)
    indices, segments = tokenizer.encode(first=text, second=text, max_len=4)
    self.assertEqual([2, 1, 3, 3], indices)
    self.assertEqual([0, 0, 0, 1], segments)
def encode(text):
    # build the id mapping once; the vocabulary set is just its keys
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    vocabs = set(token_dict)
    tokenizer = Tokenizer(token_dict)
    tokens = ['[CLS]'] + [ch if ch in vocabs else '[UNK]' for ch in text] + ['[SEP]']
    return tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)
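An example of what this hand-rolled encode returns (the actual ids depend on the vocab.txt in use; note there is no padding or truncation here, unlike Tokenizer.encode with max_len):

ids, segments = encode(u'\u535A\u63A8')
# ids      -> [id('[CLS]'), id('\u535A'), id('\u63A8'), id('[SEP]')]
# segments -> [0, 0, 0, 0]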