def test_uncased(self):
    tokens = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]',
        'want', '##want', '##ed', 'wa', 'un', 'runn', '##ing', ',',
        '\u535A', '\u63A8',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u"UNwant\u00E9d, running \nah\u535A\u63A8zzz\u00AD"
    tokens = tokenizer.tokenize(text)
    expected = [
        '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
        'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z', '[SEP]',
    ]
    self.assertEqual(expected, tokens)
    indices, segments = tokenizer.encode(text)
    expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(expected, segments)
    decoded = tokenizer.decode(indices)
    expected = [
        'un', '##want', '##ed', ',', 'runn', '##ing',
        '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
    ]
    self.assertEqual(expected, decoded)
def test_empty(self):
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict)
    text = u''
    self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
    indices, segments = tokenizer.encode(text)
    self.assertEqual([2, 3], indices)
    self.assertEqual([0, 0], segments)
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)
    return train_indices, train_segments, train_text
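# A minimal usage sketch for bert_sen_token (hedged: `vocab_path`, `texts` and the
# downstream `model` are illustrative names that are not defined in this snippet).
import numpy as np
from keras_bert import Tokenizer, load_vocabulary

token_dict = load_vocabulary(vocab_path)           # path to the BERT vocab.txt (assumed)
indices, segments, token_lists = bert_sen_token(token_dict, texts, maxlen=128)
x = [np.array(indices), np.array(segments)]        # each of shape (n_samples, maxlen)
# features = model.predict(x)                      # e.g. a model from load_trained_model_from_checkpoint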
def test_cased(self):
    tokens = [
        '[UNK]', u'[CLS]', '[SEP]',
        'want', '##want', u'##\u00E9d', 'wa', 'UN', 'runn', '##ing', ',',
    ]
    token_dict = {token: i for i, token in enumerate(tokens)}
    tokenizer = Tokenizer(token_dict, cased=True)
    text = u"UNwant\u00E9d, running"
    tokens = tokenizer.tokenize(text)
    expected = ['[CLS]', 'UN', '##want', u'##\u00E9d', ',', 'runn', '##ing', '[SEP]']
    self.assertEqual(expected, tokens)
    indices, segments = tokenizer.encode(text)
    expected = [1, 7, 4, 5, 10, 8, 9, 2]
    self.assertEqual(expected, indices)
    expected = [0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(expected, segments)
def tokenize(char_seqs, vocab, cased):
    from keras_bert import Tokenizer, TOKEN_CLS, TOKEN_SEP
    tokenizer = Tokenizer(vocab, cased=cased)
    token_seqs = []
    orig2token_maps = []
    for char_seq in char_seqs:
        orig2token_map = [0]
        token_seq = [TOKEN_CLS]
        for c in char_seq:
            orig2token_map.append(len(token_seq))
            tokens = tokenizer.tokenize(c)
            tokens = tokens[1:-1]
            token_seq.extend(tokens)
        orig2token_map.append(len(token_seq))
        token_seq.append(TOKEN_SEP)
        orig2token_maps.append(orig2token_map)
        token_seqs.append(token_seq)
    return token_seqs, orig2token_maps
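# A small alignment sketch (assumption: a toy vocabulary; real runs use the full BERT
# vocab). orig2token_maps[k][0] points at [CLS]; entry i + 1 is where the i-th original
# unit starts in token_seqs[k], and the last entry is the position of [SEP], so unit i
# owns token_seqs[k][m[i + 1]:m[i + 2]] with m = orig2token_maps[k].
toy_vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, 'un': 4, '##aff': 5, '##able': 6}
token_seqs, orig2token_maps = tokenize([['unaffable', 'un']], toy_vocab, cased=False)
# token_seqs[0]      -> ['[CLS]', 'un', '##aff', '##able', 'un', '[SEP]']
# orig2token_maps[0] -> [0, 1, 4, 5]
m = orig2token_maps[0]
first_unit_pieces = token_seqs[0][m[1]:m[2]]  # word pieces of the first original unit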
class SearchBERT():
    def __init__(self, docs, vec):
        self.texts = np.array(docs)
        self.vec = vec
        paths = get_checkpoint_paths(".")
        inputs = load_trained_model_from_checkpoint(
            config_file=paths.config, checkpoint_file=paths.checkpoint, seq_len=50)
        outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
        self.model = Model(inputs=inputs.inputs, outputs=outputs)
        self.vocab = load_vocabulary(paths.vocab)
        self.tokenizer = Tokenizer(self.vocab)

    def search(self, query, n=5):
        tokens = self.tokenizer.tokenize(" ".join(lemmatize(query)))[:50]
        indices = [self.vocab[token] for token in tokens] + \
                  [0 for i in range(50 - len(tokens))]
        segments = [0 for i in range(50)]
        query_vec = self.model.predict(
            [np.array([indices]), np.array([segments])])[0]
        result = np.matmul(self.vec, query_vec)
        idxs = np.argsort(result)[::-1].tolist()[:n]
        return list(zip(self.texts[idxs], result[idxs]))
class BertNerBiLstmModel(): def __init__(self): # logger.info("BertBiLstmModel init start!") print("BertNerBiLstmModel init start!") self.dict_path, self.max_seq_len, self.keep_prob, self.is_training = vocab_file, args.max_seq_len, args.keep_prob, args.is_training # reader tokenizer self.token_dict = {} with codecs.open(self.dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() self.token_dict[token] = len(self.token_dict) self.tokenizer = Tokenizer(self.token_dict) # 你可以选择一个model build,有bi-lstm single、bi-lstm 3-layers、bi-lstm_attention self.build_model_bilstm_layers() self.compile_model() # self.build_model_bilstm_single() # logger.info("BertBiLstmModel init end!") print("BertNerBiLstmModel init end!") def process_single(self, texts): # 文本预处理,传入一个list,返回的是ids\mask\type-ids input_ids = [] input_masks = [] input_type_ids = [] for text in texts: if type(text) is list: text = "".join(text) logger.info(text) tokens_text = self.tokenizer.tokenize(text) logger.info('Tokens:', tokens_text) input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) # numpy处理list input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) logger.info("process ok!") return [input_ids, input_masks, input_type_ids] def process_pair(self, textss): # 文本预处理,传入一个list,返回的是ids\mask\type-ids input_ids = [] input_masks = [] input_type_ids = [] for texts in textss: tokens_text = self.tokenizer.tokenize(texts[0]) logger.info('Tokens1:', tokens_text) tokens_text2 = self.tokenizer.tokenize(texts[1]) logger.info('Tokens2:', tokens_text2) input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) # numpy处理list input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) logger.info("process ok!") return [input_ids, input_masks, input_type_ids] def build_model_bilstm_layers(self): if args.use_lstm: if args.use_cudnn_cell: layer_cell = CuDNNLSTM else: layer_cell = LSTM else: if args.use_cudnn_cell: layer_cell = CuDNNGRU else: layer_cell = GRU # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # Bi-LSTM x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, ))(bert_output) # 最后 x = TimeDistributed(Dropout(self.keep_prob))(x) dense_layer = Dense(args.max_seq_len, activation=args.activation)(x) crf = CRF(args.label, sparse_target=False, learn_mode="join", test_mode='viterbi') output_layers = crf(dense_layer) self.model = Model(bert_inputs, output_layers) self.model.summary(132) def compile_model(self): self.model.compile( optimizer=Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=args.epsilon, decay=0.0), loss=crf_loss if args.use_crf else sparse_categorical_crossentropy, metrics=[crf_accuracy] if args.metrics is 'crf_loss' else args.metrics) # loss=CRF.loss_function if args.use_crf else categorical_crossentropy, # metrics=[CRF.accuracy] if args.metrics is 'crf_loss' else args.metrics) # loss=crf.loss if args.use_crf else categorical_crossentropy, # metrics=[crf.accuracy] if args.metrics is 'crf_loss' else args.metrics) def callback(self): cb = 
[ModelCheckpoint(monitor='val_loss', mode='min', filepath=args.path_save_model, verbose=1, save_best_only=True, save_weights_only=False), ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.2, patience=2, verbose=0, epsilon=1e-6, cooldown=4, min_lr=1e-8), EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=2) ] return cb def fit(self, x_train, y_train, x_dev, y_dev): self.model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs, validation_data=(x_dev, y_dev), shuffle=True, callbacks=self.callback()) self.model.save(args.path_save_model) def load_model(self): print("BertNerBiLstmModel load_model start!") # logger.info("BertBiLstmModel load_model start!") self.model.load_weights(args.path_save_model) # logger.info("BertBiLstmModel load_model end+!") print("BertNerBiLstmModel load_model end+!") def predict(self, sen): input_ids, input_masks, input_type_ids = self.process_single([sen]) probs = self.model.predict([input_ids, input_masks], batch_size=1) probs_first = probs[0] preds = [] for prob_one in probs_first: prob_max = np.argmax(prob_one) preds.append(prob_max) return preds def predict_list(self, questions): label_preds = [] for questions_pair in questions: input_ids, input_masks, input_type_ids = self.process_single([questions_pair]) label_pred = self.model.predict([input_ids, input_masks], batch_size=1) label_preds.append(label_pred) return label_preds
# Build the vocabulary
token_dict = load_vocabulary(vocab_path)
print(token_dict)
print(len(token_dict))

# Tokenization
tokenizer = Tokenizer(token_dict)
print(tokenizer)

# Load the pre-trained model
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
print(model)

# ------------------------------- Step 2: feature extraction ---------------------------------
text = '语言模型'
tokens = tokenizer.tokenize(text)
print(tokens)  # ['[CLS]', '语', '言', '模', '型', '[SEP]']

indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
print(segments[:10])

# Extract features
predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
print("")

# ---------------------------- Step 3: multi-sentence feature extraction ------------------------------
text1 = '语言模型'
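# Step 3 pairs two sentences in a single input; a hedged sketch of the pair encoding
# (the second sentence below is only an illustrative placeholder):
text2 = '你好'
indices, segments = tokenizer.encode(first=text1, second=text2, max_len=512)
print(indices[:10])   # ids of both sentences, joined by [SEP]
print(segments[:10])  # 0 for first-sentence positions, 1 for second-sentence positions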
class PassageTagger(object): def __init__(self, params): self.params = params self.input_size = 768 self.tagger = None self.maxclauselen = None self.maxseqlen = None pretrained_path = self.params["repfile"] config_path = os.path.join(pretrained_path, 'bert_config.json') checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt') vocab_path = os.path.join(pretrained_path, 'vocab.txt') self.bert = load_trained_model_from_checkpoint(config_path, checkpoint_path) self.bert._make_predict_function( ) # Crucial step, otherwise TF will give error. token_dict = {} with codecs.open(vocab_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() token_dict[token] = len(token_dict) self.tokenizer = Tokenizer(token_dict) def make_data(self, trainfilename, maxseqlen=None, maxclauselen=None, label_ind=None, train=False): use_attention = self.params["use_attention"] batch_size = self.params["batch_size"] str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train) print("Filtering data") str_seqs = clean_words(str_seqs) label_seqs = to_BIO(label_seqs) if not label_ind: self.label_ind = {"none": 0} else: self.label_ind = label_ind seq_lengths = [len(seq) for seq in str_seqs] if self.maxseqlen is None: if maxseqlen: self.maxseqlen = maxseqlen elif self.params["maxseqlen"] is not None: self.maxseqlen = self.params["maxseqlen"] else: self.maxseqlen = max(seq_lengths) if self.maxclauselen is None: if maxclauselen: self.maxclauselen = maxclauselen elif self.params["maxclauselen"] is not None: self.maxclauselen = self.params["maxclauselen"] elif use_attention: sentence_lens = [] for str_seq in str_seqs: for seq in str_seq: tokens = self.tokenizer.tokenize(seq.lower()) sentence_lens.append(len(tokens)) self.maxclauselen = np.round( np.mean(sentence_lens) + 3 * np.std(sentence_lens)).astype(int) if len(self.label_ind) <= 1: for str_seq, label_seq in zip(str_seqs, label_seqs): for label in label_seq: if label not in self.label_ind: # Add new labels with values 0,1,2,.... self.label_ind[label] = len(self.label_ind) self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()} discourse_generator = BertDiscourseGenerator(self.bert, self.tokenizer, str_seqs, label_seqs, self.label_ind, batch_size, use_attention, self.maxseqlen, self.maxclauselen, train) return seq_lengths, discourse_generator # One-hot representation of labels def predict(self, discourse_generator, test_seq_lengths=None, tagger=None): if not tagger: tagger = self.tagger if test_seq_lengths is None: assert (False) else: x_lens = test_seq_lengths pred_probs = tagger.predict_generator(discourse_generator) pred_inds = np.argmax(pred_probs, axis=2) pred_label_seqs = [] for pred_ind, x_len in zip(pred_inds, x_lens): pred_label_seq = [self.rev_label_ind[pred] for pred in pred_ind][-x_len:] # If the following number is positive, it means we ignored some clauses in the test passage to make it the same length as the ones we trained on. num_ignored_clauses = max(0, x_len - len(pred_label_seq)) # Make labels for those if needed. if num_ignored_clauses > 0: warnings.warn( "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none." 
% num_ignored_clauses) ignored_clause_labels = ["none"] * num_ignored_clauses pred_label_seq = ignored_clause_labels + pred_label_seq pred_label_seqs.append(pred_label_seq) return pred_probs, pred_label_seqs, x_lens def fit_model(self, train_generator, validation_generator, reg=0): use_attention = self.params["use_attention"] att_context = self.params["att_context"] lstm = self.params["lstm"] bidirectional = self.params["bidirectional"] crf = self.params["crf"] embedding_dropout = self.params["embedding_dropout"] high_dense_dropout = self.params["high_dense_dropout"] attention_dropout = self.params["attention_dropout"] lstm_dropout = self.params["lstm_dropout"] word_proj_dim = self.params["word_proj_dim"] lr = self.params["lr"] epoch = self.params["epoch"] batch_size = self.params["batch_size"] hard_k = self.params["hard_k"] att_proj_dim = self.params["att_proj_dim"] rec_hid_dim = self.params["rec_hid_dim"] lstm_dim = self.params["lstm_dim"] validation_split = self.params["validation_split"] early_stopping = EarlyStopping(patience=2) num_classes = len(self.label_ind) # Load discourse tagger model_config_file = open( "scidt_scibert/model_att=True_cont=LSTM_clause_lstm=False_bi=True_crf=True_config.json", "r") model_weights_file_name = "scidt_scibert/model_att=True_cont=LSTM_clause_lstm=False_bi=True_crf=True_weights" cached_tagger = model_from_json(model_config_file.read(), custom_objects={ "TensorAttention": TensorAttention, "HigherOrderTimeDistributedDense": HigherOrderTimeDistributedDense, "CRF": CRF }) cached_tagger.load_weights(model_weights_file_name) for l in cached_tagger.layers: l.trainable = True inputs = cached_tagger.input x = cached_tagger.layers[-2].output if crf: Crf = CRF(num_classes, learn_mode="join") discourse_prediction = Crf(x) tagger = Model(inputs=inputs, outputs=[discourse_prediction]) else: discourse_prediction = TimeDistributed(Dense(num_classes, activation='softmax'), name='discourse')(x) tagger = Model(inputs=inputs, outputs=[discourse_prediction]) def step_decay(current_epoch): initial_lrate = lr drop = 0.5 epochs_drop = epoch / 2 lrate = initial_lrate * np.power( drop, np.floor((1 + current_epoch) / epochs_drop)) return lrate lr_fractions = [1] decay = 0 adam = Adam(lr=lr, decay=decay) if crf: #rmsprop = RMSprop(lr=lr,decay = decay) tagger.compile(optimizer=adam, loss=Crf.loss_function, metrics=[Crf.accuracy]) else: tagger.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) tagger.summary() tagger.fit_generator(train_generator, validation_data=validation_generator, epochs=epoch, callbacks=[early_stopping], verbose=2) #for l in cached_tagger.layers: # l.trainable = True #if crf: # #rmsprop = RMSprop(lr=lr,decay = decay) # tagger.compile(optimizer=adam, loss=Crf.loss_function, metrics=[Crf.accuracy]) #else: # tagger.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) #tagger.summary() #tagger.fit_generator(train_generator, validation_data=validation_generator, epochs=epoch, callbacks=[early_stopping], verbose=2) return tagger def train(self, train_generator, validation_generator): save = self.params["save"] f_mean, f_std, original_f_mean, original_f_std = 0, 0, 0, 0 self.tagger = self.fit_model(train_generator, validation_generator) if save: model_ext = "att=%s_cont=%s_lstm=%s_bi=%s_crf=%s" % ( str(self.params["use_attention"]), self.params["att_context"], str(self.params["lstm"]), str( self.params["bidirectional"]), str(self.params["crf"])) model_config_file = open("model_%s_config.json" % model_ext, "w") 
model_weights_file_name = "model_%s_weights" % model_ext model_label_ind = "model_%s_label_ind.json" % model_ext print(self.tagger.to_json(), file=model_config_file) self.tagger.save_weights(model_weights_file_name, overwrite=True) json.dump(self.label_ind, open(model_label_ind, "w")) return f_mean, f_std, original_f_mean, original_f_std
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_model

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x = bert_model([x1_in, x2_in])

# Tokenization
from keras_bert import Tokenizer
tokenizer = Tokenizer(token_dict)
# text = '语言模型 chinese is great'
# text = '商品名称及规格型号'
# text = '境外收货人\nDERCOCHILEREPUESTOSS.A.'
# text = '合同协议号\n2019CICSA473-A'
text = '运抵国(地区)\n智利'
tokens = tokenizer.tokenize(text)  # ['[CLS]', '语', '言', '模', '型', '[SEP]']
print('tokens', tokens)
indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
class Embeddings(object): def __init__(self, name, path='./embedding-registry.json', lang='en', extension='vec', use_ELMo=False, use_BERT=False, use_cache=True, load=True): self.name = name self.embed_size = 0 self.static_embed_size = 0 self.vocab_size = 0 self.model = {} self.registry = self._load_embedding_registry(path) self.lang = lang self.extension = extension self.embedding_lmdb_path = None if self.registry is not None: self.embedding_lmdb_path = self.registry["embedding-lmdb-path"] self.env = None if load: self.make_embeddings_simple(name) self.static_embed_size = self.embed_size self.bilm = None self.use_cache = use_cache # below init for using ELMo embeddings self.use_ELMo = use_ELMo if use_ELMo: self.make_ELMo() self.embed_size = ELMo_embed_size + self.embed_size description = self.get_description('elmo-' + self.lang) self.env_ELMo = None if description and description["cache-training"] and self.use_cache: self.embedding_ELMo_cache = os.path.join( description["path-cache"], "cache") # clean possible remaining cache self.clean_ELMo_cache() # create and load a cache in write mode, it will be used only for training self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, map_size=map_size) # below init for using BERT embeddings (extracted features only, not fine tuning), # similar to ELMo for this usage self.use_BERT = use_BERT if use_BERT: # to avoid issue with tf graph and thread, we maintain in the class its own graph and session #self.session = tf.Session() self.graph = tf.get_default_graph() #self.session.run(tf.global_variables_initializer()) self.make_BERT() self.embed_size = BERT_embed_size + self.embed_size description = self.get_description('bert-base-' + self.lang) self.env_BERT = None if description and description["cache-training"] and self.use_cache: self.embedding_BERT_cache = os.path.join( description["path-cache"], "cache") # clean possible remaining cache self.clean_BERT_cache() # create and load a cache in write mode, it will be used only for training self.env_BERT = lmdb.open(self.embedding_BERT_cache, map_size=map_size) def __getattr__(self, name): return getattr(self.model, name) def _load_embedding_registry(self, path='./embedding-registry.json'): """ Load the description of available embeddings. 
Each description provides a name, a file path (used only if necessary) and a embeddings type (to take into account small variation of format) """ registry_json = open(path).read() return json.loads(registry_json) def make_embeddings_simple_in_memory(self, name="fasttext-crawl"): nbWords = 0 print('loading embeddings...') begin = True description = self.get_description(name) if description is not None: embeddings_path = description["path"] self.lang = description["lang"] print("path:", embeddings_path) if self.extension == 'bin': self.model = fastText.load_model(embeddings_path) nbWords = len(self.model.get_words()) self.embed_size = self.model.get_dimension() else: with open(embeddings_path, encoding='utf8') as f: for line in f: line = line.strip() line = line.split(' ') if begin: begin = False nb_words, embed_size = _fetch_header_if_available( line) # we parse the header if nb_words > 0 and embed_size > 0: nbWords = nb_words self.embed_size = embed_size continue word = line[0] vector = np.array( [float(val) for val in line[1:len(line)]], dtype='float32') #else: # vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32') if self.embed_size == 0: self.embed_size = len(vector) self.model[word] = vector if nbWords == 0: nbWords = len(self.model) print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions") def make_embeddings_lmdb(self, name="fasttext-crawl"): print( '\nCompiling embeddings... (this is done only one time per embeddings at first usage)' ) description = self.get_description(name) if description is None: print( '\nNo description found in embeddings registry for embeddings', name) return if description is not None: # the following method will possibly download the mebedding file if not available locally embeddings_path = self.get_embedding_path(description) if embeddings_path is None: print('\nCould not locate a usable resource for embeddings', name) return self.load_embeddings_from_file(embeddings_path) # cleaning possible downloaded embeddings self.clean_downloads() def load_embeddings_from_file(self, embeddings_path): begin = True nbWords = 0 txn = self.env.begin(write=True) # batch_size = 1024 i = 0 nb_lines = 0 # read number of lines first embedding_file = open_embedding_file(embeddings_path) if embedding_file is None: print("Error: could not open embeddings file", embeddings_path) return for line in embedding_file: nb_lines += 1 embedding_file.close() embedding_file = open_embedding_file(embeddings_path) #with open(embeddings_path, encoding='utf8') as f: for line in tqdm(embedding_file, total=nb_lines): line = line.decode() line = line.split(' ') if begin: begin = False nb_words, embed_size = _fetch_header_if_available(line) if nb_words > 0 and embed_size > 0: nbWords = nb_words self.embed_size = embed_size continue word = line[0] try: if line[len(line) - 1] == '\n': vector = np.array( [float(val) for val in line[1:len(line) - 1]], dtype='float32') else: vector = np.array( [float(val) for val in line[1:len(line)]], dtype='float32') #vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32') except: print(len(line)) print(line[1:len(line)]) #else: # vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32') if self.embed_size == 0: self.embed_size = len(vector) if len(word.encode(encoding='UTF-8')) < self.env.max_key_size(): txn.put(word.encode(encoding='UTF-8'), _serialize_pickle(vector)) #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector)) i += 1 # commit batch # if i % 
batch_size == 0: # txn.commit() # txn = self.env.begin(write=True) embedding_file.close() #if i % batch_size != 0: txn.commit() if nbWords == 0: nbWords = i self.vocab_size = nbWords print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions") def clean_downloads(self): # cleaning possible downloaded embeddings for filename in os.listdir(self.registry['embedding-download-path']): file_path = os.path.join(self.registry['embedding-download-path'], filename) try: if os.path.isfile(file_path) or os.path.islink(file_path): os.unlink(file_path) elif os.path.isdir(file_path): shutil.rmtree(file_path) except Exception as e: print('Failed to delete %s. Reason: %s' % (file_path, e)) def make_embeddings_simple(self, name="fasttext-crawl"): description = self.get_description(name) if description is not None: self.extension = description["format"] if self.extension == "bin": if fasttext_support == True: print( "embeddings are of .bin format, so they will be loaded in memory..." ) self.make_embeddings_simple_in_memory(name) else: if not (sys.platform == 'linux' or sys.platform == 'darwin'): raise ValueError( 'FastText .bin format not supported for your platform') else: raise ValueError( 'Go to the documentation to get more information on how to install FastText .bin support' ) elif self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None": print( "embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory..." ) self.make_embeddings_simple_in_memory(name) else: # if the path to the lmdb database files does not exist, we create it if not os.path.isdir(self.embedding_lmdb_path): # conservative check (likely very useless) if not os.path.exists(self.embedding_lmdb_path): os.makedirs(self.embedding_lmdb_path) # check if the lmdb database exists envFilePath = os.path.join(self.embedding_lmdb_path, name) load_db = True if os.path.isdir(envFilePath): description = self.get_description(name) if description is not None: self.lang = description["lang"] # open the database in read mode self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=4) if self.env: # we need to set self.embed_size and self.vocab_size with self.env.begin() as txn: stats = txn.stat() size = stats['entries'] self.vocab_size = size with self.env.begin() as txn: cursor = txn.cursor() for key, value in cursor: vector = _deserialize_pickle(value) self.embed_size = vector.shape[0] break cursor.close() if self.vocab_size > 100 and self.embed_size > 10: # lmdb database exists and looks valid load_db = False # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! self.env.close() self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2) if load_db: # create and load the database in write mode self.env = lmdb.open(envFilePath, map_size=map_size) self.make_embeddings_lmdb(name) def make_ELMo(self): # Location of pretrained BiLM for the specified language # TBD check if ELMo language resources are present description = self.get_description('elmo-' + self.lang) if description is not None: self.lang = description["lang"] vocab_file = description["path-vocab"] options_file = description["path-config"] weight_file = description["path_weights"] print('init ELMo') # Create a Batcher to map text to character ids self.batcher = Batcher(vocab_file, 50) # Build the biLM graph. 
self.bilm = BidirectionalLanguageModel(self.lang, options_file, weight_file) # Input placeholders to the biLM. self.character_ids = tf.placeholder('int32', shape=(None, None, 50)) with tf.variable_scope(self.lang, reuse=tf.AUTO_REUSE): # the reuse=True scope reuses weights from the whole context self.embeddings_op = self.bilm(self.character_ids) self.elmo_input = weight_layers('input', self.embeddings_op, l2_coef=0.0) def make_BERT(self): # Location of BERT model description = self.get_description('bert-base-' + self.lang) if description is not None: self.lang = description["lang"] config_file = description["path-config"] weight_file = description["path-weights"] vocab_file = description["path-vocab"] print('init BERT') # load the pretrained model with self.graph.as_default(): # there are different typical pooling strategies for getting BERT features: # - concatenation of 4 last layers (the one from the original BERT paper, BERT_embed_size is then 3072) # - last layer (BERT_embed_size is 768) # - average of 4 last layers (BERT_embed_size is 768) # - sum of the 4 last layers (BERT_embed_size is 768) self.bert_model = load_trained_model_from_checkpoint( config_file, weight_file, output_layer_num=4) self.bert_model.summary(line_length=120) self.bert_model._make_predict_function() # init the tokenizer token_dict = {} with codecs.open(vocab_file, 'r', 'utf8') as reader: for line in reader: token = line.strip() token_dict[token] = len(token_dict) print('token_dict size:', len(token_dict)) self.bert_tokenizer = Tokenizer(token_dict, cased=True) def get_sentence_vector_only_ELMo(self, token_list): """ Return the ELMo embeddings only for a full sentence """ if not self.use_ELMo: print( "Warning: ELMo embeddings requested but embeddings object wrongly initialised" ) return # Create batches of data local_token_ids = self.batcher.batch_sentences(token_list) max_size_sentence = local_token_ids[0].shape[0] # check lmdb cache elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence) if elmo_result is not None: return elmo_result with tf.Session() as sess: # weird, for this cpu is faster than gpu (1080Ti !) with tf.device("/cpu:0"): # It is necessary to initialize variables once before running inference sess.run(tf.global_variables_initializer()) # Compute ELMo representations (2 times as a heavy warm-up) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) #cache computation self.cache_ELMo_lmdb_vector(token_list, elmo_result) return elmo_result def get_sentence_vector_with_ELMo(self, token_list): """ Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings for a full sentence """ if not self.use_ELMo: print( "Warning: ELMo embeddings requested but embeddings object wrongly initialised" ) return #print("\ntoken_list:", token_list) local_token_ids = self.batcher.batch_sentences(token_list) #print("local_token_ids:", local_token_ids) max_size_sentence = local_token_ids[0].shape[0] elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence) if elmo_result is None: with tf.Session() as sess: # weird, for this cpu is faster than gpu (1080Ti !) 
with tf.device("/cpu:0"): # It is necessary to initialize variables once before running inference sess.run(tf.global_variables_initializer()) # Compute ELMo representations (2 times as a heavy warm-up) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) #cache computation self.cache_ELMo_lmdb_vector(token_list, elmo_result) concatenated_result = np.zeros( (len(token_list), max_size_sentence - 2, self.embed_size), dtype=np.float32) #concatenated_result = np.random.rand(elmo_result.shape[0], max_size_sentence-2, self.embed_size) for i in range(0, len(token_list)): for j in range(0, len(token_list[i])): #if is_int(token_list[i][j]) or is_float(token_list[i][j]): #dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32) #concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), ) #else: concatenated_result[i][j] = np.concatenate( (elmo_result[i][j], self.get_word_vector( token_list[i][j]).astype('float32')), ) #concatenated_result[i][j] = np.concatenate((self.get_word_vector(token_list[i][j]), elmo_result[i][j]), ) return concatenated_result def get_sentence_vector_only_BERT(self, token_list): """ Return the BERT extracted embeddings only for a full sentence """ if not self.use_BERT: print( "Warning: BERT embeddings requested but embeddings object wrongly initialised" ) return #print("local_token_ids:", local_token_ids) max_size_token_list = 0 for i, sentence in enumerate(token_list): if len(sentence) > max_size_token_list: max_size_token_list = len(sentence) # retokenize with BERT tokenizer max_size = BERT_sentence_size max_size_sentence = 0 new_token_list = [] bert_results = np.zeros((len(token_list), max_size, BERT_embed_size), dtype=np.float32) for i, sentence in enumerate(token_list): local_text = " ".join(sentence) local_tokens = self.bert_tokenizer.tokenize(local_text) bert_result = self.get_BERT_lmdb_vector(sentence) if bert_result is None: indices, segments = self.bert_tokenizer.encode( local_text, max_len=max_size) with self.graph.as_default(): bert_result = self.bert_model.predict( [np.array([indices]), np.array([segments])])[0] #cache computation if bert_result is not None: self.cache_BERT_lmdb_vector(sentence, bert_result) # Realign BERT tokenization with the provided tokenization. Normally BERT segmenter always # over-segment as compared to DeLFT segmenter. # There are two obvious possibilities to combine subtoken embeddings into token embeddings, # either take the embeddings of the last subtoken, of use the average vector of the subtokens. 
new_bert_result = np.zeros((max_size, BERT_embed_size), dtype=np.float32) token_tensor = [] tid = 0 buffer = '' #print(sentence) #print(local_tokens) for j, t in enumerate(local_tokens): if j >= max_size: break if t == '[CLS]' or t == '[SEP]': continue else: if t.startswith('##'): t = t[2:] buffer += t #print(buffer) token_tensor.append(bert_result[j]) if buffer == sentence[tid]: # average vector of the subtokens new_bert_result[tid] = np.stack(token_tensor).mean( axis=0) # or last subtoken vector #new_bert_result[tid] = token_tensor[-1] token_tensor = [] buffer = '' tid += 1 bert_result = new_bert_result if bert_result is not None: bert_results[i] = bert_result # we need to squeze the vector to max_size_token_list squeezed_bert_results = np.zeros( (len(token_list), max_size_token_list, BERT_embed_size), dtype=np.float32) for i, sentence in enumerate(token_list): squeezed_bert_results[i] = bert_results[i][:max_size_token_list] return squeezed_bert_results def get_sentence_vector_with_BERT(self, token_list): """ Return a concatenation of standard embeddings (e.g. Glove) and BERT extracted embeddings for a full sentence """ if not self.use_BERT: print( "Warning: BERT embeddings requested but embeddings object wrongly initialised" ) return max_size_token_list = 0 for i, sentence in enumerate(token_list): if len(sentence) > max_size_token_list: max_size_token_list = len(sentence) squeezed_bert_results = self.get_sentence_vector_only_BERT(token_list) concatenated_squeezed_result = np.zeros( (len(token_list), max_size_token_list, self.embed_size), dtype=np.float32) for i, sentence in enumerate(token_list): for j in range(0, len(token_list[i])): concatenated_squeezed_result[i][j] = np.concatenate( (squeezed_bert_results[i][j], self.get_word_vector( token_list[i][j]).astype('float32')), ) return concatenated_squeezed_result def get_description(self, name): for emb in self.registry["embeddings"]: if emb["name"] == name: return emb for emb in self.registry["embeddings-contextualized"]: if emb["name"] == name: return emb for emb in self.registry["transformers"]: if emb["name"] == name: return emb return None def get_word_vector(self, word): """ Get static embeddings (e.g. glove) for a given token """ if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'): # the pre-trained embeddings are not cased word = word.lower() if self.env is None or self.extension == 'bin': # db not available or embeddings in bin format, the embeddings should be available in memory (normally!) return self.get_word_vector_in_memory(word) try: with self.env.begin() as txn: vector = txn.get(word.encode(encoding='UTF-8')) if vector: word_vector = _deserialize_pickle(vector) vector = None else: word_vector = np.zeros((self.static_embed_size, ), dtype=np.float32) # alternatively, initialize with random negative values #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,)) # alternatively use fasttext OOV ngram possibilities (if ngram available) except lmdb.Error: # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! 
self.env.close() envFilePath = os.path.join(self.embedding_lmdb_path, self.name) self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2, lock=False) return self.get_word_vector(word) return word_vector def get_ELMo_lmdb_vector(self, token_list, max_size_sentence): """ Try to get the ELMo embeddings for a sequence cached in LMDB """ if self.env_ELMo is None: # db cache not available, we don't cache ELMo stuff return None try: ELMo_vector = np.zeros( (len(token_list), max_size_sentence - 2, ELMo_embed_size), dtype='float32') with self.env_ELMo.begin() as txn: for i in range(0, len(token_list)): txn = self.env_ELMo.begin() # get a hash for the token_list the_hash = list_digest(token_list[i]) vector = txn.get(the_hash.encode(encoding='UTF-8')) if vector: # adapt expected shape/padding local_embeddings = _deserialize_pickle(vector) if local_embeddings.shape[0] > max_size_sentence - 2: # squeeze the extra padding space ELMo_vector[ i] = local_embeddings[:max_size_sentence - 2, ] elif local_embeddings.shape[ 0] == max_size_sentence - 2: # bingo~! ELMo_vector[i] = local_embeddings else: # fill the missing space with padding filler = np.zeros((max_size_sentence - (local_embeddings.shape[0] + 2), ELMo_embed_size), dtype='float32') ELMo_vector[i] = np.concatenate( (local_embeddings, filler)) vector = None else: return None except lmdb.Error: # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! self.env_ELMo.close() self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False) return self.get_ELMo_lmdb_vector(token_list) return ELMo_vector def get_BERT_lmdb_vector(self, sentence): """ Try to get the BERT extracted embeddings for a sequence cached in LMDB """ if self.env_BERT is None: # db cache not available, we don't cache ELMo stuff return None try: BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size), dtype='float32') with self.env_BERT.begin() as txn: txn = self.env_BERT.begin() # get a hash for the token_list the_hash = list_digest(sentence) vector = txn.get(the_hash.encode(encoding='UTF-8')) if vector: # adapt expected shape/padding BERT_vector = _deserialize_pickle(vector) ''' if local_embeddings.shape[0] > max_size_sentence: # squeeze the extra padding space BERT_vector = local_embeddings[:max_size_sentence,] elif local_embeddings.shape[0] == max_size_sentence: # bingo~! BERT_vector = local_embeddings else: # fill the missing space with padding filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32') BERT_vector = np.concatenate((local_embeddings, filler)) ''' vector = None else: return None except lmdb.Error: # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! 
self.env_BERT.close() self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False) return self.get_BERT_lmdb_vector(sentence) return BERT_vector def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector): """ Cache in LMDB the ELMo embeddings for a given sequence """ if self.env_ELMo is None: # db cache not available, we don't cache ELMo stuff return None txn = self.env_ELMo.begin(write=True) for i in range(0, len(token_list)): # get a hash for the token_list the_hash = list_digest(token_list[i]) txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i])) txn.commit() def cache_BERT_lmdb_vector(self, sentence, BERT_vector): """ Cache in LMDB the BERT embeddings for a given sequence """ if self.env_BERT is None: # db cache not available, we don't cache BERT stuff return None txn = self.env_BERT.begin(write=True) #for i in range(0, len(sentence)): # get a hash for the token_list the_hash = list_digest(sentence) txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(BERT_vector)) txn.commit() def clean_ELMo_cache(self): """ Delete ELMo embeddings cache, this takes place normally after the completion of a training """ if self.env_ELMo is None: # db cache not available, nothing to clean return else: self.env_ELMo.close() self.env_ELMo = None for file in os.listdir(self.embedding_ELMo_cache): file_path = os.path.join(self.embedding_ELMo_cache, file) if os.path.isfile(file_path): os.remove(file_path) os.rmdir(self.embedding_ELMo_cache) def clean_BERT_cache(self): """ Delete BERT embeddings cache, this takes place normally after the completion of a training """ # if cache subdirectory does not exist, we create it if not os.path.exists(self.embedding_BERT_cache): os.makedirs(self.embedding_BERT_cache) return if self.env_BERT is None: # db cache not available, nothing to clean return else: self.env_BERT.close() self.env_BERT = None for file in os.listdir(self.embedding_BERT_cache): file_path = os.path.join(self.embedding_BERT_cache, file) if os.path.isfile(file_path): os.remove(file_path) os.rmdir(self.embedding_BERT_cache) def get_word_vector_in_memory(self, word): if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'): # the pre-trained embeddings are not cased word = word.lower() if self.extension == 'bin': return self.model.get_word_vector(word) if word in self.model: return self.model[word] else: # for unknown word, we use a vector filled with 0.0 return np.zeros((self.static_embed_size, ), dtype=np.float32) # alternatively, initialize with random negative values #return np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,)) # alternatively use fasttext OOV ngram possibilities (if ngram available) def get_embedding_path(self, description): embeddings_path = None if "path" in description: embeddings_path = description["path"] self.lang = description["lang"] if embeddings_path is None or not os.path.isfile(embeddings_path): print("error: embedding path for", description['name'], "is not valid", embeddings_path) if "url" in description and len(description["url"]) > 0: url = description["url"] download_path = self.registry['embedding-download-path'] # if the download path does not exist, we create it if not os.path.isdir(download_path): try: os.mkdir(download_path) except OSError: print("Creation of the download directory", download_path, "failed") print("Downloading resource file for", description['name'], "...") embeddings_path = download_file(url, download_path) if embeddings_path != None and 
os.path.isfile(embeddings_path): print("Download sucessful:", embeddings_path) else: print( "no download url available for this embeddings resource, please review the embedding registry for", description['name']) return embeddings_path
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 23 15:35:48 2021
@author: xiuzhang
"""
from keras_bert import Tokenizer

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}

# Tokenizer
tokenizer = Tokenizer(token_dict)
print(tokenizer.tokenize('unaffable'))  # split the word into word pieces

indices, segments = tokenizer.encode('unaffable')
print(indices)   # vocabulary index of each token
print(segments)  # whether the token at each position belongs to the first or the second sentence

print(tokenizer.tokenize(first='unaffable', second='钢'))
indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)
print(segments)
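# The encoded ids can also be mapped back to word pieces with Tokenizer.decode, as the
# unit tests above do; a minimal sketch on the same toy vocabulary (the expected output
# is an assumption: the [CLS]/[SEP] markers are stripped and unknown ids come back as '[UNK]').
ids, _ = tokenizer.encode('unaffable')
print(tokenizer.decode(ids))  # expected: ['un', '##aff', '##able']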
class KerasBertVector(): def __init__(self): self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len # 全局使用,使其可以django、flask、tornado等调用 global graph graph = tf.get_default_graph() global model model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path, seq_len=self.max_seq_len) model.summary(120) # 如果只选一层,就只取对应那一层的weight if len(layer_indexes) == 1: encoder_layer = model.get_layer(index=len(model.layers)-2).output # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 else: # layer_indexes must be [1,2,3,......12] all_layers = [model.get_layer(index=lay).output for lay in layer_indexes] encoder_layer = k_keras.concatenate(all_layers, -1) output_layer = NonMaskingLayer()(encoder_layer) model = Model(model.inputs, output_layer) # reader tokenizer self.token_dict = {} with codecs.open(self.dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() self.token_dict[token] = len(self.token_dict) self.tokenizer = Tokenizer(self.token_dict) def bert_encode(self, texts): # 文本预处理 input_ids = [] input_masks = [] input_type_ids = [] for text in texts: print(text) tokens_text = self.tokenizer.tokenize(text) print('Tokens:', tokens_text) input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) # 全局使用,使其可以django、flask、tornado等调用 with graph.as_default(): predicts = model.predict([input_ids, input_type_ids], batch_size=1) print(predicts.shape) for i, token in enumerate(tokens_text): print(token, [len(predicts[0][i].tolist())], predicts[0][i].tolist()) # 相当于pool,采用的是https://github.com/terrifyzhao/bert-utils/blob/master/graph.py mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1) masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9) pooled = masked_reduce_mean(predicts[0][-1], input_masks) pooled = pooled.tolist() print('bert:', pooled) return pooled
from keras_bert.datasets import get_pretrained, PretrainedList

model_path = get_pretrained(PretrainedList.chinese_base)  # download the Chinese pre-trained model
paths = get_checkpoint_paths(model_path)

model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=10)
model.summary(line_length=120)
plot_model(model, to_file="keras_bert.png", show_shapes=True)

# does the loss determine the [SEP] marker?
token_dict = load_vocabulary(paths.vocab)
tokenizer = Tokenizer(token_dict)

text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)
print("indices:", indices)
print("segments:", segments)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])  # extract word embedding

# load and predict
model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, training=True, seq_len=None)
class BertBiLstmModel(): def __init__(self): # logger.info("BertBiLstmModel init start!") print("BertBiLstmModel init start!") self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len # reader tokenizer self.token_dict = {} with codecs.open(self.dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() self.token_dict[token] = len(self.token_dict) self.tokenizer = Tokenizer(self.token_dict) # 你可以选择一个model build,有bi-lstm single、bi-lstm 3-layers、bi-lstm_attention # self.build_model_bilstm_layers() # self.build_model_bilstm_single() self.build_model_bilstm_attention() # logger.info("BertBiLstmModel init end!") print("BertBiLstmModel init end!") def process_single(self, texts): # 文本预处理,传入一个list,返回的是ids\mask\type-ids input_ids = [] input_masks = [] input_type_ids = [] for text in texts: logger.info(text) tokens_text = self.tokenizer.tokenize(text) logger.info('Tokens:', tokens_text) input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) # numpy处理list input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) logger.info("process ok!") return input_ids, input_masks, input_type_ids def process_pair(self, textss): # 文本预处理,传入一个list,返回的是ids\mask\type-ids input_ids = [] input_masks = [] input_type_ids = [] for texts in textss: tokens_text = self.tokenizer.tokenize(texts[0]) logger.info('Tokens1:', tokens_text) tokens_text2 = self.tokenizer.tokenize(texts[1]) logger.info('Tokens2:', tokens_text2) input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) # numpy处理list input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) logger.info("process ok!") return input_ids, input_masks, input_type_ids def build_model_bilstm_layers(self): if args.use_lstm: if args.use_cudnn_cell: layer_cell = CuDNNLSTM else: layer_cell = LSTM else: if args.use_cudnn_cell: layer_cell = CuDNNGRU else: layer_cell = GRU # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # bert_output = bert_output[:0:] # layer_get_cls = Lambda(lambda x: x[:, 0:1, :]) # bert_output = layer_get_cls(bert_output) # print("layer_get_cls:") # print(bert_output.shape) # Bi-LSTM x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2(args.l2) ))(bert_output) # blstm_layer = TimeDistributed(Dropout(args.keep_prob))(blstm_layer) 这个用不了,好像是输入不对, dims<3吧 x = Dropout(args.keep_prob)(x) x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2(args.l2)))(x) x = Dropout(args.keep_prob)(x) x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2(args.l2)))(x) x = Dropout(args.keep_prob)(x) # 平均池化、最大池化拼接 avg_pool = GlobalAvgPool1D()(x) max_pool = GlobalMaxPool1D()(x) print(max_pool.shape) print(avg_pool.shape) concat = 
concatenate([avg_pool, max_pool]) x = Dense(int(args.units / 4), activation="relu")(concat) x = Dropout(args.keep_prob)(x) # 最后就是softmax dense_layer = Dense(args.label, activation=args.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def build_model_bilstm_attention(self): if args.use_lstm: if args.use_cudnn_cell: layer_cell = CuDNNLSTM else: layer_cell = LSTM else: if args.use_cudnn_cell: layer_cell = CuDNNGRU else: layer_cell = GRU # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # Bi-LSTM x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2(args.l2) ))(bert_output) x = TimeDistributed(Dropout(args.keep_prob))(x) # 这个用不了,好像是输入不对, dims<3吧 x = attention(x) x = Flatten()(x) x = Dropout(args.keep_prob)(x) # # 平均池化、最大池化拼接 # avg_pool = GlobalAvgPool1D()(x) # max_pool = GlobalMaxPool1D()(x) # print(max_pool.shape) # print(avg_pool.shape) # concat = concatenate([avg_pool, max_pool]) # x = Dense(int(args.units/4), activation="relu")(concat) # x = Dropout(args.keep_prob)(x) # 最后就是softmax dense_layer = Dense(args.label, activation=args.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def build_model_bilstm_single(self): if args.use_lstm: if args.use_cudnn_cell: layer_cell = CuDNNLSTM else: layer_cell = LSTM else: if args.use_cudnn_cell: layer_cell = CuDNNGRU else: layer_cell = GRU # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # Bi-LSTM x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2(args.l2) ))(bert_output) x = Dropout(args.keep_prob)(x) # 最后就是softmax dense_layer = Dense(args.label, activation=args.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def compile_model(self): self.model.compile(optimizer=args.optimizers, loss=categorical_crossentropy, metrics=args.metrics) def callback(self): cb = [ModelCheckpoint(args.path_save_model, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='min'), EarlyStopping(min_delta=1e-8, patience=10, mode='min'), ReduceLROnPlateau(factor=0.2, patience=6, verbose=0, mode='min', epsilon=1e-6, cooldown=4, min_lr=1e-8) ] return cb def fit(self, x_train, y_train, x_dev, y_dev): self.model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs, validation_data=(x_dev, y_dev), shuffle=True, callbacks=self.callback()) self.model.save(args.path_save_model) def load_model(self): print("BertBiLstmModel load_model start!") # logger.info("BertBiLstmModel load_model start!") self.model.load_weights(args.path_save_model) # logger.info("BertBiLstmModel load_model end+!") print("BertBiLstmModel load_model end+!") def predict(self, sen_1, sen_2): input_ids, input_masks, input_type_ids = self.process_pair([[sen_1, sen_2]]) return self.model.predict([input_ids, input_masks], batch_size=1) def predict_list(self, questions): label_preds = [] for questions_pair in questions: input_ids, input_masks, input_type_ids = self.process_pair([questions_pair]) label_pred = self.model.predict([input_ids, input_masks], batch_size=1) label_preds.append(label_pred[0]) return label_preds
class KerasBertVector (): def __init__(self): self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len # 全局使用,使其可以django、flask、tornado等调用 # global graph # graph = tf.compat.v1.get_default_graph () global model model = load_trained_model_from_checkpoint (self.config_path, self.checkpoint_path, seq_len=self.max_seq_len) # print (model.output) # print (len (model.layers)) # lay = model.layers # 一共104个layer,其中前八层包括token,pos,embed等, # 每4层(MultiHeadAttention,Dropout,Add,LayerNormalization) # 一共24层 layer_dict = [7] layer_0 = 7 for i in range (12): layer_0 = layer_0 + 4 layer_dict.append (layer_0) # 输出它本身 if len (layer_indexes) == 0: encoder_layer = model.output # 分类如果只有一层,就只取最后那一层的weight,取得不正确 elif len (layer_indexes) == 1: if layer_indexes[0] in [i + 1 for i in range (12)]: encoder_layer = model.get_layer (index=layer_dict[layer_indexes[0]]).output else: encoder_layer = model.get_layer (index=layer_dict[-2]).output # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 else: # layer_indexes must be [1,2,3,......12...24] # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes] all_layers = [model.get_layer (index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range (12)] else model.get_layer (index=layer_dict[-1]).output # 如果给出不正确,就默认输出最后一层 for lay in layer_indexes] # print (layer_indexes) # print (all_layers) # 其中layer==1的output是格式不对,第二层输入input是list all_layers_select = [] for all_layers_one in all_layers: all_layers_select.append (all_layers_one) encoder_layer = Add () (all_layers_select) # print (encoder_layer.shape) # print ("KerasBertEmbedding:") # print (encoder_layer.shape) output_layer = NonMaskingLayer () (encoder_layer) model = Model (model.inputs, output_layer) # model.summary(120) # reader tokenizer self.token_dict = {} with codecs.open (self.dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip () self.token_dict[token] = len (self.token_dict) self.tokenizer = Tokenizer (self.token_dict) def bert_encode_sen(self, texts): # 文本预处理 input_ids = [] input_masks = [] input_type_ids = [] for text in texts: # print (text) tokens_text = self.tokenizer.tokenize (text) # print ('Tokens:', tokens_text) input_id, input_type_id = self.tokenizer.encode (first=text, max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append (input_id) input_type_ids.append (input_type_id) input_masks.append (input_mask) input_ids = np.array (input_ids) input_masks = np.array (input_masks) input_type_ids = np.array (input_type_ids) # 全局使用,使其可以django、flask、tornado等调用 # with graph.as_default (): predicts = model.predict ([input_ids, input_type_ids], batch_size=1) # print (predicts.shape) # for i, token in enumerate (tokens_text): # (token, [len (predicts[0][i].tolist ())], predicts[0][i].tolist ()) # 相当于pool,采用的是https://github.com/terrifyzhao/bert-utils/blob/master/graph.py mul_mask = lambda x, m: x * np.expand_dims (m, axis=-1) masked_reduce_mean = lambda x, m: np.sum (mul_mask (x, m), axis=1) / (np.sum (m, axis=1, keepdims=True) + 1e-9) pools = [] for i in range (len (predicts)): pred = predicts[i] masks = input_masks.tolist () mask_np = np.array ([masks[i]]) pooled = masked_reduce_mean (pred, mask_np) pooled = pooled.tolist () pools.append (pooled[0]) # print ('bert:', pools) return pools def bert_encode_word(self, texts): # 文本预处理 input_ids = [] input_masks = [] input_type_ids = [] for text in texts: # print (text) 
tokens_text = self.tokenizer.tokenize (text) # print ('Tokens:', tokens_text) input_id, input_type_id = self.tokenizer.encode (first=text, max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append (input_id) input_type_ids.append (input_type_id) input_masks.append (input_mask) input_ids = np.array (input_ids) input_masks = np.array (input_masks) input_type_ids = np.array (input_type_ids) # 全局使用,使其可以django、flask、tornado等调用 # with graph.as_default (): predicts = model.predict ([input_ids, input_type_ids], batch_size=1) # print (predicts.shape) # for i, token in enumerate (tokens_text): # (token, [len (predicts[0][i].tolist ())], predicts[0][i].tolist ()) words_vec=predicts[0][1:len(tokens_text)-1] words_vec = np.array (words_vec) words_vec = (words_vec.astype (np.float32)) ret=[] for i in words_vec: ret.append(i) return ret def gen_sen_vec(self,sen): pooled = self.bert_encode_sen([sen]) vec = pooled[0] vec = np.array (vec) vec.tolist () return vec def gen_words_vec(self,sen): pooled = self.bert_encode_word ([sen]) vec = pooled[0] vec = np.array (vec) vec.tolist () return vec
except: token_dict = {} with codecs.open('uncased_L-12_H-768_A-12/vocab.txt', 'r', 'utf8') as reader: for line in reader: token = line.strip() token_dict[token] = len(token_dict) with open('bert_token_dict.pkl', 'wb') as f: pickle.dump(token_dict, f) # with codecs.open('uncased_L-12_H-768_A-12/vocab.txt', 'r', 'utf8') as reader: # vocab = [line.strip() for line in reader] tokenizer = Tokenizer(token_dict) tokens = [ tokenizer.tokenize(" ".join(sentence)) for sentence in train_sentences + test_sentences ] maxlen = max([len(sentence) for sentence in tokens]) for i, sentence in enumerate(tokens): while len(tokens[i]) < maxlen: tokens[i].append('[PAD]') # print(os.getcwd()) # print(len(tokens[5])) print('maxlen_bert :', maxlen) # indices, segments = tokenizer.encode(first=' '.join(test_sentences[0]), max_len=maxlen) # print(indices) # print(" ".join(get_word(w) for w in indices)) # for w in indices: # print
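# A small follow-up sketch for the snippet above (token_dict and the padded `tokens` lists are
# reused from it; numpy is an added import): turn the padded token lists into index and segment
# arrays by looking tokens up in the vocabulary, mirroring the commented-out tokenizer.encode call.
import numpy as np

unk_id = token_dict['[UNK]']
token_ids = np.array([[token_dict.get(t, unk_id) for t in sentence] for sentence in tokens])
segment_ids = np.zeros_like(token_ids)  # single-sentence inputs, so every segment id is 0
print(token_ids.shape, segment_ids.shape)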
from keras_bert import Tokenizer token_dict = { '[CLS]': 0, '[SEP]': 1, 'un': 2, '##aff': 3, '##able': 4, '[UNK]': 5, } tokenizer = Tokenizer(token_dict) print(tokenizer.tokenize('unaffable')) # The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]']` indices, segments = tokenizer.encode('unaffable') print(indices) # Should be `[0, 2, 3, 4, 1]` print(segments) # Should be `[0, 0, 0, 0, 0]`
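# Continuing the keras_bert README example above: the same toy vocabulary also illustrates
# sentence-pair encoding. '钢' is not in token_dict, so encode maps it to '[UNK]'; the output
# is padded to max_len and the second sentence gets segment id 1 (values as documented upstream).
print(tokenizer.tokenize(first='unaffable', second='钢'))
# The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]', '钢', '[SEP]']`
indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)   # Should be `[0, 2, 3, 4, 1, 5, 1, 0, 0, 0]`
print(segments)  # Should be `[0, 0, 0, 0, 0, 1, 1, 0, 0, 0]`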
class BertTextCnnModel(): def __init__(self): # logger.info("BertTextCnnModel init start!") print("BertTextCnnModel init start!") self.config_path, self.checkpoint_path, self.dict_path = config_name, ckpt_name, vocab_file self.max_seq_len, self.filters, self.embedding_dim, self.keep_prob = args.max_seq_len, args.filters, args.embedding_dim, args.keep_prob self.activation, self.label = args.activation, args.label # reader tokenizer self.token_dict = {} with codecs.open(self.dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() self.token_dict[token] = len(self.token_dict) self.tokenizer = Tokenizer(self.token_dict) # the model here can be text-cnn, r-cnn or avt-cnn # self.build_model_text_cnn() # self.build_model_r_cnn() self.build_model_avt_cnn() # logger.info("BertTextCnnModel init end!") print("BertTextCnnModel init end!") def build_model_text_cnn(self): ######### text-cnn ######### # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # text cnn bert_output_embed = SpatialDropout1D(rate=self.keep_prob)(bert_output) concat_out = [] for index, filter_size in enumerate(self.filters): x = Conv1D(name='TextCNN_Conv1D_{}'.format(index), filters=int(self.embedding_dim / 2), kernel_size=self.filters[index], padding='valid', kernel_initializer='normal', activation='relu')(bert_output_embed) x = GlobalMaxPooling1D( name='TextCNN_MaxPool1D_{}'.format(index))(x) concat_out.append(x) x = Concatenate(axis=1)(concat_out) x = Dropout(self.keep_prob)(x) # finally the softmax dense_layer = Dense(self.label, activation=self.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def build_model_r_cnn(self): ######### RCNN ######### # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # rcnn bert_output_embed = SpatialDropout1D(rate=self.keep_prob)(bert_output) if args.use_lstm: if args.use_cudnn_cell: layer_cell = CuDNNLSTM else: layer_cell = LSTM else: if args.use_cudnn_cell: layer_cell = CuDNNGRU else: layer_cell = GRU x = Bidirectional( layer_cell(units=args.units, return_sequences=args.return_sequences, kernel_regularizer=regularizers.l2(args.l2 * 0.1), recurrent_regularizer=regularizers.l2( args.l2)))(bert_output_embed) x = Dropout(args.keep_prob)(x) x = Conv1D(filters=int(self.embedding_dim / 2), kernel_size=2, padding='valid', kernel_initializer='normal', activation='relu')(x) x = GlobalMaxPooling1D()(x) x = Dropout(args.keep_prob)(x) # finally the softmax dense_layer = Dense(self.label, activation=self.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def build_model_avt_cnn(self): ######### avt-cnn ######### # bert embedding bert_inputs, bert_output = KerasBertEmbedding().bert_encode() # text cnn bert_output_embed = SpatialDropout1D(rate=self.keep_prob)(bert_output) concat_x = [] concat_y = [] concat_z = [] for index, filter_size in enumerate(self.filters): conv = Conv1D(name='TextCNN_Conv1D_{}'.format(index), filters=int(self.embedding_dim / 2), kernel_size=self.filters[index], padding='valid', kernel_initializer='normal', activation='relu')(bert_output_embed) x = GlobalMaxPooling1D( name='TextCNN_MaxPooling1D_{}'.format(index))(conv) y = GlobalAveragePooling1D( name='TextCNN_AveragePooling1D_{}'.format(index))(conv) z = AttentionWeightedAverage( name='TextCNN_Attention_{}'.format(index))(conv) concat_x.append(x) concat_y.append(y) concat_z.append(z) merge_x = Concatenate(axis=1)(concat_x) merge_y = Concatenate(axis=1)(concat_y) merge_z = Concatenate(axis=1)(concat_z) merge_xyz
= Concatenate(axis=1)([merge_x, merge_y, merge_z]) x = Dropout(self.keep_prob)(merge_xyz) # finally the softmax dense_layer = Dense(self.label, activation=self.activation)(x) output_layers = [dense_layer] self.model = Model(bert_inputs, output_layers) def compile_model(self): self.model.compile(optimizer=args.optimizers, loss=categorical_crossentropy, metrics=args.metrics) def callback(self): c_b = [ ModelCheckpoint(args.path_save_model, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='min'), EarlyStopping(min_delta=1e-9, patience=4, mode='min') ] return c_b def fit(self, x_train, y_train, x_dev, y_dev): self.model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs, validation_data=(x_dev, y_dev), shuffle=True, callbacks=self.callback()) self.model.save(args.path_save_model) def load_model(self): print("BertTextCnnModel load_model start!") # logger.info("BertTextCnnModel load_model start!") self.model.load_weights(args.path_save_model) # logger.info("BertTextCnnModel load_model end!") print("BertTextCnnModel load_model end!") def process_pair(self, textss): # text preprocessing: takes a list of sentence pairs and returns ids / masks / type ids input_ids = [] input_masks = [] input_type_ids = [] for texts in textss: tokens_text = self.tokenizer.tokenize(texts[0]) logger.info('Tokens1: %s', tokens_text) tokens_text2 = self.tokenizer.tokenize(texts[1]) logger.info('Tokens2: %s', tokens_text2) input_id, input_type_id = self.tokenizer.encode( first=texts[0], second=texts[1], max_len=self.max_seq_len) input_mask = [0 if ids == 0 else 1 for ids in input_id] input_ids.append(input_id) input_type_ids.append(input_type_id) input_masks.append(input_mask) # convert the lists to numpy arrays input_ids = np.array(input_ids) input_masks = np.array(input_masks) input_type_ids = np.array(input_type_ids) logger.info("process ok!") return input_ids, input_masks, input_type_ids def predict(self, sen_1, sen_2): input_ids, input_masks, input_type_ids = self.process_pair( [[sen_1, sen_2]]) return self.model.predict([input_ids, input_masks], batch_size=1) def predict_list(self, questions): label_preds = [] for questions_pair in questions: input_ids, input_masks, input_type_ids = self.process_pair( [questions_pair]) label_pred = self.model.predict([input_ids, input_masks], batch_size=1) label_preds.append(label_pred[0]) return label_preds
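# A hypothetical driver for the BertTextCnnModel above. The hyper-parameter namespace `args`,
# config_name / ckpt_name / vocab_file and any training arrays are assumed to be defined elsewhere,
# exactly as the class expects; the question pair below is only illustrative.
if __name__ == '__main__':
    text_cnn = BertTextCnnModel()
    text_cnn.compile_model()
    # text_cnn.fit(x_train, y_train, x_dev, y_dev)  # x_* are [input_ids, input_masks] pairs
    # text_cnn.load_model()                         # or restore previously saved weights
    print(text_cnn.predict('how do I change my phone number', 'steps to update the phone number'))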
model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True) model.summary(line_length=120) token_dict = {} with codecs.open(dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() token_dict[token] = len(token_dict) token_dict_inv = {v: k for k, v in token_dict.items()} tokenizer = Tokenizer(token_dict) text = '数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科' tokens = tokenizer.tokenize(text) tokens[1] = tokens[2] = '[MASK]' print('Tokens:', tokens) indices = np.array([[token_dict[token] for token in tokens] + [0] * (512 - len(tokens))]) segments = np.array([[0] * len(tokens) + [0] * (512 - len(tokens))]) masks = np.array([[0, 1, 1] + [0] * (512 - 3)]) predicts = model.predict([indices, segments, masks])[0].argmax(axis=-1).tolist() print('Fill with: ', list(map(lambda x: token_dict_inv[x], predicts[0][1:3]))) sentence_1 = '数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科。' sentence_2 = '从某种角度看屬於形式科學的一種。' print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
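# A sketch of the next-sentence-prediction head of the same training=True model, following the
# keras_bert demo: no positions are masked this time, and the model's second output gives the
# prediction for the sentence pair tokenized above. Variable names reuse the snippet's.
tokens = tokenizer.tokenize(first=sentence_1, second=sentence_2)
first_len = len(tokenizer.tokenize(sentence_1))  # [CLS] ... [SEP] of the first sentence
indices = np.array([[token_dict[token] for token in tokens] + [0] * (512 - len(tokens))])
segments = np.array([[0] * first_len + [1] * (len(tokens) - first_len) + [0] * (512 - len(tokens))])
masks = np.array([[0] * 512])  # no masked-LM positions
nsp = model.predict([indices, segments, masks])[1]
print('%s is random next: ' % sentence_2, bool(np.argmax(nsp, axis=-1)[0]))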
class Preprocess: def __init__(self, DATASET, DOMAIN, PAIRS, COLAB, PREPROCESSING): self.MAX_NB_WORDS = 20000 self.VALIDATION_SPLIT = 0.9 self.COLAB = COLAB self.PREPROCESSING = PREPROCESSING self.DIR = '{}data/processed'.format(COLAB) # where will be exported self.DATASET=DATASET self.DOMAIN=DOMAIN self.PAIRS = PAIRS self.nlp = spacy.load('en_core_web_lg') self.bugs = {} self.bugs_saved = [] self.TRAIN_PATH = 'train_chronological' self.TEST_PATH = 'test_chronological' self.MAX_SEQUENCE_LENGTH_T = 50 self.MAX_SEQUENCE_LENGTH_D = 150 self.start() self.tokenizer_init() self.improve_ner(self.nlp) def tokenizer_init(self): pretrained_path = 'uncased_L-12_H-768_A-12' config_path = os.path.join(pretrained_path, 'bert_config.json') model_path = os.path.join(pretrained_path, 'bert_model.ckpt') vocab_path = os.path.join(pretrained_path, 'vocab.txt') token_dict = load_vocabulary(vocab_path) print("Total vocabulary loaded: {}".format(len(token_dict))) self.tokenizer = Tokenizer(token_dict) def start(self): self.ENTITY_ENUM = { '': 'unknown', 'PERSON': 'person', 'NORP': 'nationality', 'FAC': 'facility', 'ORG': 'organization', 'GPE': 'country', 'LOC': 'location', 'PRODUCT': 'product', 'EVENT': 'event', 'WORK_OF_ART': 'artwork', 'LANGUAGE': 'language', 'DATE': 'date', 'TIME': 'time', # 'PERCENT': 'percent', # 'MONEY': 'money', # 'QUANTITY': 'quantity', # 'ORDINAL': 'ordinal', # 'CARDINAL': 'cardinal', 'PERCENT': 'number', 'MONEY': 'number', 'QUANTITY': 'number', 'ORDINAL': 'number', 'CARDINAL': 'number', 'LAW': 'law' } # Keyboards self.keyboards = [u'ctrl', u'CTRL', u'CTRL\+TAB', u'ctrl\+tab', u'ESC', u'Esc', u'esc', u'crtl \+ space', u'CTRL \+ SPACE', u'CTRL + Space', u'CTRL\-C', u'CTRL\-V', u'ctrl\-c', u'ctrl\-v', u'Ctrl-z', u'Ctrl - z', u'CTRL-z', u'Ctrl+z', u'ctrl-z', u'ctrl+z', u'CTRL - z', u'Ctrl + z', u'CTRL+z', u'CTRL+Z', u'CTRL + Z', u'CTRL- Z'] for i in range(0, 13): # Ctrl+number self.keyboards.append(u'CTRL\+{}'.format(i)) self.keyboards.append(u'Ctrl\+{}'.format(i)) self.keyboards.append(u'ctrl\+{}'.format(i)) self.keyboards.append(u'CTRL \+ {}'.format(i)) self.keyboards.append(u'Ctrl \+ {}'.format(i)) self.keyboards.append(u'ctrl \+ {}'.format(i)) self.keyboards.append(u'CTRL\-{}'.format(i)) self.keyboards.append(u'Ctrl\-{}'.format(i)) self.keyboards.append(u'ctrl\-{}'.format(i)) # F+number self.keyboards.append(u'F{}'.format(i)) self.keyboards.append(u'f{}'.format(i)) def expand_contractions(self, text, contractions_dict): contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())), flags=re.IGNORECASE | re.DOTALL) re.compile('({})'.format('|'.join(contractions_dict.keys())), flags=re.IGNORECASE | re.DOTALL) def expand_match(contraction): match = contraction.group(0) first_char = match[0] expanded_contraction = contractions_dict.get(match) \ if contractions_dict.get(match) \ else contractions_dict.get(match.lower()) expanded_contraction = expanded_contraction return expanded_contraction expanded_text = contractions_pattern.sub(expand_match, text) expanded_text = re.sub("'", "", expanded_text) return expanded_text def save_buckets(self, buckets): with open(os.path.join(self.DIR, self.BASE + '_buckets.pkl'), 'wb') as f: pickle.dump(buckets, f) def read_pairs(self, df): bug_pairs = [] bucket_dups = [] bug_ids = set() buckets = self.create_bucket(df) self.save_buckets(buckets) # buckets for key in buckets: if len(buckets[key]) > 1: bucket_dups.append([key, list(buckets[key])]) bug_pairs, bug_ids = self.getting_pairs(bucket_dups) with open(os.path.join(self.DIR, 
'bug_pairs.txt'), 'w') as f: for pair in bug_pairs: f.write("{} {}\n".format(pair[0], pair[1])) bug_ids = sorted(bug_ids) with open(os.path.join(self.DIR, 'bug_ids.txt'), 'w') as f: for bug_id in bug_ids: f.write("%d\n" % bug_id) return bug_pairs, bug_ids def split_train_test(self, bug_pairs, VALIDATION_SPLIT): random.shuffle(bug_pairs) split_idx = int(len(bug_pairs) * VALIDATION_SPLIT) with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'w') as f: for pair in bug_pairs[:split_idx]: f.write("{} {}\n".format(pair[0], pair[1])) test_data = {} for pair in bug_pairs[split_idx:]: bug1 = int(pair[0]) bug2 = int(pair[1]) if bug1 not in test_data: test_data[bug1] = set() test_data[bug1].add(bug2) with open(os.path.join(self.DIR, '{}.txt'.format(self.TEST_PATH)), 'w') as f: for bug in test_data.keys(): f.write("{} {}\n".format(bug, ' '.join([str(x) for x in test_data[bug]]))) print('Train and test created') def func_name_tokenize(self, text): s = [] for i, c in enumerate(text): if c.isupper() and i > 0 and text[i-1].islower(): s.append(' ') s.append(c) return ''.join(s).strip() def improve_ner(self, nlp): # Dates dates = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] for year in range(2000, 2012): for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Oct', 'Nov', 'Dec']: for day in range(32): dates.append( u'{} {}, {}'.format(day, month, year)) # Steps steps = [] for i in range(15): steps.append(u'{}. '.format(i)) steps.append(u'({}) '.format(i)) steps.append(u'{}) '.format(i)) list_terms = [dates, (u'MacOS', u'MacOS X', u'MacOS x', u'Mac OS X', u'Redhat Linux', u'RedHat Enterprise', u'Linux', u'Windows XP', u'WindowsXP', u'Windows NT', u'Fedora Core', u'Red Hat'), steps ] list_labels = ['DATE', "OS", "STEP INDEX"] self.allow_ner = ['person', 'product', 'time', 'language', 'organization', 'number'] self.allow_ner += [ent.lower() for ent in list_labels] for terms, label in zip(list_terms, list_labels): entity_matcher = EntityMatcher(label, nlp, terms, label) nlp.add_pipe(entity_matcher, after='ner') def ner(self, text): corpus = self.nlp(text) ents, start_char, end_char = [], [], [] ents = [self.ENTITY_ENUM[row.label_] if row.label_ in self.ENTITY_ENUM else row.label_ for row in corpus.ents] starts_char = np.array([row.start_char for row in corpus.ents]) ends_char = np.array([row.end_char for row in corpus.ents]) for index, ent, start_pos, end_pos in zip(range(len(ents)), ents, starts_char, ends_char): if ent.lower() in self.allow_ner: replaced = " {} ".format(ent.lower()) text = text[:start_pos] + replaced + text[end_pos:] diff_replaced = len(replaced) - len(text[start_pos:end_pos]) if diff_replaced > 0: # push starts_char[index+1:] += diff_replaced ends_char[index+1:] += diff_replaced elif diff_replaced < 0: # pull starts_char[index+1:] -= (diff_replaced * -1) ends_char[index+1:] -= (diff_replaced * -1) return text def normalize_text(self, text): if self.PREPROCESSING == 'bert': text = " ".join(self.tokenizer.tokenize(str(text))) else: text = re.sub(r'[0-9]{1,} (min|minutes|minute|m)', 'x time', str(text)) # [0-9] min # Extension files #text = re.sub(r'(WAR|zip|ZIP|css)', 'extension file', text) # extension file #text = re.sub(r'.(zip|txt|java|js|html|php|pdf|exe|doc|jar|xml)', ' extension file', text) # extension file # Memory text = re.sub(r'kB', 'kb', text) # Keyboards text = re.sub(r'('+('|'.join(self.keyboards))+')', 'keyboard', text) # key board # Contraction text=self.expand_contractions(text, 
contractions_dict) # NER processing text = text[:100000] # limit of spacy lib text = self.ner(text) tokens = re.compile(r'[\W_]+', re.UNICODE).split(text) text = ' '.join([self.func_name_tokenize(token) for token in tokens]) # text = ' '.join(tokens) text = re.sub(r'\d+((\s\d+)+)?', ' ', text) text = [word.lower() for word in nltk.word_tokenize(text)] text = ' '.join([word for word in text]).encode('utf-8') return text def save_dict(self, set, filename): with open(filename, 'w') as f: for i, item in enumerate(set): f.write('%s\t%d\n' % (item, i)) def load_dict(self, filename): dict = {} with open(filename, 'r') as f: for line in f: tokens = line.split('\t') dict[tokens[0]] = tokens[1] return dict def normalized_data(self, bug_ids, df): print("Normalizing text...") products = set() bug_severities = set() priorities = set() versions = set() components = set() bug_statuses = set() text = [] normalized_bugs_json = [] print("Total:", df.shape[0]) res = self.paralelize_processing(df, self.processing_normalized_data, (self.normalize_text, )) for result in res: if self.BASE != 'firefox': products = products.union(result[0]) bug_severities = bug_severities.union(result[1]) priorities = priorities.union(result[2]) versions = versions.union(result[3]) components = components.union(result[4]) bug_statuses = bug_statuses.union(result[5]) text += result[6] normalized_bugs_json += result[7] print("Total of normalized: ", len(normalized_bugs_json)) print("Writing the normalized_bugs.json") with open(os.path.join(self.DIR, 'normalized_bugs.json'), 'w') as f: for row in tqdm(normalized_bugs_json): f.write(row) if self.BASE != 'firefox': self.save_dict(products, os.path.join(self.DIR, 'product.dic')) self.save_dict(bug_severities, os.path.join(self.DIR, 'bug_severity.dic')) self.save_dict(priorities, os.path.join(self.DIR, 'priority.dic')) self.save_dict(versions, os.path.join(self.DIR, 'version.dic')) self.save_dict(components, os.path.join(self.DIR, 'component.dic')) self.save_dict(bug_statuses, os.path.join(self.DIR, 'bug_status.dic')) return text def processing_normalized_data(self, df, normalize_text): products = set() bug_severities = set() priorities = set() versions = set() components = set() bug_statuses = set() text = [] normalized_bugs_json = [] with tqdm(total=df.shape[0]) as loop: for row in df.iterrows(): bug = row[1] if self.BASE != 'firefox': products.add(bug['product']) bug_severities.add(bug['bug_severity']) priorities.add(bug['priority']) versions.add(bug['version']) components.add(bug['component']) bug_statuses.add(bug['bug_status']) if 'description' not in bug or bug['description'] == '': bug['description'] = bug['title'] if 'title' not in bug or bug['title'] == '': bug['title'] = bug['description'] if self.PREPROCESSING == 'bert': description = normalize_text(bug['description']) bug['description_original'] = bug['description'] bug['description'] = description title = normalize_text(bug['title']) bug['title_original'] = bug['title'] bug['title'] = title else: bug['description'] = normalize_text(bug['description']) bug['title'] = normalize_text(bug['title']) normalized_bugs_json.append('{}\n'.format(bug.to_json())) text.append(bug['description']) text.append(bug['title']) loop.update(1) return [products, bug_severities, priorities, versions, components, bug_statuses, text, normalized_bugs_json] def build_vocabulary(self, train_text, MAX_NB_WORDS): word_freq = self.build_freq_dict(train_text) print('word vocabulary') word_vocab = self.save_vocab(word_freq, MAX_NB_WORDS, 
'word_vocab_bert.pkl') return word_vocab def build_freq_dict(self, train_text): print('building frequency dictionaries') word_freq = defaultdict(int) for text in tqdm(train_text): for word in text.split(): word_freq[word] += 1 return word_freq def save_vocab(self, freq_dict, vocab_size, filename): top_tokens = sorted(freq_dict.items(), key=lambda x: -x[1])[:vocab_size - 2] print('most common token is %s which appears %d times' % (top_tokens[0][0], top_tokens[0][1])) print('less common token is %s which appears %d times' % (top_tokens[-1][0], top_tokens[-1][1])) vocab = {} i = 2 # 0-index is for padding, 1-index is for UNKNOWN for j in range(len(top_tokens)): vocab[top_tokens[j][0]] = i i += 1 with open(os.path.join(self.DIR, filename), 'wb') as f: pickle.dump(vocab, f) return vocab def load_vocab(self, filename): with open(os.path.join(self.DIR, filename), 'rb') as f: return pickle.load(f) def dump_bugs(self, word_vocab, total): bug_dir = os.path.join(self.DIR, 'bugs') if not os.path.exists(bug_dir): os.mkdir(bug_dir) bugs = [] print("Reading the normalized_bugs.json ...") if self.BASE != 'firefox': product_dict = self.load_dict(os.path.join(self.DIR,'product.dic')) bug_severity_dict = self.load_dict(os.path.join(self.DIR,'bug_severity.dic')) priority_dict = self.load_dict(os.path.join(self.DIR,'priority.dic')) version_dict = self.load_dict(os.path.join(self.DIR,'version.dic')) component_dict = self.load_dict(os.path.join(self.DIR,'component.dic')) bug_status_dict = self.load_dict(os.path.join(self.DIR,'bug_status.dic')) with open(os.path.join(self.DIR, 'normalized_bugs.json'), 'r') as f: #loop = tqdm(f) with tqdm(total=total) as loop: for line in f: bug = json.loads(line) if self.BASE != 'firefox': bug['product'] = product_dict[bug['product']] bug['bug_severity'] = bug_severity_dict[bug['bug_severity']] bug['priority'] = priority_dict[bug['priority']] bug['version'] = version_dict[bug['version']] bug['component'] = component_dict[bug['component']] bug['bug_status'] = bug_status_dict[bug['bug_status']] bugs.append(bug) loop.update(1) return bugs def dump_vocabulary(self, bugs, word_vocab, bug_dir): UNK = 1 cont=0 total = len(bugs) print("Starting the dump ...") bugs_set = {} bugs_saved = [] for bug in tqdm(bugs): #bug = json.loads(line) #print(bug) cont+=1 if self.PREPROCESSING == 'bert': ids, segments = self.tokenizer.encode('' if bug['description_original'] == None else bug['description_original'], max_len=self.MAX_SEQUENCE_LENGTH_D) bug['description_token'] = ids bug['description_segment'] = segments ids, segments = self.tokenizer.encode('' if bug['title_original'] == None else bug['title_original'], max_len=self.MAX_SEQUENCE_LENGTH_T) bug['title_token'] = ids bug['title_segment'] = segments bug.pop('description_original') bug.pop('title_original') else: # BASELINE bug['description_token'] = [word_vocab.get(w.encode('utf-8'), UNK) for w in bug['description'].split()] if len(bug['title']) == 0: bug['title'] = bug['description'][:10] bug['title_token'] = [word_vocab.get(w.encode('utf-8'), UNK) for w in bug['title'].split()] # Save the bug processed bugs_set[bug['issue_id']] = bug with open(os.path.join(bug_dir, str(bug['issue_id']) + '.pkl'), 'wb') as f: pickle.dump(bug, f) bugs_saved.append(bug['issue_id']) return [bugs_set, bugs_saved] def paralelize_processing(self, bugs, callback, parameters): cpu = os.cpu_count() - 1 pool = Pool(processes=cpu) # start N worker processes works = [] n = len(bugs) // cpu n = 1 if n == 0 else n sliced = [] pos_end = n end = len(bugs) for i in range(cpu): 
pos_end = end if pos_end>=end else pos_end pos_end = end if (i+1) == cpu and pos_end < end else pos_end sliced.append(bugs[i*n:pos_end]) pos_end += n print("Slicing in {} workers".format(len(sliced))) for s in sliced: if len(s) > 0: config = list(parameters) config.insert(0, s) config = tuple(config) works.append(pool.apply_async(callback, config)) #dump_vocabulary(s, bug_dir) print("Executing the works...") res = [w.get() for w in works] return res def processing_dump(self, bugs, word_vocab, bugs_id, bugs_id_dataset): #clear_output() bug_dir = os.path.join(self.DIR, 'bugs') res = self.paralelize_processing(bugs, self.dump_vocabulary, (word_vocab, bug_dir, )) for result in res: bugs_set = result[0] bugs_saved = result[1] for bug in bugs_set: self.bugs[bug] = bugs_set[bug] self.bugs_saved += bugs_saved #self.dump_vocabulary(bugs, word_vocab, bug_dir) self.validing_bugs_id(bugs_id, bugs_id_dataset) print("All done!") def validing_bugs_id(self, bugs_id, bugs_id_dataset): print("Check if all bugs id regirested in the pairs exist in dataset") bugs_invalid = set(bugs_id) - set(bugs_id_dataset) bugs_id_dataset = set(bugs_id_dataset) - bugs_invalid bugs_id_dataset = sorted(bugs_id_dataset) with open(os.path.join(self.DIR, 'bug_ids.txt'), 'w') as f: for bug_id in bugs_id_dataset: f.write("%d\n" % bug_id) print("Bugs not present in dataset: ", list(bugs_invalid)) bug_pairs = [] with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'r') as f: for line in f: bug1, bug2 = line.strip().split() if bug1 not in bugs_invalid and bug2 not in bugs_invalid: bug_pairs.append([bug1, bug2]) with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'w') as f: for pairs in bug_pairs: f.write("{} {}\n".format(pairs[0], pairs[1])) def create_bucket(self, df): print("Creating the buckets...") buckets = {} G=nx.Graph() for row in tqdm(df.iterrows()): bug_id = row[1]['issue_id'] dup_id = row[1]['dup_id'] if dup_id == '[]': G.add_node(bug_id) else: G.add_edges_from([(int(bug_id), int(dup_id))]) for g in tqdm(nx.connected_components(G)): group = set(g) for bug in g: master = int(bug) query = df[df['issue_id'] == master] if query.shape[0] <= 0: group.remove(master) master = np.random.choice(list(group), 1) buckets[int(master)] = group return buckets def getting_pairs(self, array): res = [] bug_ids = set() for row in array: dup_bucket, dups = row bug_ids.add(dup_bucket) dups = list(dups) while len(dups) > 1: bucket = dups[0] bug_ids.add(bucket) dups.remove(bucket) for d in dups: bug_ids.add(d) res.append([bucket, d]) return res, bug_ids def run(self): # create 'dataset' directory bug_dir = os.path.join(self.DIR, self.DATASET) if not os.path.exists(bug_dir): os.mkdir(bug_dir) # create 'processing' directory bug_dir = os.path.join(bug_dir, self.PREPROCESSING) if not os.path.exists(bug_dir): os.mkdir(bug_dir) normalized = os.path.join('{}data/normalized'.format(self.COLAB), self.DATASET) self.BASE = self.DOMAIN self.DIR = bug_dir self.DOMAIN = os.path.join(normalized, self.DOMAIN) self.PAIRS = os.path.join(normalized, self.PAIRS) # Train df_train = pd.read_csv('{}.csv'.format(self.DOMAIN)) if self.BASE != 'firefox': df_train.columns = ['issue_id','bug_severity','bug_status','component', 'creation_ts','delta_ts','description','dup_id','priority', 'product','resolution','title','version'] else: df_train.columns = ['issue_id','priority','component','dup_id','title', 'description','bug_status','resolution','version', 'creation_ts', 'delta_ts'] ### Pairs #df_train_pair = 
pd.read_csv('{}.csv'.format(self.PAIRS)) bug_pairs, bug_ids = self.read_pairs(df_train) bugs_id_dataset = df_train['issue_id'].values print("Number of bugs: {}".format(len(bug_ids))) print("Number of pairs: {}".format(len(bug_pairs))) # Split into train/test self.split_train_test(bug_pairs, self.VALIDATION_SPLIT) # Debug # test = [14785, 24843, 32367, 33529] # df_train = df_train[df_train['issue_id'].isin(test)] # Normalize the text text = self.normalized_data(bug_ids, df_train) # Build the vocab word_vocab = self.build_vocabulary(text, self.MAX_NB_WORDS) # Dump the preprocessed bugs num_lines = len(open(os.path.join(self.DIR, 'normalized_bugs.json'), 'r').read().splitlines()) * 2 total = num_lines // 2 bugs = self.dump_bugs(word_vocab, total) self.processing_dump(bugs, word_vocab, bug_ids, bugs_id_dataset) print("Saved!")
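# A hypothetical invocation of the Preprocess pipeline above. The dataset, domain and pair-file
# names and the COLAB path prefix are placeholders; the real values depend on how the normalized
# data directories are laid out. PREPROCESSING='bert' selects the keras_bert tokenizer branch.
if __name__ == '__main__':
    preprocess = Preprocess(DATASET='eclipse', DOMAIN='eclipse', PAIRS='eclipse_pairs',
                            COLAB='', PREPROCESSING='bert')
    preprocess.run()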