Example #1
    def test_uncased(self):
        tokens = [
            '[PAD]', '[UNK]', '[CLS]', '[SEP]', 'want', '##want',
            '##ed', 'wa', 'un', 'runn', '##ing', ',',
            '\u535A', '\u63A8',
        ]
        token_dict = {token: i for i, token in enumerate(tokens)}
        tokenizer = Tokenizer(token_dict)
        text = u"UNwant\u00E9d, running  \nah\u535A\u63A8zzz\u00AD"
        tokens = tokenizer.tokenize(text)
        expected = [
            '[CLS]', 'un', '##want', '##ed', ',', 'runn', '##ing',
            'a', '##h', '\u535A', '\u63A8', 'z', '##z', '##z',
            '[SEP]',
        ]
        self.assertEqual(expected, tokens)
        indices, segments = tokenizer.encode(text)
        expected = [2, 8, 5, 6, 11, 9, 10, 1, 1, 12, 13, 1, 1, 1, 3]
        self.assertEqual(expected, indices)
        expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.assertEqual(expected, segments)

        decoded = tokenizer.decode(indices)
        expected = [
            'un', '##want', '##ed', ',', 'runn', '##ing',
            '[UNK]', '[UNK]', '\u535A', '\u63A8', '[UNK]', '[UNK]', '[UNK]',
        ]
        self.assertEqual(expected, decoded)
Example #2
 def test_empty(self):
     tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
     token_dict = {token: i for i, token in enumerate(tokens)}
     tokenizer = Tokenizer(token_dict)
     text = u''
     self.assertEqual(['[CLS]', '[SEP]'], tokenizer.tokenize(text))
     indices, segments = tokenizer.encode(text)
     self.assertEqual([2, 3], indices)
     self.assertEqual([0, 0], segments)
Example #3
def bert_sen_token(token_dict, traininstance, maxlen):
    tokenizer = Tokenizer(token_dict)
    train_indices = []
    train_segments = []
    train_text = []
    for text in traininstance:
        tokens = tokenizer.tokenize(text)
        indices, segments = tokenizer.encode(first=text, max_len=maxlen)
        train_indices.append(indices)
        train_segments.append(segments)
        train_text.append(tokens)

    return train_indices, train_segments, train_text
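
A minimal usage sketch for bert_sen_token with a hypothetical toy vocabulary (in practice token_dict would be built from the checkpoint's vocab.txt, e.g. with keras_bert.load_vocabulary):

# Hypothetical example, not from the original source.
toy_dict = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3,
            'un': 4, '##aff': 5, '##able': 6}
indices, segments, tokens = bert_sen_token(toy_dict, ['unaffable'], maxlen=8)
# tokens[0]   -> ['[CLS]', 'un', '##aff', '##able', '[SEP]']
# indices[0]  -> [2, 4, 5, 6, 3, 0, 0, 0]   (padded to maxlen with pad index 0)
# segments[0] -> [0, 0, 0, 0, 0, 0, 0, 0]   (single-sentence input)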
Example #4
 def test_cased(self):
     tokens = [
         '[UNK]', u'[CLS]', '[SEP]', 'want', '##want',
         u'##\u00E9d', 'wa', 'UN', 'runn', '##ing', ',',
     ]
     token_dict = {token: i for i, token in enumerate(tokens)}
     tokenizer = Tokenizer(token_dict, cased=True)
     text = u"UNwant\u00E9d, running"
     tokens = tokenizer.tokenize(text)
     expected = ['[CLS]', 'UN', '##want', u'##\u00E9d', ',', 'runn', '##ing', '[SEP]']
     self.assertEqual(expected, tokens)
     indices, segments = tokenizer.encode(text)
     expected = [1, 7, 4, 5, 10, 8, 9, 2]
     self.assertEqual(expected, indices)
     expected = [0, 0, 0, 0, 0, 0, 0, 0]
     self.assertEqual(expected, segments)
Example #5
def tokenize(char_seqs, vocab, cased):
    from keras_bert import Tokenizer, TOKEN_CLS, TOKEN_SEP
    tokenizer = Tokenizer(vocab, cased=cased)

    token_seqs = []
    orig2token_maps = []
    for char_seq in char_seqs:
        orig2token_map = [0]
        token_seq = [TOKEN_CLS]
        for c in char_seq:
            orig2token_map.append(len(token_seq))
            tokens = tokenizer.tokenize(c)
            tokens = tokens[1:-1]
            token_seq.extend(tokens)
        orig2token_map.append(len(token_seq))
        token_seq.append(TOKEN_SEP)
        orig2token_maps.append(orig2token_map)
        token_seqs.append(token_seq)

    return token_seqs, orig2token_maps
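
A usage sketch for the alignment helper above (the tiny vocabulary and input are illustrative assumptions): orig2token_map[i+1] points at the wordpiece position where the i-th original character starts, with a leading entry for [CLS] and a trailing entry for [SEP].

# Illustrative only; a real vocab would come from the BERT checkpoint.
vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, 'a': 4, 'b': 5}
token_seqs, orig2token_maps = tokenize([['a', 'b']], vocab, cased=False)
# token_seqs[0]      -> ['[CLS]', 'a', 'b', '[SEP]']
# orig2token_maps[0] -> [0, 1, 2, 3]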
Example #6
class SearchBERT():
    def __init__(self, docs, vec):
        self.texts = np.array(docs)
        self.vec = vec
        paths = get_checkpoint_paths(".")
        inputs = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            seq_len=50)
        outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
        self.model = Model(inputs=inputs.inputs, outputs=outputs)
        self.vocab = load_vocabulary(paths.vocab)
        self.tokenizer = Tokenizer(self.vocab)

    def search(self, query, n=5):
        tokens = self.tokenizer.tokenize(" ".join(lemmatize(query)))[:50]
        indices = [self.vocab[token] for token in tokens] + \
            [0 for i in range(50 - len(tokens))]
        segments = [0 for i in range(50)]
        query_vec = self.model.predict(
            [np.array([indices]), np.array([segments])])[0]
        result = np.matmul(self.vec, query_vec)
        idxs = np.argsort(result)[::-1].tolist()[:n]
        return list(zip(self.texts[idxs], result[idxs]))
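
The vec argument is assumed to hold one pooled BERT vector per document, produced with the same pooled model; below is a hedged sketch of a helper that mirrors search()'s preprocessing to build it (the helper name and the omission of lemmatize are assumptions):

# Hypothetical helper for preparing `vec` before constructing SearchBERT(docs, vec).
def encode_texts(texts, tokenizer, vocab, model, seq_len=50):
    vectors = []
    for text in texts:
        tokens = tokenizer.tokenize(text)[:seq_len]
        indices = [vocab[t] for t in tokens] + [0] * (seq_len - len(tokens))
        segments = [0] * seq_len
        vectors.append(model.predict([np.array([indices]), np.array([segments])])[0])
    return np.array(vectors)  # shape: (len(texts), hidden_size), e.g. 768 for BERT-base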
Example #7
class BertNerBiLstmModel():
    def __init__(self):
        # logger.info("BertBiLstmModel init start!")
        print("BertNerBiLstmModel init start!")
        self.dict_path, self.max_seq_len, self.keep_prob, self.is_training = vocab_file, args.max_seq_len, args.keep_prob, args.is_training
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
        # you can choose which model to build: bi-lstm single, bi-lstm 3-layers, or bi-lstm_attention
        self.build_model_bilstm_layers()
        self.compile_model()
        # self.build_model_bilstm_single()
        # logger.info("BertBiLstmModel init end!")
        print("BertNerBiLstmModel init end!")

    def process_single(self, texts):
        # text preprocessing: takes a list of texts, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            if type(text) is list:
                text = "".join(text)
            logger.info(text)
            tokens_text = self.tokenizer.tokenize(text)
            logger.info('Tokens: %s', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]

    def process_pair(self, textss):
        # text preprocessing: takes a list of text pairs, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for texts in textss:
            tokens_text = self.tokenizer.tokenize(texts[0])
            logger.info('Tokens1: %s', tokens_text)
            tokens_text2 = self.tokenizer.tokenize(texts[1])
            logger.info('Tokens2: %s', tokens_text2)
            input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]

    def build_model_bilstm_layers(self):
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()

        # Bi-LSTM
        x = Bidirectional(layer_cell(units=args.units,
                                     return_sequences=args.return_sequences,
                                     ))(bert_output)
        # final layers
        x = TimeDistributed(Dropout(self.keep_prob))(x)
        dense_layer = Dense(args.max_seq_len, activation=args.activation)(x)
        crf = CRF(args.label, sparse_target=False, learn_mode="join", test_mode='viterbi')
        output_layers = crf(dense_layer)
        self.model = Model(bert_inputs, output_layers)
        self.model.summary(132)

    def compile_model(self):
        self.model.compile(
            optimizer=Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=args.epsilon, decay=0.0),
            loss=crf_loss if args.use_crf else sparse_categorical_crossentropy,
            metrics=[crf_accuracy] if args.metrics == 'crf_loss' else args.metrics)
        # loss=CRF.loss_function if args.use_crf else categorical_crossentropy,
        # metrics=[CRF.accuracy] if args.metrics is 'crf_loss' else args.metrics)
        # loss=crf.loss if args.use_crf else categorical_crossentropy,
        # metrics=[crf.accuracy] if args.metrics is 'crf_loss' else args.metrics)

    def callback(self):
        cb = [ModelCheckpoint(monitor='val_loss', mode='min', filepath=args.path_save_model, verbose=1, save_best_only=True, save_weights_only=False),
              ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.2, patience=2, verbose=0, epsilon=1e-6, cooldown=4, min_lr=1e-8),
              EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=2)
              ]
        return cb

    def fit(self, x_train, y_train, x_dev, y_dev):
        self.model.fit(x_train, y_train, batch_size=args.batch_size,
                       epochs=args.epochs, validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        self.model.save(args.path_save_model)

    def load_model(self):
        print("BertNerBiLstmModel load_model start!")
        # logger.info("BertBiLstmModel load_model start!")
        self.model.load_weights(args.path_save_model)
        # logger.info("BertBiLstmModel load_model end+!")
        print("BertNerBiLstmModel load_model end+!")

    def predict(self, sen):
        input_ids, input_masks, input_type_ids = self.process_single([sen])
        probs = self.model.predict([input_ids, input_masks], batch_size=1)
        probs_first = probs[0]
        preds = []
        for prob_one in probs_first:
            prob_max = np.argmax(prob_one)
            preds.append(prob_max)
        return preds

    def predict_list(self, questions):
        label_preds = []
        for questions_pair in questions:
            input_ids, input_masks, input_type_ids = self.process_single([questions_pair])
            label_pred = self.model.predict([input_ids, input_masks], batch_size=1)
            label_preds.append(label_pred)
        return label_preds
Example #8
# build the token dictionary
token_dict = load_vocabulary(vocab_path)
print(token_dict)
print(len(token_dict))

#Tokenization
tokenizer = Tokenizer(token_dict)
print(tokenizer)

# load the pretrained model
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
print(model)

#------------------------------- Step 2: Feature Extraction ---------------------------------
text = '语言模型'
tokens = tokenizer.tokenize(text)
print(tokens)
#['[CLS]', '语', '言', '模', '型', '[SEP]']

indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
print(segments[:10])

# extract features
predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
print("")

#---------------------------- Step 3: Multi-sentence Feature Extraction ------------------------------
text1 = '语言模型'
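
The snippet breaks off here; a hedged sketch of how this step typically continues, encoding a sentence pair with first/second so that the segment ids distinguish the two sentences (text2 and the variable names are illustrative assumptions, not from the original):

text2 = '能打多少分'
indices_pair, segments_pair = tokenizer.encode(first=text1, second=text2, max_len=512)
# segments_pair is 0 over text1's positions and 1 over text2's positions
predicts_pair = model.predict([np.array([indices_pair]), np.array([segments_pair])])[0]
for i, token in enumerate(tokenizer.tokenize(first=text1, second=text2)):
    print(token, predicts_pair[i].tolist()[:5])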
Example #9
class PassageTagger(object):
    def __init__(self, params):
        self.params = params
        self.input_size = 768
        self.tagger = None
        self.maxclauselen = None
        self.maxseqlen = None
        pretrained_path = self.params["repfile"]
        config_path = os.path.join(pretrained_path, 'bert_config.json')
        checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
        vocab_path = os.path.join(pretrained_path, 'vocab.txt')

        self.bert = load_trained_model_from_checkpoint(config_path,
                                                       checkpoint_path)
        # Crucial step, otherwise TF will give an error.
        self.bert._make_predict_function()

        token_dict = {}
        with codecs.open(vocab_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        self.tokenizer = Tokenizer(token_dict)

    def make_data(self,
                  trainfilename,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        use_attention = self.params["use_attention"]
        batch_size = self.params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if self.maxseqlen is None:
            if maxseqlen:
                self.maxseqlen = maxseqlen
            elif self.params["maxseqlen"] is not None:
                self.maxseqlen = self.params["maxseqlen"]
            else:
                self.maxseqlen = max(seq_lengths)
        if self.maxclauselen is None:
            if maxclauselen:
                self.maxclauselen = maxclauselen
            elif self.params["maxclauselen"] is not None:
                self.maxclauselen = self.params["maxclauselen"]
            elif use_attention:
                sentence_lens = []
                for str_seq in str_seqs:
                    for seq in str_seq:
                        tokens = self.tokenizer.tokenize(seq.lower())
                        sentence_lens.append(len(tokens))
                self.maxclauselen = np.round(
                    np.mean(sentence_lens) +
                    3 * np.std(sentence_lens)).astype(int)

        if len(self.label_ind) <= 1:
            for str_seq, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        # Add new labels with values 0,1,2,....
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        discourse_generator = BertDiscourseGenerator(self.bert, self.tokenizer,
                                                     str_seqs, label_seqs,
                                                     self.label_ind,
                                                     batch_size, use_attention,
                                                     self.maxseqlen,
                                                     self.maxclauselen, train)
        return seq_lengths, discourse_generator  # One-hot representation of labels

    def predict(self, discourse_generator, test_seq_lengths=None, tagger=None):
        if not tagger:
            tagger = self.tagger
        if test_seq_lengths is None:
            assert (False)
        else:
            x_lens = test_seq_lengths
        pred_probs = tagger.predict_generator(discourse_generator)
        pred_inds = np.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            pred_label_seq = [self.rev_label_ind[pred]
                              for pred in pred_ind][-x_len:]
            # If the following number is positive, it means we ignored some clauses in the test passage to make it the same length as the ones we trained on.
            num_ignored_clauses = max(0, x_len - len(pred_label_seq))
            # Make labels for those if needed.
            if num_ignored_clauses > 0:
                warnings.warn(
                    "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none."
                    % num_ignored_clauses)
                ignored_clause_labels = ["none"] * num_ignored_clauses
                pred_label_seq = ignored_clause_labels + pred_label_seq
            pred_label_seqs.append(pred_label_seq)
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, train_generator, validation_generator, reg=0):
        use_attention = self.params["use_attention"]
        att_context = self.params["att_context"]
        lstm = self.params["lstm"]
        bidirectional = self.params["bidirectional"]
        crf = self.params["crf"]
        embedding_dropout = self.params["embedding_dropout"]
        high_dense_dropout = self.params["high_dense_dropout"]
        attention_dropout = self.params["attention_dropout"]
        lstm_dropout = self.params["lstm_dropout"]
        word_proj_dim = self.params["word_proj_dim"]
        lr = self.params["lr"]
        epoch = self.params["epoch"]
        batch_size = self.params["batch_size"]
        hard_k = self.params["hard_k"]
        att_proj_dim = self.params["att_proj_dim"]
        rec_hid_dim = self.params["rec_hid_dim"]
        lstm_dim = self.params["lstm_dim"]
        validation_split = self.params["validation_split"]
        early_stopping = EarlyStopping(patience=2)
        num_classes = len(self.label_ind)

        # Load discourse tagger
        model_config_file = open(
            "scidt_scibert/model_att=True_cont=LSTM_clause_lstm=False_bi=True_crf=True_config.json",
            "r")
        model_weights_file_name = "scidt_scibert/model_att=True_cont=LSTM_clause_lstm=False_bi=True_crf=True_weights"
        cached_tagger = model_from_json(model_config_file.read(),
                                        custom_objects={
                                            "TensorAttention": TensorAttention,
                                            "HigherOrderTimeDistributedDense":
                                            HigherOrderTimeDistributedDense,
                                            "CRF": CRF
                                        })
        cached_tagger.load_weights(model_weights_file_name)

        for l in cached_tagger.layers:
            l.trainable = True
        inputs = cached_tagger.input
        x = cached_tagger.layers[-2].output

        if crf:
            Crf = CRF(num_classes, learn_mode="join")
            discourse_prediction = Crf(x)
            tagger = Model(inputs=inputs, outputs=[discourse_prediction])
        else:
            discourse_prediction = TimeDistributed(Dense(num_classes,
                                                         activation='softmax'),
                                                   name='discourse')(x)
            tagger = Model(inputs=inputs, outputs=[discourse_prediction])

        def step_decay(current_epoch):
            initial_lrate = lr
            drop = 0.5
            epochs_drop = epoch / 2
            lrate = initial_lrate * np.power(
                drop, np.floor((1 + current_epoch) / epochs_drop))
            return lrate

        lr_fractions = [1]
        decay = 0

        adam = Adam(lr=lr, decay=decay)
        if crf:
            #rmsprop = RMSprop(lr=lr,decay = decay)
            tagger.compile(optimizer=adam,
                           loss=Crf.loss_function,
                           metrics=[Crf.accuracy])
        else:
            tagger.compile(loss='categorical_crossentropy',
                           optimizer=adam,
                           metrics=['accuracy'])
        tagger.summary()
        tagger.fit_generator(train_generator,
                             validation_data=validation_generator,
                             epochs=epoch,
                             callbacks=[early_stopping],
                             verbose=2)

        #for l in cached_tagger.layers:
        #    l.trainable = True

        #if crf:
        #    #rmsprop = RMSprop(lr=lr,decay = decay)
        #    tagger.compile(optimizer=adam, loss=Crf.loss_function, metrics=[Crf.accuracy])
        #else:
        #    tagger.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        #tagger.summary()
        #tagger.fit_generator(train_generator, validation_data=validation_generator, epochs=epoch, callbacks=[early_stopping], verbose=2)
        return tagger

    def train(self, train_generator, validation_generator):
        save = self.params["save"]

        f_mean, f_std, original_f_mean, original_f_std = 0, 0, 0, 0

        self.tagger = self.fit_model(train_generator, validation_generator)
        if save:
            model_ext = "att=%s_cont=%s_lstm=%s_bi=%s_crf=%s" % (
                str(self.params["use_attention"]), self.params["att_context"],
                str(self.params["lstm"]), str(
                    self.params["bidirectional"]), str(self.params["crf"]))
            model_config_file = open("model_%s_config.json" % model_ext, "w")
            model_weights_file_name = "model_%s_weights" % model_ext
            model_label_ind = "model_%s_label_ind.json" % model_ext
            print(self.tagger.to_json(), file=model_config_file)
            self.tagger.save_weights(model_weights_file_name, overwrite=True)
            json.dump(self.label_ind, open(model_label_ind, "w"))
        return f_mean, f_std, original_f_mean, original_f_std
Example #10
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_model

bert_model = load_trained_model_from_checkpoint(config_path,
                                                checkpoint_path,
                                                seq_len=None)
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None, ))
x2_in = Input(shape=(None, ))

x = bert_model([x1_in, x2_in])
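
# A hedged continuation sketch (not part of the original snippet): one common way to
# finish this graph is to take the [CLS] vector and attach a small classification head.
# The layer choices and learning rate below are assumptions for illustration only.
x_cls = Lambda(lambda t: t[:, 0])(x)           # [CLS] token representation
p = Dense(1, activation='sigmoid')(x_cls)      # e.g. binary classification

clf_model = Model([x1_in, x2_in], p)
clf_model.compile(loss='binary_crossentropy',
                  optimizer=Adam(1e-5),        # small lr when fine-tuning BERT
                  metrics=['accuracy'])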

# Tokenization
from keras_bert import Tokenizer

tokenizer = Tokenizer(token_dict)
# text = '语言模型 chinese is great'
# text='商品名称及规格型号'
# text='境外收货人\nDERCOCHILEREPUESTOSS.A.'
# text='合同协议号\n2019CICSA473-A'
text = '运抵国(地区)\n智利'
tokens = tokenizer.tokenize(text)
# e.g. tokenize('语言模型') -> ['[CLS]', '语', '言', '模', '型', '[SEP]']
print('tokens', tokens)
indices, segments = tokenizer.encode(first=text, max_len=512)
print(indices[:10])
Example #11
class Embeddings(object):
    def __init__(self,
                 name,
                 path='./embedding-registry.json',
                 lang='en',
                 extension='vec',
                 use_ELMo=False,
                 use_BERT=False,
                 use_cache=True,
                 load=True):
        self.name = name
        self.embed_size = 0
        self.static_embed_size = 0
        self.vocab_size = 0
        self.model = {}
        self.registry = self._load_embedding_registry(path)
        self.lang = lang
        self.extension = extension
        self.embedding_lmdb_path = None
        if self.registry is not None:
            self.embedding_lmdb_path = self.registry["embedding-lmdb-path"]
        self.env = None
        if load:
            self.make_embeddings_simple(name)
        self.static_embed_size = self.embed_size
        self.bilm = None

        self.use_cache = use_cache
        # below init for using ELMo embeddings
        self.use_ELMo = use_ELMo
        if use_ELMo:
            self.make_ELMo()
            self.embed_size = ELMo_embed_size + self.embed_size
            description = self.get_description('elmo-' + self.lang)
            self.env_ELMo = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_ELMo_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_ELMo_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                          map_size=map_size)

        # below init for using BERT embeddings (extracted features only, not fine tuning),
        # similar to ELMo for this usage
        self.use_BERT = use_BERT
        if use_BERT:
            # to avoid issue with tf graph and thread, we maintain in the class its own graph and session
            #self.session = tf.Session()
            self.graph = tf.get_default_graph()
            #self.session.run(tf.global_variables_initializer())
            self.make_BERT()
            self.embed_size = BERT_embed_size + self.embed_size
            description = self.get_description('bert-base-' + self.lang)
            self.env_BERT = None
            if description and description["cache-training"] and self.use_cache:
                self.embedding_BERT_cache = os.path.join(
                    description["path-cache"], "cache")
                # clean possible remaining cache
                self.clean_BERT_cache()
                # create and load a cache in write mode, it will be used only for training
                self.env_BERT = lmdb.open(self.embedding_BERT_cache,
                                          map_size=map_size)

    def __getattr__(self, name):
        return getattr(self.model, name)

    def _load_embedding_registry(self, path='./embedding-registry.json'):
        """
        Load the description of available embeddings. Each description provides a name,
        a file path (used only if necessary) and an embeddings type (to account for
        small variations in format).
        """
        registry_json = open(path).read()
        return json.loads(registry_json)

    def make_embeddings_simple_in_memory(self, name="fasttext-crawl"):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self.get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = fastText.load_model(embeddings_path)
                nbWords = len(self.model.get_words())
                self.embed_size = self.model.get_dimension()
            else:
                with open(embeddings_path, encoding='utf8') as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            begin = False
                            nb_words, embed_size = _fetch_header_if_available(
                                line)

                            # we parse the header
                            if nb_words > 0 and embed_size > 0:
                                nbWords = nb_words
                                self.embed_size = embed_size
                                continue

                        word = line[0]
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
            print('embeddings loaded for', nbWords, "words and",
                  self.embed_size, "dimensions")

    def make_embeddings_lmdb(self, name="fasttext-crawl"):
        print(
            '\nCompiling embeddings... (this is done only once per embedding, at first usage)'
        )
        description = self.get_description(name)

        if description is None:
            print(
                '\nNo description found in embeddings registry for embeddings',
                name)
            return

        if description is not None:
            # the following method will possibly download the embedding file if not available locally
            embeddings_path = self.get_embedding_path(description)
            if embeddings_path is None:
                print('\nCould not locate a usable resource for embeddings',
                      name)
                return

            self.load_embeddings_from_file(embeddings_path)

        # cleaning possible downloaded embeddings
        self.clean_downloads()

    def load_embeddings_from_file(self, embeddings_path):
        begin = True
        nbWords = 0
        txn = self.env.begin(write=True)
        # batch_size = 1024
        i = 0
        nb_lines = 0

        # read number of lines first
        embedding_file = open_embedding_file(embeddings_path)
        if embedding_file is None:
            print("Error: could not open embeddings file", embeddings_path)
            return

        for line in embedding_file:
            nb_lines += 1
        embedding_file.close()

        embedding_file = open_embedding_file(embeddings_path)
        #with open(embeddings_path, encoding='utf8') as f:
        for line in tqdm(embedding_file, total=nb_lines):
            line = line.decode()
            line = line.split(' ')
            if begin:
                begin = False
                nb_words, embed_size = _fetch_header_if_available(line)

                if nb_words > 0 and embed_size > 0:
                    nbWords = nb_words
                    self.embed_size = embed_size
                    continue

            word = line[0]
            try:
                if line[len(line) - 1] == '\n':
                    vector = np.array(
                        [float(val) for val in line[1:len(line) - 1]],
                        dtype='float32')
                else:
                    vector = np.array(
                        [float(val) for val in line[1:len(line)]],
                        dtype='float32')

                #vector = np.array([float(val) for val in line[1:len(line)]], dtype='float32')
            except:
                print(len(line))
                print(line[1:len(line)])
            #else:
            #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
            if self.embed_size == 0:
                self.embed_size = len(vector)

            if len(word.encode(encoding='UTF-8')) < self.env.max_key_size():
                txn.put(word.encode(encoding='UTF-8'),
                        _serialize_pickle(vector))
                #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector))
                i += 1

            # commit batch
            # if i % batch_size == 0:
            #     txn.commit()
            #     txn = self.env.begin(write=True)

        embedding_file.close()

        #if i % batch_size != 0:
        txn.commit()
        if nbWords == 0:
            nbWords = i
        self.vocab_size = nbWords
        print('embeddings loaded for', nbWords, "words and", self.embed_size,
              "dimensions")

    def clean_downloads(self):
        # cleaning possible downloaded embeddings
        for filename in os.listdir(self.registry['embedding-download-path']):
            file_path = os.path.join(self.registry['embedding-download-path'],
                                     filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def make_embeddings_simple(self, name="fasttext-crawl"):
        description = self.get_description(name)
        if description is not None:
            self.extension = description["format"]

        if self.extension == "bin":
            if fasttext_support:
                print(
                    "embeddings are of .bin format, so they will be loaded in memory..."
                )
                self.make_embeddings_simple_in_memory(name)
            else:
                if not (sys.platform == 'linux' or sys.platform == 'darwin'):
                    raise ValueError(
                        'FastText .bin format not supported for your platform')
                else:
                    raise ValueError(
                        'Go to the documentation to get more information on how to install FastText .bin support'
                    )

        elif self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None":
            print(
                "embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory..."
            )
            self.make_embeddings_simple_in_memory(name)
        else:
            # if the path to the lmdb database files does not exist, we create it
            if not os.path.isdir(self.embedding_lmdb_path):
                # conservative check (likely very useless)
                if not os.path.exists(self.embedding_lmdb_path):
                    os.makedirs(self.embedding_lmdb_path)

            # check if the lmdb database exists
            envFilePath = os.path.join(self.embedding_lmdb_path, name)
            load_db = True
            if os.path.isdir(envFilePath):
                description = self.get_description(name)
                if description is not None:
                    self.lang = description["lang"]

                # open the database in read mode
                self.env = lmdb.open(envFilePath,
                                     readonly=True,
                                     max_readers=2048,
                                     max_spare_txns=4)
                if self.env:
                    # we need to set self.embed_size and self.vocab_size
                    with self.env.begin() as txn:
                        stats = txn.stat()
                        size = stats['entries']
                        self.vocab_size = size

                    with self.env.begin() as txn:
                        cursor = txn.cursor()
                        for key, value in cursor:
                            vector = _deserialize_pickle(value)
                            self.embed_size = vector.shape[0]
                            break
                        cursor.close()

                    if self.vocab_size > 100 and self.embed_size > 10:
                        # lmdb database exists and looks valid
                        load_db = False

                        # no idea why, but we need to close and reopen the environment to avoid
                        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
                        # when opening new transaction !
                        self.env.close()
                        self.env = lmdb.open(envFilePath,
                                             readonly=True,
                                             max_readers=2048,
                                             max_spare_txns=2)

            if load_db:
                # create and load the database in write mode
                self.env = lmdb.open(envFilePath, map_size=map_size)
                self.make_embeddings_lmdb(name)

    def make_ELMo(self):
        # Location of pretrained BiLM for the specified language
        # TBD check if ELMo language resources are present
        description = self.get_description('elmo-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            vocab_file = description["path-vocab"]
            options_file = description["path-config"]
            weight_file = description["path_weights"]

            print('init ELMo')

            # Create a Batcher to map text to character ids
            self.batcher = Batcher(vocab_file, 50)

            # Build the biLM graph.
            self.bilm = BidirectionalLanguageModel(self.lang, options_file,
                                                   weight_file)

            # Input placeholders to the biLM.
            self.character_ids = tf.placeholder('int32',
                                                shape=(None, None, 50))

            with tf.variable_scope(self.lang, reuse=tf.AUTO_REUSE):
                # the reuse=True scope reuses weights from the whole context
                self.embeddings_op = self.bilm(self.character_ids)
                self.elmo_input = weight_layers('input',
                                                self.embeddings_op,
                                                l2_coef=0.0)

    def make_BERT(self):
        # Location of BERT model
        description = self.get_description('bert-base-' + self.lang)
        if description is not None:
            self.lang = description["lang"]
            config_file = description["path-config"]
            weight_file = description["path-weights"]
            vocab_file = description["path-vocab"]

            print('init BERT')

            # load the pretrained model
            with self.graph.as_default():
                # there are different typical pooling strategies for getting BERT features:
                # - concatenation of 4 last layers (the one from the original BERT paper, BERT_embed_size is then 3072)
                # - last layer (BERT_embed_size is 768)
                # - average of 4 last layers (BERT_embed_size is 768)
                # - sum of the 4 last layers (BERT_embed_size is 768)
                self.bert_model = load_trained_model_from_checkpoint(
                    config_file, weight_file, output_layer_num=4)
                self.bert_model.summary(line_length=120)
                self.bert_model._make_predict_function()

            # init the tokenizer
            token_dict = {}
            with codecs.open(vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    token_dict[token] = len(token_dict)
            print('token_dict size:', len(token_dict))
            self.bert_tokenizer = Tokenizer(token_dict, cased=True)

    def get_sentence_vector_only_ELMo(self, token_list):
        """
            Return the ELMo embeddings only for a full sentence
        """

        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        # Create batches of data
        local_token_ids = self.batcher.batch_sentences(token_list)
        max_size_sentence = local_token_ids[0].shape[0]
        # check lmdb cache
        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is not None:
            return elmo_result

        with tf.Session() as sess:
            # oddly, CPU is faster than GPU here (1080Ti!)
            with tf.device("/cpu:0"):
                # It is necessary to initialize variables once before running inference
                sess.run(tf.global_variables_initializer())

                # Compute ELMo representations (2 times as a heavy warm-up)
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                elmo_result = sess.run(
                    self.elmo_input['weighted_op'],
                    feed_dict={self.character_ids: local_token_ids})
                #cache computation
                self.cache_ELMo_lmdb_vector(token_list, elmo_result)
        return elmo_result

    def get_sentence_vector_with_ELMo(self, token_list):
        """
            Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings 
            for a full sentence
        """
        if not self.use_ELMo:
            print(
                "Warning: ELMo embeddings requested but embeddings object wrongly initialised"
            )
            return

        #print("\ntoken_list:", token_list)
        local_token_ids = self.batcher.batch_sentences(token_list)
        #print("local_token_ids:", local_token_ids)
        max_size_sentence = local_token_ids[0].shape[0]

        elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        if elmo_result is None:
            with tf.Session() as sess:
                # oddly, CPU is faster than GPU here (1080Ti!)
                with tf.device("/cpu:0"):
                    # It is necessary to initialize variables once before running inference
                    sess.run(tf.global_variables_initializer())

                    # Compute ELMo representations (2 times as a heavy warm-up)
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    elmo_result = sess.run(
                        self.elmo_input['weighted_op'],
                        feed_dict={self.character_ids: local_token_ids})
                    #cache computation
                    self.cache_ELMo_lmdb_vector(token_list, elmo_result)

        concatenated_result = np.zeros(
            (len(token_list), max_size_sentence - 2, self.embed_size),
            dtype=np.float32)
        #concatenated_result = np.random.rand(elmo_result.shape[0], max_size_sentence-2, self.embed_size)
        for i in range(0, len(token_list)):
            for j in range(0, len(token_list[i])):
                #if is_int(token_list[i][j]) or is_float(token_list[i][j]):
                #dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32)
                #concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), )
                #else:
                concatenated_result[i][j] = np.concatenate(
                    (elmo_result[i][j], self.get_word_vector(
                        token_list[i][j]).astype('float32')), )
                #concatenated_result[i][j] = np.concatenate((self.get_word_vector(token_list[i][j]), elmo_result[i][j]), )
        return concatenated_result

    def get_sentence_vector_only_BERT(self, token_list):
        """
            Return the BERT extracted embeddings only for a full sentence
        """
        if not self.use_BERT:
            print(
                "Warning: BERT embeddings requested but embeddings object wrongly initialised"
            )
            return

        #print("local_token_ids:", local_token_ids)
        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        # retokenize with BERT tokenizer
        max_size = BERT_sentence_size
        max_size_sentence = 0
        new_token_list = []
        bert_results = np.zeros((len(token_list), max_size, BERT_embed_size),
                                dtype=np.float32)
        for i, sentence in enumerate(token_list):
            local_text = " ".join(sentence)
            local_tokens = self.bert_tokenizer.tokenize(local_text)

            bert_result = self.get_BERT_lmdb_vector(sentence)
            if bert_result is None:
                indices, segments = self.bert_tokenizer.encode(
                    local_text, max_len=max_size)
                with self.graph.as_default():
                    bert_result = self.bert_model.predict(
                        [np.array([indices]),
                         np.array([segments])])[0]
                    #cache computation
                    if bert_result is not None:
                        self.cache_BERT_lmdb_vector(sentence, bert_result)

            # Realign BERT tokenization with the provided tokenization. Normally BERT segmenter always
            # over-segment as compared to DeLFT segmenter.
            # There are two obvious possibilities to combine subtoken embeddings into token embeddings,
            # either take the embeddings of the last subtoken, of use the average vector of the subtokens.
            new_bert_result = np.zeros((max_size, BERT_embed_size),
                                       dtype=np.float32)
            token_tensor = []
            tid = 0
            buffer = ''
            #print(sentence)
            #print(local_tokens)
            for j, t in enumerate(local_tokens):
                if j >= max_size:
                    break
                if t == '[CLS]' or t == '[SEP]':
                    continue
                else:
                    if t.startswith('##'):
                        t = t[2:]
                    buffer += t
                    #print(buffer)
                    token_tensor.append(bert_result[j])
                    if buffer == sentence[tid]:
                        # average vector of the subtokens
                        new_bert_result[tid] = np.stack(token_tensor).mean(
                            axis=0)
                        # or last subtoken vector
                        #new_bert_result[tid] = token_tensor[-1]
                        token_tensor = []
                        buffer = ''
                        tid += 1
            bert_result = new_bert_result

            if bert_result is not None:
                bert_results[i] = bert_result

        # we need to squeeze the vector to max_size_token_list
        squeezed_bert_results = np.zeros(
            (len(token_list), max_size_token_list, BERT_embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            squeezed_bert_results[i] = bert_results[i][:max_size_token_list]

        return squeezed_bert_results

    def get_sentence_vector_with_BERT(self, token_list):
        """
            Return a concatenation of standard embeddings (e.g. Glove) and BERT extracted embeddings  
            for a full sentence
        """
        if not self.use_BERT:
            print(
                "Warning: BERT embeddings requested but embeddings object wrongly initialised"
            )
            return

        max_size_token_list = 0
        for i, sentence in enumerate(token_list):
            if len(sentence) > max_size_token_list:
                max_size_token_list = len(sentence)

        squeezed_bert_results = self.get_sentence_vector_only_BERT(token_list)

        concatenated_squeezed_result = np.zeros(
            (len(token_list), max_size_token_list, self.embed_size),
            dtype=np.float32)
        for i, sentence in enumerate(token_list):
            for j in range(0, len(token_list[i])):
                concatenated_squeezed_result[i][j] = np.concatenate(
                    (squeezed_bert_results[i][j],
                     self.get_word_vector(
                         token_list[i][j]).astype('float32')), )

        return concatenated_squeezed_result

    def get_description(self, name):
        for emb in self.registry["embeddings"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["embeddings-contextualized"]:
            if emb["name"] == name:
                return emb
        for emb in self.registry["transformers"]:
            if emb["name"] == name:
                return emb
        return None

    def get_word_vector(self, word):
        """
            Get static embeddings (e.g. glove) for a given token
        """
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.env is None or self.extension == 'bin':
            # db not available or embeddings in bin format, the embeddings should be available in memory (normally!)
            return self.get_word_vector_in_memory(word)
        try:
            with self.env.begin() as txn:
                vector = txn.get(word.encode(encoding='UTF-8'))
                if vector:
                    word_vector = _deserialize_pickle(vector)
                    vector = None
                else:
                    word_vector = np.zeros((self.static_embed_size, ),
                                           dtype=np.float32)
                    # alternatively, initialize with random negative values
                    #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
                    # alternatively use fasttext OOV ngram possibilities (if ngram available)
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env.close()
            envFilePath = os.path.join(self.embedding_lmdb_path, self.name)
            self.env = lmdb.open(envFilePath,
                                 readonly=True,
                                 max_readers=2048,
                                 max_spare_txns=2,
                                 lock=False)
            return self.get_word_vector(word)
        return word_vector

    def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
            Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros(
                (len(token_list), max_size_sentence - 2, ELMo_embed_size),
                dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    txn = self.env_ELMo.begin()
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence - 2:
                            # squeeze the extra padding space
                            ELMo_vector[
                                i] = local_embeddings[:max_size_sentence - 2, ]
                        elif local_embeddings.shape[
                                0] == max_size_sentence - 2:
                            # bingo~!
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros((max_size_sentence -
                                               (local_embeddings.shape[0] + 2),
                                               ELMo_embed_size),
                                              dtype='float32')
                            ELMo_vector[i] = np.concatenate(
                                (local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache,
                                      readonly=True,
                                      max_readers=2048,
                                      max_spare_txns=2,
                                      lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def get_BERT_lmdb_vector(self, sentence):
        """
            Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size),
                                   dtype='float32')
            with self.env_BERT.begin() as txn:
                txn = self.env_BERT.begin()
                # get a hash for the token_list
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))

                if vector:
                    # adapt expected shape/padding
                    BERT_vector = _deserialize_pickle(vector)
                    '''
                    if local_embeddings.shape[0] > max_size_sentence:
                        # squeeze the extra padding space
                        BERT_vector = local_embeddings[:max_size_sentence,]
                    elif local_embeddings.shape[0] == max_size_sentence:
                        # bingo~!
                        BERT_vector = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                        BERT_vector = np.concatenate((local_embeddings, filler))
                    '''
                    vector = None
                else:
                    return None

        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache,
                                      readonly=True,
                                      max_readers=2048,
                                      max_spare_txns=2,
                                      lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector

    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
            Cache in LMDB the ELMo embeddings for a given sequence 
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'),
                    _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
            Cache in LMDB the BERT embeddings for a given sequence 
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        #for i in range(0, len(sentence)):
        # get a hash for the token_list
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'),
                _serialize_pickle(BERT_vector))
        txn.commit()

    def clean_ELMo_cache(self):
        """
            Delete ELMo embeddings cache, this takes place normally after the completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_ELMo.close()
            self.env_ELMo = None
            for file in os.listdir(self.embedding_ELMo_cache):
                file_path = os.path.join(self.embedding_ELMo_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_ELMo_cache)

    def clean_BERT_cache(self):
        """
            Delete BERT embeddings cache, this takes place normally after the completion of a training
        """
        # if cache subdirectory does not exist, we create it
        if not os.path.exists(self.embedding_BERT_cache):
            os.makedirs(self.embedding_BERT_cache)
            return

        if self.env_BERT is None:
            # db cache not available, nothing to clean
            return
        else:
            self.env_BERT.close()
            self.env_BERT = None
            for file in os.listdir(self.embedding_BERT_cache):
                file_path = os.path.join(self.embedding_BERT_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_BERT_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_word_vector(word)
        if word in self.model:
            return self.model[word]
        else:
            # for unknown word, we use a vector filled with 0.0
            return np.zeros((self.static_embed_size, ), dtype=np.float32)
            # alternatively, initialize with random negative values
            #return np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,))
            # alternatively use fasttext OOV ngram possibilities (if ngram available)

    def get_embedding_path(self, description):
        embeddings_path = None
        if "path" in description:
            embeddings_path = description["path"]
        self.lang = description["lang"]

        if embeddings_path is None or not os.path.isfile(embeddings_path):
            print("error: embedding path for", description['name'],
                  "is not valid", embeddings_path)
            if "url" in description and len(description["url"]) > 0:
                url = description["url"]
                download_path = self.registry['embedding-download-path']
                # if the download path does not exist, we create it
                if not os.path.isdir(download_path):
                    try:
                        os.mkdir(download_path)
                    except OSError:
                        print("Creation of the download directory",
                              download_path, "failed")

                print("Downloading resource file for", description['name'],
                      "...")
                embeddings_path = download_file(url, download_path)
                if embeddings_path is not None and os.path.isfile(embeddings_path):
                    print("Download successful:", embeddings_path)
            else:
                print(
                    "no download url available for this embeddings resource, please review the embedding registry for",
                    description['name'])
        return embeddings_path
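
The cache_ELMo_lmdb_vector / cache_BERT_lmdb_vector methods above boil down to one pattern: pickle the embedding array and store it in LMDB under a hash of its token sequence, then read it back in a read-only transaction. A minimal, self-contained sketch of that pattern (the digest helper, map size and /tmp path are illustrative assumptions, not the project's own code):

import hashlib
import pickle

import lmdb
import numpy as np

def digest(tokens):
    # hypothetical stand-in for the project's list_digest()
    return hashlib.md5(' '.join(tokens).encode('utf-8')).hexdigest()

env = lmdb.open('/tmp/embedding_cache_demo', map_size=100 * 1024 * 1024)
tokens = ['a', 'simple', 'sentence']
vector = np.random.rand(len(tokens), 768).astype('float32')

with env.begin(write=True) as txn:                      # write transaction
    txn.put(digest(tokens).encode('utf-8'), pickle.dumps(vector))

with env.begin() as txn:                                # read-only transaction
    cached = txn.get(digest(tokens).encode('utf-8'))
    restored = pickle.loads(cached) if cached is not None else None

print(restored.shape)  # (3, 768)
env.close()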
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 23 15:35:48 2021
@author: xiuzhang
"""
from keras_bert import Tokenizer

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}

# build the word-piece tokenizer
tokenizer = Tokenizer(token_dict)
print(tokenizer.tokenize('unaffable'))

# encode the text into token indices and segment ids
indices, segments = tokenizer.encode('unaffable')
print(indices)  # index of each token
print(segments)  # segment ids: whether each position belongs to the first or the second sentence
print(tokenizer.tokenize(first='unaffable', second='钢'))

indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)
print(segments)
Example #13
class KerasBertVector():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # use globals so the model can be called from django, flask, tornado, etc.
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                        seq_len=self.max_seq_len)
        model.summary(120)
        # if only one layer is selected, take only that layer's output
        if len(layer_indexes) == 1:
            encoder_layer = model.get_layer(index=len(model.layers)-2).output
        # otherwise iterate over the requested layers and concatenate their outputs (shape: 768 * number of layers)
        else:
            # layer_indexes must be [1,2,3,......12]
            all_layers = [model.get_layer(index=lay).output for lay in layer_indexes]
            encoder_layer = k_keras.concatenate(all_layers, -1)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)

    def bert_encode(self, texts):
        # text preprocessing
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            print(text)
            tokens_text = self.tokenizer.tokenize(text)
            print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # use the global graph so the model can be called from django, flask, tornado, etc.
        with graph.as_default():
            predicts = model.predict([input_ids, input_type_ids], batch_size=1)
        print(predicts.shape)
        for i, token in enumerate(tokens_text):
            print(token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # masked mean pooling, following https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)
        pooled = masked_reduce_mean(predicts, input_masks)  # mean over the unmasked token positions
        pooled = pooled.tolist()
        print('bert:', pooled)
        return pooled
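
The pooling at the end of bert_encode above is a masked mean: token vectors at padded positions are zeroed out and only the remaining positions are averaged. A small self-contained NumPy sketch of the same computation on toy shapes:

import numpy as np

def masked_reduce_mean(x, m):
    # x: (batch, seq_len, hidden) token vectors, m: (batch, seq_len) attention mask
    masked = x * np.expand_dims(m, axis=-1)  # zero out padded positions
    return np.sum(masked, axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)

vectors = np.arange(24, dtype='float32').reshape(2, 4, 3)  # toy "BERT output"
mask = np.array([[1, 1, 0, 0], [1, 1, 1, 1]])              # 2 and 4 real tokens
print(masked_reduce_mean(vectors, mask))                   # averages only the unmasked positions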
Example #14
import numpy as np
from keras.utils import plot_model  # may be tensorflow.keras.utils.plot_model, depending on the Keras version
from keras_bert import Tokenizer, get_checkpoint_paths, load_trained_model_from_checkpoint, load_vocabulary
from keras_bert.datasets import get_pretrained, PretrainedList
model_path = get_pretrained(
    PretrainedList.chinese_base)  # download chinese pre-trained model

paths = get_checkpoint_paths(model_path)
model = load_trained_model_from_checkpoint(paths.config,
                                           paths.checkpoint,
                                           seq_len=10)
model.summary(line_length=120)
plot_model(model, to_file="keras_bert.png", show_shapes=True)  # does the loss determine the SEP token?

token_dict = load_vocabulary(paths.vocab)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)
print("indices:", indices)
print("segments:", segments)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])  # extract word embedding

# load and predict

model = load_trained_model_from_checkpoint(paths.config,
                                           paths.checkpoint,
                                           training=True,
                                           seq_len=None)
class BertBiLstmModel():
    def __init__(self):
        # logger.info("BertBiLstmModel init start!")
        print("BertBiLstmModel init start!")
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
        # choose one model build: bi-lstm single, bi-lstm 3-layers, or bi-lstm attention
        # self.build_model_bilstm_layers()
        # self.build_model_bilstm_single()
        self.build_model_bilstm_attention()
        # logger.info("BertBiLstmModel init end!")
        print("BertBiLstmModel init end!")

    def process_single(self, texts):
        # text preprocessing: takes a list of texts, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            logger.info(text)
            tokens_text = self.tokenizer.tokenize(text)
            logger.info('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return input_ids, input_masks, input_type_ids

    def process_pair(self, textss):
        # text preprocessing: takes a list of text pairs, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for texts in textss:
            tokens_text = self.tokenizer.tokenize(texts[0])
            logger.info('Tokens1:', tokens_text)
            tokens_text2 = self.tokenizer.tokenize(texts[1])
            logger.info('Tokens2:', tokens_text2)
            input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return input_ids, input_masks, input_type_ids

    def build_model_bilstm_layers(self):
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # bert_output = bert_output[:0:]
        # layer_get_cls = Lambda(lambda x: x[:, 0:1, :])
        # bert_output = layer_get_cls(bert_output)
        # print("layer_get_cls:")
        # print(bert_output.shape)
        # Bi-LSTM
        x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences,
                                     kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                                     recurrent_regularizer=regularizers.l2(args.l2)
                                     ))(bert_output)
        # blstm_layer = TimeDistributed(Dropout(args.keep_prob))(blstm_layer)  # doesn't work here, probably because the input has dims < 3
        x = Dropout(args.keep_prob)(x)

        x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences,
                                     kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                                     recurrent_regularizer=regularizers.l2(args.l2)))(x)
        x = Dropout(args.keep_prob)(x)
        x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences,
                                     kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                                     recurrent_regularizer=regularizers.l2(args.l2)))(x)
        x = Dropout(args.keep_prob)(x)

        # concatenate average pooling and max pooling
        avg_pool = GlobalAvgPool1D()(x)
        max_pool = GlobalMaxPool1D()(x)
        print(max_pool.shape)
        print(avg_pool.shape)
        concat = concatenate([avg_pool, max_pool])
        x = Dense(int(args.units / 4), activation="relu")(concat)
        x = Dropout(args.keep_prob)(x)

        # final softmax (dense) layer
        dense_layer = Dense(args.label, activation=args.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def build_model_bilstm_attention(self):
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # Bi-LSTM
        x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences,
                                     kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                                     recurrent_regularizer=regularizers.l2(args.l2)
                                     ))(bert_output)
        x = TimeDistributed(Dropout(args.keep_prob))(x)  # this may not work; probably because the input has dims < 3
        x = attention(x)
        x = Flatten()(x)
        x = Dropout(args.keep_prob)(x)

        # # concatenate average pooling and max pooling
        # avg_pool = GlobalAvgPool1D()(x)
        # max_pool = GlobalMaxPool1D()(x)
        # print(max_pool.shape)
        # print(avg_pool.shape)
        # concat = concatenate([avg_pool, max_pool])
        # x = Dense(int(args.units/4), activation="relu")(concat)
        # x = Dropout(args.keep_prob)(x)

        # final softmax (dense) layer
        dense_layer = Dense(args.label, activation=args.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def build_model_bilstm_single(self):
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # Bi-LSTM
        x = Bidirectional(layer_cell(units=args.units, return_sequences=args.return_sequences,
                                     kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                                     recurrent_regularizer=regularizers.l2(args.l2)
                                     ))(bert_output)
        x = Dropout(args.keep_prob)(x)

        # final softmax (dense) layer
        dense_layer = Dense(args.label, activation=args.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def compile_model(self):
        self.model.compile(optimizer=args.optimizers,
                           loss=categorical_crossentropy,
                           metrics=args.metrics)

    def callback(self):
        cb = [ModelCheckpoint(args.path_save_model, monitor='val_loss',
                              verbose=1, save_best_only=True, save_weights_only=False, mode='min'),
              EarlyStopping(min_delta=1e-8, patience=10, mode='min'),
              ReduceLROnPlateau(factor=0.2, patience=6, verbose=0, mode='min', epsilon=1e-6, cooldown=4, min_lr=1e-8)
              ]
        return cb

    def fit(self, x_train, y_train, x_dev, y_dev):
        self.model.fit(x_train, y_train, batch_size=args.batch_size,
                       epochs=args.epochs, validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        self.model.save(args.path_save_model)

    def load_model(self):
        print("BertBiLstmModel load_model start!")
        # logger.info("BertBiLstmModel load_model start!")
        self.model.load_weights(args.path_save_model)
        # logger.info("BertBiLstmModel load_model end+!")
        print("BertBiLstmModel load_model end+!")

    def predict(self, sen_1, sen_2):
        input_ids, input_masks, input_type_ids = self.process_pair([[sen_1, sen_2]])
        return self.model.predict([input_ids, input_masks], batch_size=1)

    def predict_list(self, questions):
        label_preds = []
        for questions_pair in questions:
            input_ids, input_masks, input_type_ids = self.process_pair([questions_pair])
            label_pred = self.model.predict([input_ids, input_masks], batch_size=1)
            label_preds.append(label_pred[0])
        return label_preds
Example #16
class KerasBertVector():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # keep the model global so it can be called from django, flask, tornado, etc.
        # global graph
        # graph = tf.compat.v1.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        # print(model.output)
        # print(len(model.layers))
        # lay = model.layers
        # 104 layers in total: the first 8 are the token / position / embedding layers,
        # after that every 4 layers form one block (MultiHeadAttention, Dropout, Add, LayerNormalization),
        # 24 blocks in all
        layer_dict = [7]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 4
            layer_dict.append(layer_0)
        # no layer index given: use the model output itself
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # if exactly one layer is selected, take only that layer's output
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(12)]:
                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-2]).output
        # otherwise iterate over the requested layers and combine their outputs (shape: 768 * number of layers)
        else:
            # layer_indexes must be [1,2,3,......12...24]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(12)]
                          else model.get_layer(index=layer_dict[-1]).output  # invalid index: fall back to the last layer
                          for lay in layer_indexes]
            # print(layer_indexes)
            # print(all_layers)
            # the output of layer 1 has the wrong format; the second layer's input is a list
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            # print(encoder_layer.shape)
        # print("KerasBertEmbedding:")
        # print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)

    def bert_encode_sen(self, texts):
        # text preprocessing
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            # print(text)
            tokens_text = self.tokenizer.tokenize(text)
            # print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # kept global so the model can be called from django, flask, tornado, etc.
        # with graph.as_default():
        predicts = model.predict([input_ids, input_type_ids], batch_size=1)
        # print(predicts.shape)
        # for i, token in enumerate(tokens_text):
        #     (token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # masked mean pooling, following https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)

        pools = []
        for i in range(len(predicts)):
            pred = predicts[i]
            masks = input_masks.tolist()
            mask_np = np.array([masks[i]])
            pooled = masked_reduce_mean(pred, mask_np)
            pooled = pooled.tolist()
            pools.append(pooled[0])
        # print('bert:', pools)
        return pools

    def bert_encode_word(self, texts):
        # text preprocessing
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            # print(text)
            tokens_text = self.tokenizer.tokenize(text)
            # print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # kept global so the model can be called from django, flask, tornado, etc.
        # with graph.as_default():
        predicts = model.predict([input_ids, input_type_ids], batch_size=1)
        # print(predicts.shape)
        # for i, token in enumerate(tokens_text):
        #     (token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # drop the [CLS] and [SEP] positions, keeping only the word vectors
        words_vec = predicts[0][1:len(tokens_text) - 1]
        words_vec = np.array(words_vec)
        words_vec = words_vec.astype(np.float32)
        ret = []
        for i in words_vec:
            ret.append(i)
        return ret

    def gen_sen_vec(self, sen):
        pooled = self.bert_encode_sen([sen])
        vec = pooled[0]
        vec = np.array(vec)
        return vec

    def gen_words_vec(self, sen):
        pooled = self.bert_encode_word([sen])
        vec = pooled[0]
        vec = np.array(vec)
        return vec
Example #17
    except:
        token_dict = {}
        with codecs.open('uncased_L-12_H-768_A-12/vocab.txt', 'r',
                         'utf8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        with open('bert_token_dict.pkl', 'wb') as f:
            pickle.dump(token_dict, f)

    # with codecs.open('uncased_L-12_H-768_A-12/vocab.txt', 'r', 'utf8') as reader:
    #     vocab = [line.strip() for line in reader]

    tokenizer = Tokenizer(token_dict)
    tokens = [
        tokenizer.tokenize(" ".join(sentence))
        for sentence in train_sentences + test_sentences
    ]
    maxlen = max([len(sentence) for sentence in tokens])
    for i, sentence in enumerate(tokens):
        while len(tokens[i]) < maxlen:
            tokens[i].append('[PAD]')

    # print(os.getcwd())
    # print(len(tokens[5]))
    print('maxlen_bert :', maxlen)
    # indices, segments = tokenizer.encode(first=' '.join(test_sentences[0]), max_len=maxlen)
    # print(indices)
    # print(" ".join(get_word(w) for w in indices))
    # for w in indices:
    #     print
Example #18
from keras_bert import Tokenizer

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}
tokenizer = Tokenizer(token_dict)
print(tokenizer.tokenize('unaffable'))  # The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]']`
indices, segments = tokenizer.encode('unaffable')
print(indices)  # Should be `[0, 2, 3, 4, 1]`
print(segments)  # Should be `[0, 0, 0, 0, 0]`
Example #19
class BertTextCnnModel():
    def __init__(self):
        # logger.info("BertBiLstmModel init start!")
        print("BertBiLstmModel init start!")
        self.config_path, self.checkpoint_path, self.dict_path = config_name, ckpt_name, vocab_file
        self.max_seq_len, self.filters, self.embedding_dim, self.keep_prob = args.max_seq_len, args.filters, args.embedding_dim, args.keep_prob
        self.activation, self.label = args.activation, args.label
        # reader tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
        # choose a model here: text-cnn, r-cnn, or avt-cnn
        # self.build_model_text_cnn()
        # self.build_model_r_cnn()
        self.build_model_avt_cnn()
        # logger.info("BertBiLstmModel init end!")
        print("BertBiLstmModel init end!")

    def build_model_text_cnn(self):
        #########    text-cnn    #########
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # text cnn
        bert_output_emmbed = SpatialDropout1D(rate=self.keep_prob)(bert_output)
        concat_out = []
        for index, filter_size in enumerate(self.filters):
            x = Conv1D(name='TextCNN_Conv1D_{}'.format(index),
                       filters=int(self.embedding_dim / 2),
                       kernel_size=self.filters[index],
                       padding='valid',
                       kernel_initializer='normal',
                       activation='relu')(bert_output_emmbed)
            x = GlobalMaxPooling1D(
                name='TextCNN_MaxPool1D_{}'.format(index))(x)
            concat_out.append(x)
        x = Concatenate(axis=1)(concat_out)
        x = Dropout(self.keep_prob)(x)

        # final softmax (dense) layer
        dense_layer = Dense(self.label, activation=self.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def build_model_r_cnn(self):
        #########    RCNN    #########
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # rcnn
        bert_output_emmbed = SpatialDropout1D(rate=self.keep_prob)(bert_output)
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU

        x = Bidirectional(
            layer_cell(units=args.units,
                       return_sequences=args.return_sequences,
                       kernel_regularizer=regularizers.l2(args.l2 * 0.1),
                       recurrent_regularizer=regularizers.l2(
                           args.l2)))(bert_output_emmbed)
        x = Dropout(args.keep_prob)(x)
        x = Conv1D(filters=int(self.embedding_dim / 2),
                   kernel_size=2,
                   padding='valid',
                   kernel_initializer='normal',
                   activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dropout(args.keep_prob)(x)
        # final softmax (dense) layer
        dense_layer = Dense(self.label, activation=self.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def build_model_avt_cnn(self):
        #########    avt-cnn    #########
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # text cnn
        bert_output_emmbed = SpatialDropout1D(rate=self.keep_prob)(bert_output)
        concat_x = []
        concat_y = []
        concat_z = []
        for index, filter_size in enumerate(self.filters):
            conv = Conv1D(name='TextCNN_Conv1D_{}'.format(index),
                          filters=int(self.embedding_dim / 2),
                          kernel_size=self.filters[index],
                          padding='valid',
                          kernel_initializer='normal',
                          activation='relu')(bert_output_emmbed)
            x = GlobalMaxPooling1D(
                name='TextCNN_MaxPooling1D_{}'.format(index))(conv)
            y = GlobalAveragePooling1D(
                name='TextCNN_AveragePooling1D_{}'.format(index))(conv)
            z = AttentionWeightedAverage(
                name='TextCNN_Attention_{}'.format(index))(conv)
            concat_x.append(x)
            concat_y.append(y)
            concat_z.append(z)

        merge_x = Concatenate(axis=1)(concat_x)
        merge_y = Concatenate(axis=1)(concat_y)
        merge_z = Concatenate(axis=1)(concat_z)
        merge_xyz = Concatenate(axis=1)([merge_x, merge_y, merge_z])
        x = Dropout(self.keep_prob)(merge_xyz)

        # final softmax (dense) layer
        dense_layer = Dense(self.label, activation=self.activation)(x)
        output_layers = [dense_layer]
        self.model = Model(bert_inputs, output_layers)

    def compile_model(self):
        self.model.compile(optimizer=args.optimizers,
                           loss=categorical_crossentropy,
                           metrics=args.metrics)

    def callback(self):
        c_b = [
            ModelCheckpoint(args.path_save_model,
                            monitor='val_loss',
                            verbose=1,
                            save_best_only=True,
                            save_weights_only=False,
                            mode='min'),
            EarlyStopping(min_delta=1e-9, patience=4, mode='min')
        ]
        return c_b

    def fit(self, x_train, y_train, x_dev, y_dev):
        self.model.fit(x_train,
                       y_train,
                       batch_size=args.batch_size,
                       epochs=args.epochs,
                       validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        self.model.save(args.path_save_model)

    def load_model(self):
        print("BertBiLstmModel load_model start!")
        # logger.info("BertBiLstmModel load_model start!")
        self.model.load_weights(args.path_save_model)
        # logger.info("BertBiLstmModel load_model end+!")
        print("BertBiLstmModel load_model end+!")

    def process_pair(self, textss):
        # text preprocessing: takes a list of text pairs, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for texts in textss:
            tokens_text = self.tokenizer.tokenize(texts[0])
            logger.info('Tokens1:', tokens_text)
            tokens_text2 = self.tokenizer.tokenize(texts[1])
            logger.info('Tokens2:', tokens_text2)
            input_id, input_type_id = self.tokenizer.encode(
                first=texts[0], second=texts[1], max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return input_ids, input_masks, input_type_ids

    def predict(self, sen_1, sen_2):
        input_ids, input_masks, input_type_ids = self.process_pair(
            [[sen_1, sen_2]])
        return self.model.predict([input_ids, input_masks], batch_size=1)

    def predict_list(self, questions):
        label_preds = []
        for questions_pair in questions:
            input_ids, input_masks, input_type_ids = self.process_pair(
                [questions_pair])
            label_pred = self.model.predict([input_ids, input_masks],
                                            batch_size=1)
            label_preds.append(label_pred[0])
        return label_preds
Example #20
model = load_trained_model_from_checkpoint(config_path,
                                           checkpoint_path,
                                           training=True)
model.summary(line_length=120)

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
token_dict_inv = {v: k for k, v in token_dict.items()}

tokenizer = Tokenizer(token_dict)
text = '数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科'
tokens = tokenizer.tokenize(text)
tokens[1] = tokens[2] = '[MASK]'
print('Tokens:', tokens)

indices = np.array([[token_dict[token]
                     for token in tokens] + [0] * (512 - len(tokens))])
segments = np.array([[0] * len(tokens) + [0] * (512 - len(tokens))])
masks = np.array([[0, 1, 1] + [0] * (512 - 3)])

predicts = model.predict([indices, segments,
                          masks])[0].argmax(axis=-1).tolist()
print('Fill with: ', list(map(lambda x: token_dict_inv[x], predicts[0][1:3])))

sentence_1 = '数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科。'
sentence_2 = '从某种角度看屬於形式科學的一種。'
print('Tokens:', tokenizer.tokenize(first=sentence_1, second=sentence_2))
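
A possible continuation in the style of the keras-bert demos: with training=True the model's second output is the next-sentence-prediction head, so the same sentence pair can be scored for whether sentence_2 is a random next sentence (a sketch; the output indexing is assumed from those demos, not shown above):

indices, segments = tokenizer.encode(first=sentence_1, second=sentence_2, max_len=512)
masks = np.array([[0] * 512])

predicts = model.predict([np.array([indices]), np.array([segments]), masks])[1]
print('%s is random next: ' % sentence_2, bool(np.argmax(predicts, axis=-1)[0]))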
Example #21
class Preprocess:

  def __init__(self, DATASET, DOMAIN, PAIRS, COLAB, PREPROCESSING):
    self.MAX_NB_WORDS = 20000
    self.VALIDATION_SPLIT = 0.9
    self.COLAB = COLAB
    self.PREPROCESSING = PREPROCESSING
    self.DIR = '{}data/processed'.format(COLAB)  # where the processed data will be exported
    self.DATASET=DATASET
    self.DOMAIN=DOMAIN
    self.PAIRS = PAIRS
    self.nlp = spacy.load('en_core_web_lg')
    self.bugs = {}
    self.bugs_saved = []
    self.TRAIN_PATH = 'train_chronological'
    self.TEST_PATH = 'test_chronological'
    self.MAX_SEQUENCE_LENGTH_T = 50
    self.MAX_SEQUENCE_LENGTH_D = 150

    self.start()
    self.tokenizer_init()

    self.improve_ner(self.nlp)

  def tokenizer_init(self):
    pretrained_path = 'uncased_L-12_H-768_A-12'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')

    token_dict = load_vocabulary(vocab_path)
    print("Total vocabulary loaded: {}".format(len(token_dict)))

    self.tokenizer = Tokenizer(token_dict)

  def start(self):

    self.ENTITY_ENUM = {
        '': 'unknown',
        'PERSON': 'person',
        'NORP': 'nationality',
        'FAC': 'facility',
        'ORG': 'organization',
        'GPE': 'country',
        'LOC': 'location',
        'PRODUCT': 'product',
        'EVENT': 'event',
        'WORK_OF_ART': 'artwork',
        'LANGUAGE': 'language',
        'DATE': 'date',
        'TIME': 'time',
        # 'PERCENT': 'percent',
        # 'MONEY': 'money',
        # 'QUANTITY': 'quantity',
        # 'ORDINAL': 'ordinal',
        # 'CARDINAL': 'cardinal',
        'PERCENT': 'number',
        'MONEY': 'number',
        'QUANTITY': 'number',
        'ORDINAL': 'number',
        'CARDINAL': 'number',
        'LAW': 'law'
    }

    # Keyboards
    self.keyboards = [u'ctrl', u'CTRL', u'CTRL\+TAB', u'ctrl\+tab', u'ESC', u'Esc', u'esc', u'crtl \+ space', 
                u'CTRL \+ SPACE', u'CTRL + Space', u'CTRL\-C', u'CTRL\-V', u'ctrl\-c', u'ctrl\-v', u'Ctrl-z', u'Ctrl - z',
                u'CTRL-z', u'Ctrl+z', u'ctrl-z', u'ctrl+z', u'CTRL - z', u'Ctrl + z', u'CTRL+z', u'CTRL+Z', u'CTRL + Z',
                u'CTRL- Z']
    for i in range(0, 13):
        # Ctrl+number
        self.keyboards.append(u'CTRL\+{}'.format(i))
        self.keyboards.append(u'Ctrl\+{}'.format(i))
        self.keyboards.append(u'ctrl\+{}'.format(i))
        self.keyboards.append(u'CTRL \+ {}'.format(i))
        self.keyboards.append(u'Ctrl \+ {}'.format(i))
        self.keyboards.append(u'ctrl \+ {}'.format(i))
        self.keyboards.append(u'CTRL\-{}'.format(i))
        self.keyboards.append(u'Ctrl\-{}'.format(i))
        self.keyboards.append(u'ctrl\-{}'.format(i))
        # F+number
        self.keyboards.append(u'F{}'.format(i))
        self.keyboards.append(u'f{}'.format(i))

  def expand_contractions(self, text, contractions_dict):
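      """Expand contractions according to contractions_dict and strip any remaining apostrophes from the text."""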
      contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                        flags=re.IGNORECASE | re.DOTALL)
      def expand_match(contraction):
          match = contraction.group(0)
          expanded_contraction = contractions_dict.get(match) \
              if contractions_dict.get(match) \
              else contractions_dict.get(match.lower())
          return expanded_contraction
      
      expanded_text = contractions_pattern.sub(expand_match, text)
      expanded_text = re.sub("'", "", expanded_text)
      return expanded_text

  def save_buckets(self, buckets):
    with open(os.path.join(self.DIR, self.BASE + '_buckets.pkl'), 'wb') as f:
      pickle.dump(buckets, f)

  def read_pairs(self, df):
    bug_pairs = []
    bucket_dups = []
    bug_ids = set()
    buckets = self.create_bucket(df)
    self.save_buckets(buckets)
    # buckets
    for key in buckets:
      if len(buckets[key]) > 1:
          bucket_dups.append([key, list(buckets[key])])

    bug_pairs, bug_ids = self.getting_pairs(bucket_dups)

    with open(os.path.join(self.DIR, 'bug_pairs.txt'), 'w') as f:
      for pair in bug_pairs:
        f.write("{} {}\n".format(pair[0], pair[1]))
    bug_ids = sorted(bug_ids)
    with open(os.path.join(self.DIR, 'bug_ids.txt'), 'w') as f:
      for bug_id in bug_ids:
        f.write("%d\n" % bug_id)
    return bug_pairs, bug_ids

  def split_train_test(self, bug_pairs, VALIDATION_SPLIT):
    random.shuffle(bug_pairs)
    split_idx = int(len(bug_pairs) * VALIDATION_SPLIT)
    with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'w') as f:
      for pair in bug_pairs[:split_idx]:
        f.write("{} {}\n".format(pair[0], pair[1]))
    test_data = {}
    for pair in bug_pairs[split_idx:]:
      bug1 = int(pair[0])
      bug2 = int(pair[1])
      if bug1 not in test_data:
        test_data[bug1] = set()
      test_data[bug1].add(bug2)
    with open(os.path.join(self.DIR, '{}.txt'.format(self.TEST_PATH)), 'w') as f:
      for bug in test_data.keys():
        f.write("{} {}\n".format(bug, ' '.join([str(x) for x in test_data[bug]])))
    print('Train and test created')

  def func_name_tokenize(self, text):
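    """Split camelCase names on lower-to-upper boundaries, e.g. 'openFileDialog' -> 'open File Dialog'."""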
    s = []
    for i, c in enumerate(text):
      if c.isupper() and i > 0 and text[i-1].islower():
        s.append(' ')
      s.append(c)
    return ''.join(s).strip()

  def improve_ner(self, nlp):
    # Dates
    dates = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 
        'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    for year in range(2000, 2012):
        for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Oct', 'Nov', 'Dec']:
            for day in range(32):
                dates.append( u'{} {}, {}'.format(day, month, year))

    # Steps
    steps = []
    for i in range(15):
        steps.append(u'{}. '.format(i))
        steps.append(u'({}) '.format(i))
        steps.append(u'{}) '.format(i))
    
    list_terms = [dates,
                  (u'MacOS', u'MacOS X', u'MacOS x', u'Mac OS X', u'Redhat Linux', u'RedHat Enterprise',
                  u'Linux', u'Windows XP', u'WindowsXP', u'Windows NT', u'Fedora Core', u'Red Hat'),
                  steps
                ]
    list_labels = ['DATE', "OS", "STEP INDEX"] 

    self.allow_ner = ['person', 'product', 'time', 'language', 'organization', 'number']

    self.allow_ner += [ent.lower() for ent in list_labels]

    for terms, label in zip(list_terms, list_labels):
        entity_matcher = EntityMatcher(label, nlp, terms, label)
        nlp.add_pipe(entity_matcher, after='ner')

  def ner(self, text):
    corpus = self.nlp(text)
    ents, start_char, end_char = [], [], []
    
    ents = [self.ENTITY_ENUM[row.label_] if row.label_ in self.ENTITY_ENUM else row.label_ for row in corpus.ents]
    starts_char = np.array([row.start_char for row in corpus.ents])
    ends_char = np.array([row.end_char for row in corpus.ents])
    
    for index, ent, start_pos, end_pos in zip(range(len(ents)), ents, starts_char, ends_char):
        if ent.lower() in self.allow_ner:
            replaced = " {} ".format(ent.lower())
            text = text[:start_pos] + replaced + text[end_pos:]
            diff_replaced = len(replaced) - len(text[start_pos:end_pos])
            if diff_replaced > 0: # push
                starts_char[index+1:] += diff_replaced
                ends_char[index+1:] += diff_replaced
            elif diff_replaced < 0: # pull
                starts_char[index+1:] -= (diff_replaced * -1)
                ends_char[index+1:] -= (diff_replaced * -1)
    return text

  def normalize_text(self, text):
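    """For 'bert' preprocessing, space-join the WordPiece tokens; otherwise apply regex cleanup, NER replacement and lowercased word tokenization."""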
    if self.PREPROCESSING == 'bert':
      text = " ".join(self.tokenizer.tokenize(str(text)))
    else:
      text = re.sub(r'[0-9]{1,} (min|minutes|minute|m)', 'x time', str(text)) # [0-9] min
      # Extension files
      #text = re.sub(r'(WAR|zip|ZIP|css)', 'extension file', text) # extension file
      #text = re.sub(r'.(zip|txt|java|js|html|php|pdf|exe|doc|jar|xml)', ' extension file', text) # extension file
      # Memory 
      text = re.sub(r'kB', 'kb', text)
      # Keyboards
      text = re.sub(r'('+('|'.join(self.keyboards))+')', 'keyboard', text) # key board
      # Contraction
      text=self.expand_contractions(text, contractions_dict)

      # NER processing
      text = text[:100000] # limit of spacy lib
      text = self.ner(text)

      tokens = re.compile(r'[\W_]+', re.UNICODE).split(text)
      text = ' '.join([self.func_name_tokenize(token) for token in tokens])
      #     text = ' '.join(tokens)
      
      text = re.sub(r'\d+((\s\d+)+)?', ' ', text)
      text = [word.lower() for word in nltk.word_tokenize(text)]
      text = ' '.join([word for word in text]).encode('utf-8')
    return text

  def save_dict(self, set, filename):
    with open(filename, 'w') as f:
      for i, item in enumerate(set):
        f.write('%s\t%d\n' % (item, i))

  def load_dict(self, filename):
    dict = {}
    with open(filename, 'r') as f:
      for line in f:
        tokens = line.split('\t')
        dict[tokens[0]] = tokens[1]
    return dict

  def normalized_data(self, bug_ids, df):
    print("Normalizing text...")
    products = set()
    bug_severities = set()
    priorities = set()
    versions = set()
    components = set()
    bug_statuses = set()
    text = []
    normalized_bugs_json = []
    print("Total:", df.shape[0])
    res = self.paralelize_processing(df, self.processing_normalized_data, (self.normalize_text, ))
    for result in res:
      if self.BASE != 'firefox':
        products = products.union(result[0])
        bug_severities = bug_severities.union(result[1])
      priorities = priorities.union(result[2])
      versions = versions.union(result[3])
      components = components.union(result[4])
      bug_statuses = bug_statuses.union(result[5])
      text += result[6]
      normalized_bugs_json += result[7]
    print("Total of normalized: ", len(normalized_bugs_json))
    print("Writing the normalized_bugs.json")
    with open(os.path.join(self.DIR, 'normalized_bugs.json'), 'w') as f:
      for row in tqdm(normalized_bugs_json):
        f.write(row)
    
    if self.BASE != 'firefox':
        self.save_dict(products, os.path.join(self.DIR, 'product.dic'))
        self.save_dict(bug_severities, os.path.join(self.DIR, 'bug_severity.dic'))
    self.save_dict(priorities, os.path.join(self.DIR, 'priority.dic'))
    self.save_dict(versions, os.path.join(self.DIR, 'version.dic'))
    self.save_dict(components, os.path.join(self.DIR, 'component.dic'))
    self.save_dict(bug_statuses, os.path.join(self.DIR, 'bug_status.dic'))
    return text

  def processing_normalized_data(self, df, normalize_text):
    products = set()
    bug_severities = set()
    priorities = set()
    versions = set()
    components = set()
    bug_statuses = set()
    text = []
    normalized_bugs_json = []
    with tqdm(total=df.shape[0]) as loop:
      for row in df.iterrows():
          bug = row[1]
          if self.BASE != 'firefox':
            products.add(bug['product'])
            bug_severities.add(bug['bug_severity'])
          priorities.add(bug['priority'])
          versions.add(bug['version'])
          components.add(bug['component'])
          bug_statuses.add(bug['bug_status'])
          
          if 'description' not in bug or bug['description'] == '':
              bug['description'] = bug['title']

          if 'title' not in bug or bug['title'] == '':
              bug['title'] = bug['description']
          
          if self.PREPROCESSING == 'bert':
            description = normalize_text(bug['description'])
            bug['description_original'] = bug['description']
            bug['description'] = description
            title = normalize_text(bug['title'])
            bug['title_original'] = bug['title']
            bug['title'] = title
          else:
            bug['description'] = normalize_text(bug['description'])
            bug['title'] = normalize_text(bug['title'])
               
          normalized_bugs_json.append('{}\n'.format(bug.to_json()))

          text.append(bug['description'])
          text.append(bug['title'])
          loop.update(1)

    return [products, bug_severities, priorities, versions, components, bug_statuses, text, normalized_bugs_json]

  def build_vocabulary(self, train_text, MAX_NB_WORDS):
    word_freq = self.build_freq_dict(train_text)
    print('word vocabulary')
    word_vocab = self.save_vocab(word_freq, MAX_NB_WORDS, 'word_vocab_bert.pkl')
    return word_vocab

  def build_freq_dict(self, train_text):
    print('building frequency dictionaries')
    word_freq = defaultdict(int)
    for text in tqdm(train_text):
      for word in text.split():
        word_freq[word] += 1
    return word_freq

  def save_vocab(self, freq_dict, vocab_size, filename):
    top_tokens = sorted(freq_dict.items(), key=lambda x: -x[1])[:vocab_size - 2]
    print('most common token is %s which appears %d times' % (top_tokens[0][0], top_tokens[0][1]))
    print('least common kept token is %s which appears %d times' % (top_tokens[-1][0], top_tokens[-1][1]))
    vocab = {}
    i = 2  # 0-index is for padding, 1-index is for UNKNOWN
    for j in range(len(top_tokens)):
      vocab[top_tokens[j][0]] = i
      i += 1
    with open(os.path.join(self.DIR, filename), 'wb') as f:
      pickle.dump(vocab, f)
    return vocab

  def load_vocab(self, filename):
      with open(os.path.join(self.DIR, filename), 'rb') as f:
          return pickle.load(f)

  def dump_bugs(self, word_vocab, total):
      bug_dir = os.path.join(self.DIR, 'bugs')
      if not os.path.exists(bug_dir):
          os.mkdir(bug_dir)
      bugs = []
      print("Reading the normalized_bugs.json ...")
      if self.BASE != 'firefox':
        product_dict = self.load_dict(os.path.join(self.DIR,'product.dic'))
        bug_severity_dict = self.load_dict(os.path.join(self.DIR,'bug_severity.dic'))
      priority_dict = self.load_dict(os.path.join(self.DIR,'priority.dic'))
      version_dict = self.load_dict(os.path.join(self.DIR,'version.dic'))
      component_dict = self.load_dict(os.path.join(self.DIR,'component.dic'))
      bug_status_dict = self.load_dict(os.path.join(self.DIR,'bug_status.dic'))

      with open(os.path.join(self.DIR, 'normalized_bugs.json'), 'r') as f:
          #loop = tqdm(f)
          with tqdm(total=total) as loop:
              for line in f:
                  bug = json.loads(line)
                  if self.BASE != 'firefox':
                    bug['product'] = product_dict[bug['product']]
                    bug['bug_severity'] = bug_severity_dict[bug['bug_severity']]
                  bug['priority'] = priority_dict[bug['priority']]
                  bug['version'] = version_dict[bug['version']]
                  bug['component'] = component_dict[bug['component']]
                  bug['bug_status'] = bug_status_dict[bug['bug_status']]
                  bugs.append(bug)
                  loop.update(1)

      return bugs

  def dump_vocabulary(self, bugs, word_vocab, bug_dir):
      UNK = 1
      cont=0
      total = len(bugs)
      print("Starting the dump ...")
      bugs_set = {}
      bugs_saved = []
      for bug in tqdm(bugs):
          #bug = json.loads(line)
          #print(bug)
          cont+=1
          if self.PREPROCESSING == 'bert':
            ids, segments = self.tokenizer.encode('' if bug['description_original'] == None else bug['description_original'], max_len=self.MAX_SEQUENCE_LENGTH_D)
            bug['description_token'] = ids
            bug['description_segment'] = segments
            ids, segments = self.tokenizer.encode('' if bug['title_original'] == None else bug['title_original'], max_len=self.MAX_SEQUENCE_LENGTH_T)
            bug['title_token'] = ids
            bug['title_segment'] = segments
            bug.pop('description_original')
            bug.pop('title_original')
          else: # BASELINE
            bug['description_token'] = [word_vocab.get(w.encode('utf-8'), UNK) for w in bug['description'].split()]
            if len(bug['title']) == 0:
                bug['title'] = bug['description'][:10]
            bug['title_token'] = [word_vocab.get(w.encode('utf-8'), UNK) for w in bug['title'].split()]
          # Save the bug processed
          bugs_set[bug['issue_id']] = bug
          with open(os.path.join(bug_dir, str(bug['issue_id']) + '.pkl'), 'wb') as f:
              pickle.dump(bug, f)
          bugs_saved.append(bug['issue_id'])

      return [bugs_set, bugs_saved]

  def paralelize_processing(self, bugs, callback, parameters):
      cpu = os.cpu_count() - 1
      pool = Pool(processes=cpu) # start N worker processes
      works = []
      n = len(bugs) // cpu
      n = 1 if n == 0 else n
      sliced = []
      pos_end = n
      end = len(bugs)
      for i in range(cpu):
          pos_end = end if pos_end>=end else pos_end
          pos_end = end if (i+1) == cpu and pos_end < end else pos_end
          sliced.append(bugs[i*n:pos_end])
          pos_end += n

      print("Slicing in {} workers".format(len(sliced)))
      for s in sliced:
          if len(s) > 0:
              config = list(parameters)
              config.insert(0, s)
              config = tuple(config)
              works.append(pool.apply_async(callback, config))
              #dump_vocabulary(s, bug_dir)

      print("Executing the works...")
      res = [w.get() for w in works]
      return res

  def processing_dump(self, bugs, word_vocab, bugs_id, bugs_id_dataset):
      #clear_output()
      bug_dir = os.path.join(self.DIR, 'bugs')
      res = self.paralelize_processing(bugs, self.dump_vocabulary, (word_vocab, bug_dir, ))
      for result in res:
        bugs_set = result[0]
        bugs_saved = result[1]
        for bug in bugs_set:
          self.bugs[bug] = bugs_set[bug]
        self.bugs_saved += bugs_saved
      #self.dump_vocabulary(bugs, word_vocab, bug_dir)

      self.validing_bugs_id(bugs_id, bugs_id_dataset)

      print("All done!")

  def validing_bugs_id(self, bugs_id, bugs_id_dataset):
      print("Check if all bugs id regirested in the pairs exist in dataset")
      bugs_invalid = set(bugs_id) - set(bugs_id_dataset)
      bugs_id_dataset = set(bugs_id_dataset) - bugs_invalid
      bugs_id_dataset = sorted(bugs_id_dataset)
      with open(os.path.join(self.DIR, 'bug_ids.txt'), 'w') as f:
        for bug_id in bugs_id_dataset:
          f.write("%d\n" % bug_id)
      print("Bugs not present in dataset: ", list(bugs_invalid))
      bug_pairs = []
      with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'r') as f:
          for line in f:
              bug1, bug2 = line.strip().split()
              if bug1 not in bugs_invalid and bug2 not in bugs_invalid:
                bug_pairs.append([bug1, bug2])
      with open(os.path.join(self.DIR, '{}.txt'.format(self.TRAIN_PATH)), 'w') as f:
          for pairs in bug_pairs:
              f.write("{} {}\n".format(pairs[0], pairs[1]))

  def create_bucket(self, df):
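    """Group bugs into duplicate buckets: link each issue_id to its dup_id and take the connected components of the graph."""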
    print("Creating the buckets...")
    buckets = {}
    G=nx.Graph()
    for row in tqdm(df.iterrows()):
        bug_id = row[1]['issue_id']
        dup_id = row[1]['dup_id']
        if dup_id == '[]':
            G.add_node(bug_id)
        else:
            G.add_edges_from([(int(bug_id), int(dup_id))])
    for g in tqdm(nx.connected_components(G)):
        group = set(g)
        for bug in g:
            master = int(bug)
            query = df[df['issue_id'] == master]
            if query.shape[0] <= 0:
                group.remove(master)
                master = np.random.choice(list(group), 1)
        buckets[int(master)] = group
    return buckets

  def getting_pairs(self, array):
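      """Expand each bucket into all intra-bucket pairs, e.g. [[1, [1, 2, 3]]] -> ([[1, 2], [1, 3], [2, 3]], {1, 2, 3})."""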
      res = []
      bug_ids = set()
      for row in array:
          dup_bucket, dups = row
          bug_ids.add(dup_bucket)
          dups = list(dups)
          while len(dups) > 1:
              bucket = dups[0]
              bug_ids.add(bucket)
              dups.remove(bucket)
              for d in dups:
                  bug_ids.add(d)
                  res.append([bucket, d])
      return res, bug_ids    
  
  def run(self):
    
      # create 'dataset' directory
      bug_dir = os.path.join(self.DIR, self.DATASET)
      if not os.path.exists(bug_dir):
          os.mkdir(bug_dir)

      # create 'processing' directory
      bug_dir = os.path.join(bug_dir, self.PREPROCESSING)
      if not os.path.exists(bug_dir):
          os.mkdir(bug_dir)

      normalized = os.path.join('{}data/normalized'.format(self.COLAB), self.DATASET)

      self.BASE = self.DOMAIN
      self.DIR = bug_dir
      self.DOMAIN = os.path.join(normalized, self.DOMAIN)
      self.PAIRS = os.path.join(normalized, self.PAIRS)
      
      # Train
      df_train = pd.read_csv('{}.csv'.format(self.DOMAIN))
      if self.BASE != 'firefox':
        df_train.columns = ['issue_id','bug_severity','bug_status','component',
                            'creation_ts','delta_ts','description','dup_id','priority',
                            'product','resolution','title','version']
      else:
        df_train.columns = ['issue_id','priority','component','dup_id','title',
                                'description','bug_status','resolution','version',
                                    'creation_ts', 'delta_ts']
      ### Pairs
      #df_train_pair = pd.read_csv('{}.csv'.format(self.PAIRS))

      bug_pairs, bug_ids = self.read_pairs(df_train)
      bugs_id_dataset = df_train['issue_id'].values
      print("Number of bugs: {}".format(len(bug_ids)))
      print("Number of pairs: {}".format(len(bug_pairs)))

      # Split into train/test
      self.split_train_test(bug_pairs, self.VALIDATION_SPLIT)

      # Debug
      # test  = [14785, 24843, 32367, 33529]
      # df_train = df_train[df_train['issue_id'].isin(test)]

      # Normalize the text
      text = self.normalized_data(bug_ids, df_train)
      # Build the vocab
      word_vocab = self.build_vocabulary(text, self.MAX_NB_WORDS)
      
      # Dump the preprocessed bugs
      num_lines =  len(open(os.path.join(self.DIR, 'normalized_bugs.json'), 'r').read().splitlines()) * 2
      total = num_lines // 2
      bugs = self.dump_bugs(word_vocab, total)
      self.processing_dump(bugs, word_vocab, bug_ids, bugs_id_dataset)
      print("Saved!")