Example #1
    def run(self):
        # set environment
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)

        # load models
        # each worker only needs to load the model once
        paths = get_checkpoint_paths(self._bert_checkpoint)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=1,
        )
        vocabs = load_vocabulary(paths.vocab)
        print('model init done', self._gpuid)

        while True:
            xfile = self._queue.get()
            if xfile is None:
                self._queue.put(None)
                break
            embeddings = extract_embeddings(model=model,
                                            vocabs=vocabs,
                                            texts=xfile[1],
                                            output_layer_num=1,
                                            poolings=[POOL_NSP, POOL_MAX])
            print('worker running', self._gpuid, len(self.return_list))
            self.return_list.append({
                'worker': self._gpuid,
                'id': xfile[0],
                'content': xfile[1],
                'embeddings': embeddings
            })

        print('worker predict done at gpu:', self._gpuid)
def construct_model2(paths, use_multi_gpus=True):
    token_dict = load_vocabulary(paths.vocab)
    tokenizer = SimpleTokenizer(token_dict)

    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    x1_in = Input(shape=(None, ), name='input_x1', dtype='int32')
    x2_in = Input(shape=(None, ), name='input_x2')
    x = bert_model([x1_in, x2_in])
    x_cls = Lambda(lambda x: x[:, 0])(x)
    y_pred = Dense(1, activation='sigmoid', name='output_similarity')(x_cls)

    model = Model([x1_in, x2_in], y_pred)
    if use_multi_gpus:
        print('using multi-gpus')
        model = multi_gpu_model(model, gpus=2)

    model.compile(loss={'output_similarity': 'binary_crossentropy'},
                  optimizer=Adam(1e-5),
                  metrics={'output_similarity': 'accuracy'})

    return model, tokenizer
 def init_bert(self,config):
     bert_config = config['bert']['config_json']
     bert_checkpoint = config['bert']['bert_ckpt']
     bert_vocab = config['bert']['bert_vocab']
     bert_vocabs = load_vocabulary(bert_vocab)
     self.bert_token = Tokenizer(bert_vocabs)
     self.bert = self.load_bert(bert_config, bert_checkpoint)
Example #4
def construct_model(paths, use_multi_gpus=False):
    token_dict = load_vocabulary(paths.vocab)
    tokenizer = SimpleTokenizer(token_dict)

    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    # x1 is the concatenation of a QuestionCondPair's question field and cond_text field; x2 holds the
    # corresponding segment_ids and has the same length as x1: positions belonging to the question are 0,
    # positions belonging to cond_text are 1.
    # (question is the query text; cond_text is the textual form of a pieced-together query condition,
    # e.g. "the film title is Escape Room")
    # y is the probability that cond_text is one of the query conditions contained in question.
    # x1, x2 and y are all built in QuestionCondPairsDataseq.__getitem__ (see the sketch after this function).
    x1_in = Input(shape=(None, ), name='input_x1', dtype='int32')
    x2_in = Input(shape=(None, ), name='input_x2')
    x = bert_model([x1_in, x2_in])
    x_cls = Lambda(lambda x: x[:, 0])(x)  # take the first element of the BERT output sequence (the [CLS] position)
    y_pred = Dense(1, activation='sigmoid', name='output_similarity')(x_cls)

    model = Model([x1_in, x2_in], y_pred)
    if use_multi_gpus:
        print('using multi-gpus')
        model = multi_gpu_model(model, gpus=2)

    model.compile(loss={'output_similarity': 'binary_crossentropy'},
                  optimizer=Adam(1e-5),
                  metrics={'output_similarity': 'accuracy'})

    return model, tokenizer
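
To make the comments above concrete, here is a minimal, hypothetical sketch of scoring one question/cond_text pair with the model returned by construct_model. It assumes SimpleTokenizer follows the usual keras_bert Tokenizer.encode interface (token ids plus segment ids); it is not taken from the original project.

import numpy as np

# Hypothetical usage sketch: score one question/cond_text pair with the
# similarity model built above (paths comes from get_checkpoint_paths).
model, tokenizer = construct_model(paths, use_multi_gpus=False)
question = '密室逃生的票房是多少'   # the query text
cond_text = '影片名称是密室逃生'    # textual form of one candidate condition
# x1 = token ids of "[CLS] question [SEP] cond_text [SEP]",
# x2 = segment ids (0 over the question part, 1 over the cond_text part)
x1, x2 = tokenizer.encode(first=question, second=cond_text)
prob = model.predict([np.array([x1]), np.array([x2])])[0][0]
print('P(cond_text is a condition of question) =', prob)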
Example #5
def load_task2_testX(dict_path, data_dir):
    if not os.path.exists(os.path.join(
            data_dir, 'task2_testX.npy')) or not os.path.exists(
                os.path.join(data_dir, 'task2_test_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                         dtype=str)
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        seq_len = 512  # maximum should be 638, but BERT-base only supports up to 512
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=seq_len)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.asarray(input_data)
        seg = np.asarray(input_seg)

        np.save(os.path.join(data_dir, 'task2_testX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_test_seg.npy'), seg)
    else:
        X, seg = np.load(os.path.join(data_dir, 'task2_testX.npy')), np.load(
            os.path.join(data_dir, 'task2_test_seg.npy'))
    return X, seg
Example #6
 def init_all(self, config):
     if self.train:
         bert_config = config['bert']['config_json']
         bert_checkpoint = config['bert']['bert_ckpt']
         bert_vocab = config['bert']['bert_vocab']
         bert_vocabs = load_vocabulary(bert_vocab)
         self.bert_token = Tokenizer(bert_vocabs)
         self.bert = self.init_bert(bert_config, bert_checkpoint)
     self.get_sentence(config['train_list'] if self.train else config['eval_list'], training=self.train)
Example #7
  def tokenizer_init(self):
    pretrained_path = 'uncased_L-12_H-768_A-12'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')

    token_dict = load_vocabulary(vocab_path)
    print("Total vocabulary loaded: {}".format(len(token_dict)))

    self.tokenizer = Tokenizer(token_dict)
Example #8
 def load_model(self):
     tf.keras.backend.clear_session()
     logging.info("Loading RuBERT model...")
     paths = get_checkpoint_paths("model_bert")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint, seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name="Pooling")(inputs.output)
     vocab = load_vocabulary(paths.vocab)
     return tf.keras.Model(inputs=inputs.inputs,
                           outputs=outputs), vocab, Tokenizer(vocab)
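
A small, hypothetical usage sketch for the pooled RuBERT model returned above (not part of the original class): sentences are encoded to exactly seq_len=50 tokens and the model yields one pooled vector per sentence.

import numpy as np

# Hypothetical usage; `embedder` stands for an instance of the class above.
model, vocab, tokenizer = embedder.load_model()
token_ids, segment_ids = tokenizer.encode('пример предложения', max_len=50)
pooled = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(pooled.shape)  # (hidden_size,), e.g. (768,) for a BERT-base checkpoint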
Example #9
def load_pretrained(options):
    model = load_trained_model_from_checkpoint(
        options.bert_config_file,
        options.init_checkpoint,
        training=False,
        trainable=True,
        seq_len=options.max_seq_length,
    )
    vocab = load_vocabulary(options.vocab_file)
    print('vocab size', len(vocab))
    return model, vocab
Example #10
def pretrain_model():

    df = pd.read_csv('../data/task2_trainset.csv', dtype=str)
    df_2 = pd.read_csv('../data/task2_public_testset.csv', dtype=str)
    abstract_1 = df.values[:, 2]
    abstract_2 = df_2.values[:, 2]
    token_dict = load_vocabulary(dict_path)
    token_list = list(token_dict.keys())
    tokenizer = Tokenizer(token_dict)
    X_1 = collect_inputs(abstract_1, tokenizer)
    X_2 = collect_inputs(abstract_2, tokenizer)
    X = X_1 + X_2
    print(len(X))

    model = load_trained_model_from_checkpoint(config_path,
                                               checkpoint_path,
                                               training=True,
                                               trainable=True,
                                               seq_len=512)
    compile_model(model)

    def _generator():
        while True:
            yield gen_batch_inputs(generate_input_by_batch(X),
                                   token_dict,
                                   token_list,
                                   seq_len=512,
                                   mask_rate=0.3)

    opt_filepath = sys.argv[1]
    checkpoint = ModelCheckpoint(opt_filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min',
                                 save_weights_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5,
                                  patience=10,
                                  verbose=1,
                                  mode='auto',
                                  min_delta=0.1,
                                  cooldown=10,
                                  min_lr=1e-10)
    es = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [checkpoint, es, reduce_lr]

    model.fit_generator(generator=_generator(),
                        steps_per_epoch=500,
                        epochs=5000,
                        validation_data=_generator(),
                        validation_steps=200,
                        callbacks=callbacks_list)
Example #11
 def __init__(self, docs, vec):
     self.texts = np.array(docs)
     self.vec = vec
     paths = get_checkpoint_paths(".")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint,
         seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
     self.model = Model(inputs=inputs.inputs, outputs=outputs)
     self.vocab = load_vocabulary(paths.vocab)
     self.tokenizer = Tokenizer(self.vocab)
Example #12
    def __init__(self):
        import keras
        import keras_bert
        from driver_amount import addh

        bert_model, bert_model_config = keras_bert.build_model_from_config(
            addh + config.BERT_CONFIG_PATH,
            trainable=False
        )
        output = bert_model.get_layer("Embedding-Norm").output

        self.model = keras.models.Model(bert_model.input, output)
        self.model.load_weights(addh + config.MODEL_PATH, by_name=True)
        self.vocab = keras_bert.load_vocabulary(addh + config.BERT_VOCAB_PATH)
Example #13
    def __init__(self, config):
        model_path = config["model_path"]
        if not os.path.exists(model_path):
            model_dir = os.path.dirname(model_path)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            subprocess.run(
                f"wget -P {model_dir} {MODEL_URL} && cd {model_dir} && unzip chinese_wwm_L-12_H-768_A-12.zip",
                shell=True)

        paths = get_checkpoint_paths(model_path)
        self.model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=1)
        self.vocabs = load_vocabulary(paths.vocab)
Example #14
def load_task2_trainXY(dict_path, data_dir):
    if not os.path.exists(os.path.join(
            data_dir, 'task2_trainX.npy')) or not os.path.exists(
                os.path.join(
                    data_dir, 'task2_trainY.npy')) or not os.path.exists(
                        os.path.join(data_dir, 'task2_train_seg.npy')):
        df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'),
                         dtype=str)
        cate = df.values[:, -1]

        # generating Y
        Y = np.zeros((cate.shape[0], 4))
        name = {
            'THEORETICAL': 0,
            'ENGINEERING': 1,
            'EMPIRICAL': 2,
            'OTHERS': 3
        }
        for i in range(cate.shape[0]):
            for c in cate[i].split(' '):
                Y[i, name[c]] += 1

        # generating X
        abstract = df.values[:, 2]

        # collect words
        token_dict = load_vocabulary(dict_path)
        tokenizer = Tokenizer(token_dict)
        input_data = []
        input_seg = []
        for i in tqdm(abstract):
            j = i.replace('$$$', ' ')
            idx, seg = tokenizer.encode(j, max_len=512)
            input_data.append(idx)
            input_seg.append(seg)
        X = np.array(input_data)
        seg = np.array(input_seg)
        np.save(os.path.join(data_dir, 'task2_trainX.npy'), X)
        np.save(os.path.join(data_dir, 'task2_trainY.npy'), Y)
        np.save(os.path.join(data_dir, 'task2_train_seg.npy'), seg)
    else:
        X, Y, seg = np.load(os.path.join(
            data_dir, 'task2_trainX.npy')), np.load(
                os.path.join(data_dir, 'task2_trainY.npy')), np.load(
                    os.path.join(data_dir, 'task2_train_seg.npy'))
    return X, Y, seg
Example #15
 def __init__(self, batch_size, gpu_num, gpu_name):
     gpu_option(gpu_name, gpu_num)
     self.batch_size = batch_size
     print("##### load KerasBERT start #####")
     # Path
     model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
     config_path = os.path.join(model_path, 'bert_config.json')
     checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
     vocab_path = os.path.join(model_path, 'vocab.txt')
     token_dict = load_vocabulary(vocab_path)
     model = load_trained_model_from_checkpoint(config_path,
                                                checkpoint_path)
     if gpu_num >= 2:
         self.par_model = multi_gpu_model(model, gpus=gpu_num)
     else:
         self.par_model = model
     self.tokenizer = Tokenizer(token_dict)
     print("##### load KerasBERT end #####")
Example #16
 def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
     print('--' * 10 + ' Load BERT model start ' + '--' * 10)
     gpu_option(gpu_name, gpu_num)
     self.seq_max_len = seq_max_len  # same to train
     self.batch_size = batch_size
     model_path = 'models/BERT/pretrained_model/uncased_L-24_H-1024_A-16'
     vocab_path = os.path.join(model_path, 'vocab.txt')
     # load Tokenizer
     token_dict = load_vocabulary(vocab_path)
     self.tokenizer = Tokenizer(token_dict)
     MODEL_SAVE_PATH = 'models/BERT/fine_tune_model/bert_fine_tune.hdf5'
     model = load_model(MODEL_SAVE_PATH,
                        custom_objects=get_custom_objects(),
                        compile=False)
     if gpu_num >= 2:
         self.par_model = multi_gpu_model(model, gpus=gpu_num)
     else:
         self.par_model = model
     print('--' * 10 + ' Load BERT model end ' + '--' * 10)
Example #17
def preprocess(char_seqs,
               tag_seqs,
               vocab_file,
               SEQ_LEN=512,
               cased=True,
               tag_vocab=None,
               TAG_PAD=''):
    from keras_bert import load_vocabulary

    # Load vocab
    vocab = load_vocabulary(vocab_file)

    # preprocess char_seqs
    token_id_seqs = preprocess_char(char_seqs, vocab, SEQ_LEN, cased)

    # create segment_seqs
    segment_seqs = create_segment(len(token_id_seqs), len(token_id_seqs[0]))

    # preprocess tag_seqs
    one_hot_tag_id_seqs, tag_vocab = preprocess_tag(tag_seqs, SEQ_LEN,
                                                    tag_vocab, TAG_PAD)

    return token_id_seqs, segment_seqs, one_hot_tag_id_seqs, tag_vocab
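
A hedged example call for preprocess above; the character/tag sequences and vocab path are invented, and the helpers (preprocess_char, create_segment, preprocess_tag) are assumed to behave as their names and comments suggest.

# Illustrative inputs only: two character sequences with per-character tags.
char_seqs = [['J', 'o', 'h', 'n'], ['P', 'a', 'r', 'i', 's']]
tag_seqs = [['B-PER', 'I-PER', 'I-PER', 'I-PER'],
            ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']]
token_id_seqs, segment_seqs, one_hot_tags, tag_vocab = preprocess(
    char_seqs, tag_seqs, 'uncased_L-12_H-768_A-12/vocab.txt',
    SEQ_LEN=128, cased=False, TAG_PAD='O')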
Example #18
            data.append([row[0], row[1], int(row[2])])
    return data


# load data, generate train data and valid data
all_data = read_data(data_path)
valid_num = int(len(all_data) * valid_data_ratio)
train_num = len(all_data) - valid_num
train_data = all_data[:train_num]
valid_data = all_data[train_num:]
print('data number:', len(all_data))
print('train data number:', len(train_data))
print('valid data number:', len(valid_data))

# load Tokenizer
token_dict = load_vocabulary(vocab_path)
tokenizer = Tokenizer(token_dict)


class data_generator:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
Example #19
)

if len(sys.argv) == 2:
    model_path = sys.argv[1]
else:
    from keras_bert.datasets import get_pretrained, PretrainedList
    model_path = get_pretrained(PretrainedList.chinese_base)

paths = get_checkpoint_paths(model_path)

model = load_trained_model_from_checkpoint(paths.config,
                                           paths.checkpoint,
                                           seq_len=10)
model.summary(line_length=120)

token_dict = load_vocabulary(paths.vocab)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
"""Official outputs:
{
  "linex_index": 0,
  "features": [
    {
Example #20
if len(sys.argv) != 2:
    print('python load_model.py UNZIPPED_MODEL_PATH')
    sys.exit(-1)

print(
    'This demo demonstrates how to load the pre-trained model and extract the sentence embedding with pooling.'
)

model_path = sys.argv[1]
config_path = os.path.join(model_path, 'bert_config.json')
checkpoint_path = os.path.join(model_path, 'bert_model.ckpt')
dict_path = os.path.join(model_path, 'vocab.txt')

model = load_trained_model_from_checkpoint(config_path,
                                           checkpoint_path,
                                           seq_len=10)
pool_layer = MaskedGlobalMaxPool1D(name='Pooling')(model.output)
model = keras.models.Model(inputs=model.inputs, outputs=pool_layer)
model.summary(line_length=120)

token_dict = load_vocabulary(dict_path)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
print('Pooled:', predicts.tolist()[:5])
Example #21
z_labels = to_categorical(z_labels, num_classes=num_classes)
# convert the data into the input format required by the model
train_data = [
    np.array([x_data[i].replace(' ', ''), x_labels[i]])
    for i in range(len(x_labels))
]
valid_data = [
    np.array([y_data[i].replace(' ', ''), y_labels[i]])
    for i in range(len(y_labels))
]
test_data = [
    np.array([z_data[i].replace(' ', ''), z_labels[i]])
    for i in range(len(z_labels))
]
# load the vocabulary
token_dict = load_vocabulary(BertConfig.dict_path)


# override the tokenizer (a quick check follows after this class)
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace-like characters with [unused1]
            else:
                R.append('[UNK]')  # characters not in the vocabulary map to [UNK]
        return R
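
A quick, hypothetical check of the overridden tokenizer above (not in the original snippet): whitespace characters become [unused1], out-of-vocabulary characters become [UNK], and tokenize() still adds [CLS] and [SEP].

# Illustrative only; the exact output depends on the contents of BertConfig.dict_path.
our_tokenizer = OurTokenizer(token_dict)
print(our_tokenizer.tokenize('今天 天气不错'))
# e.g. ['[CLS]', '今', '天', '[unused1]', '天', '气', '不', '错', '[SEP]']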
Example #22
 def test_load_vocabulary(self):
     current_path = os.path.dirname(os.path.abspath(__file__))
     vocab_path = os.path.join(current_path, 'test_checkpoint', 'vocab.txt')
     token_dict = load_vocabulary(vocab_path)
     self.assertEqual(15, len(token_dict))
Example #23
def train(dataset='weibo'):
    if dataset == 'weibo':
        pretrained_path = '/XXXX/corpus/chinese_L-12_H-768_A-12'  # for Chinese in weibo
    elif dataset == 'Twitter':
        pretrained_path = '/XXXX/corpus/uncased_L-12_H-768_A-12'  # for English in Twitter
    else:
        raise ValueError('ERROR! dataset must be weibo or Twitter!')
    config_path = '{}/bert_config.json'.format(pretrained_path)
    checkpoint_path = '{}/bert_model.ckpt'.format(pretrained_path)
    vocab_path = '{}/vocab.txt'.format(pretrained_path)
    token_dict = load_vocabulary(vocab_path)
    tokenizer = Tokenizer(token_dict)

    model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    model.summary(line_length=120)

    ############################################
    # get formated data from different dataset
    if dataset == 'weibo':
        matrix_save_dir = './weibo_dataset'
        train_t_m, train_i_m, train_l, train_el = get_weibo_matrix(
            'train', tokenizer)
        test_t_m, test_i_m, test_l, test_el = get_weibo_matrix(
            'test', tokenizer)
    else:
        matrix_save_dir = './Twitter_dataset'
        train_t_m, train_i_m, train_l = get_twitter_matrix('train', tokenizer)
        test_t_m, test_i_m, test_l = get_twitter_matrix('test', tokenizer)

    train_text_matrix = []
    for b in tqdm.tqdm(train_t_m):
        results = model.predict([b, np.array([0 for i in range(seq_len)])])[0]
        train_text_matrix.append(results)
    train_text_matrix = np.array(train_text_matrix)

    test_text_matrix = []
    for b in tqdm.tqdm(test_t_m):
        b = np.expand_dims(np.array(b), axis=0)
        results = model.predict([b, np.array([0 for i in range(seq_len)])])[0]
        test_text_matrix.append(results)
    test_text_matrix = np.array(test_text_matrix)

    train_t_m = np.array(train_t_m)
    train_i_m = np.array(train_i_m)
    train_l = np.array(train_l)
    test_t_m = np.array(test_t_m)
    test_i_m = np.array(test_i_m)
    test_l = np.array(test_l)
    print('4. train text:', train_t_m.shape)
    print('train text emb:', train_text_matrix.shape)
    print('train image emb:', train_i_m.shape)
    print('train label emb:', train_l.shape)
    print('5. test text:', test_t_m.shape)
    print('test text emb:', test_text_matrix.shape)
    print('test image emb:', test_i_m.shape)
    print('test labels emb:', test_l.shape)

    np.save('{}/train_text'.format(matrix_save_dir), train_t_m)
    np.save('{}/train_text_embed'.format(matrix_save_dir), train_text_matrix)
    np.save('{}/train_image_embed'.format(matrix_save_dir), train_i_m)
    np.save('{}/train_label'.format(matrix_save_dir), train_l)

    np.save('{}/test_text'.format(matrix_save_dir), test_t_m)
    np.save('{}/test_text_embed'.format(matrix_save_dir), test_text_matrix)
    np.save('{}/test_image_embed'.format(matrix_save_dir), test_i_m)
    np.save('{}/test_label'.format(matrix_save_dir), test_l)
Example #24
    def encode(self, query: Query, col_orders=None):
        tokens, tokens_lens = self.tokenize(query, col_orders)
        #['[CLS]', '我', '想', '你', '帮', '我', '查', '一', '下', '第', '四', '周', '大', '黄', '蜂', ',', '还', '有', '密', '室', '逃', '生', '这', '两', '部', '电', '影', '票', '房', '的', '占', '比', '加', '起', '来', '会', '是', '多', '少', '来', '着', '[SEP]', '[unused11]', '影', '片', '名', '称', '[SEP]', '[unused12]', '周', '票', '房', '[SEP]', '[unused12]', '票', '房', '占', '比', '[SEP]', '[unused12]', '场', '均', '人', '次', '[SEP]']
        # tokens_lens: [42, 6, 5, 6, 6] gives the length of the question span followed by the
        # length of each header span (see the worked example after this method)
        token_ids = self._convert_tokens_to_ids(
            tokens
        )  # map every token to its id in the vocabulary
        segment_ids = [0] * len(token_ids)  # all segment ids are 0; the list multiplication just builds a zero list of the same length
        #[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        header_indices = np.cumsum(
            tokens_lens
        )  # with no axis argument, np.cumsum treats the input as a flat 1-D array and returns running sums
        return token_ids, segment_ids, header_indices[:-1]
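
A worked example of the header_indices computation above, using the tokens_lens shown in the comment: the running sums mark the position where each header's marker token starts, and the final value (the total length) is dropped.

import numpy as np

tokens_lens = [42, 6, 5, 6, 6]        # question span plus four header spans
print(np.cumsum(tokens_lens))         # [42 48 53 59 65]
print(np.cumsum(tokens_lens)[:-1])    # [42 48 53 59]: start index of each header marker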


token_dict = load_vocabulary(paths.vocab)  # load the vocabulary here
query_tokenizer = QueryTokenizer(token_dict)

print('QueryTokenizer\n')
print('Input Question:\n{}\n'.format(sample_query.question))
print('Input Header:\n{}\n'.format(sample_query.table.header))
print('Output Tokens:\n{}\n'.format(' '.join(
    query_tokenizer.tokenize(sample_query)[0])))
print('Output token_ids:\n{}\nOutput segment_ids:\n{}\nOutput header_ids:\n{}'.
      format(*query_tokenizer.encode(sample_query)))


class SqlLabelEncoder:
    """
    Convert SQL object into training labels.
    """
Example #25
                   '--BERT_MODEL',
                   default=CONFIG.bert_model,
                   help='BERT model ckpt')
parse.add_argument('-v', '--VOCAB', default=CONFIG.vocab, help='BERT vocabulary')
parse.add_argument('-m',
                   '--MODEL_PATH',
                   default=CONFIG.bert_model,
                   help='model save path')
parse.add_argument('-lg',
                   '--LOG_PATH',
                   default=CONFIG.bert_model,
                   help='training log path')
args = parse.parse_args()

# load the BERT tokenizer
token_dict = load_vocabulary(vocab_path=args.VOCAB)
tokenizer = Tokenizer(token_dict=token_dict)


def data_padding(data, padding=0):
    """
    Pad every sequence to the length of the longest sequence.
    :param data: list of id sequences
    :param padding: value used for padding
    :return: padded numpy array
    """
    data_len = [len(d) for d in data]
    M_L = max(data_len)
    return np.array([
        np.concatenate([d, (M_L - len(d)) * [padding]]) if len(d) < M_L else d
        for d in data
Example #26
def pretrain_model(opt_filepath, data_dir, gpu_id):
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
    #gpus = tf.config.experimental.list_physical_devices('GPU')
    #tf.config.experimental.set_memory_growth(gpus[0], True)

    token_dict = load_vocabulary(dict_path)
    token_list = list(token_dict.keys())
    #if not os.path.exists(os.path.join(data_dir, 'pretrain_X.npy')):
    df = pd.read_csv(os.path.join(data_dir, 'task2_trainset.csv'), dtype=str)
    df_2 = pd.read_csv(os.path.join(data_dir, 'task2_public_testset.csv'),
                       dtype=str)
    abstract_1 = df.values[:, 2]
    abstract_2 = df_2.values[:, 2]
    tokenizer = Tokenizer(token_dict)
    X_1 = collect_inputs(abstract_1, tokenizer)
    X_2 = collect_inputs(abstract_2, tokenizer)
    X = np.array(X_1 + X_2)
    #    np.save(os.path.join(data_dir, 'pretrain_X.npy'), X)
    #else:
    #    X = np.load(os.path.join(data_dir, 'pretrain_X.npy'))
    print(X.shape)

    model = load_trained_model_from_checkpoint(config_path,
                                               checkpoint_path,
                                               training=True,
                                               trainable=get_layers_name(
                                                   range(12, 25)),
                                               seq_len=512)
    compile_model(model)

    def _generator(batch_size=4):
        while True:
            idx = np.random.permutation(X.shape[0])
            for i in range(0, idx.shape[0], batch_size):
                yield gen_batch_inputs(X[i:i + batch_size],
                                       token_dict,
                                       token_list,
                                       seq_len=512,
                                       mask_rate=0.3)

    checkpoint = ModelCheckpoint(opt_filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min',
                                 save_weights_only=True)

    trainable_layer = list(range(12 * 8, 19 * 8, 8))
    batch_size = [3] * 3 + [3] * 3
    for i, layer_i in enumerate(trainable_layer):
        for j, layer in enumerate(model.layers):
            if j >= layer_i:
                layer.trainable = True
                print(layer.name, layer.trainable)
            else:
                layer.trainable = False

        compile_model(model)
        if os.path.exists(opt_filepath):
            model.load_weights(opt_filepath)

        es = EarlyStopping(monitor='val_loss', patience=20)
        reduce_lr = ReduceLROnPlateau(factor=0.7,
                                      patience=4,
                                      verbose=1,
                                      min_lr=1e-6)
        callbacks_list = [checkpoint, es, reduce_lr]

        model.fit_generator(generator=_generator(batch_size[i]),
                            steps_per_epoch=500,
                            epochs=5000,
                            validation_data=_generator(),
                            validation_steps=200,
                            callbacks=callbacks_list)
Example #27
def main():
    args = get_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    np.random.seed(args.seed)

    if args.verbose:
        log.basicConfig(level=log.DEBUG, stream=sys.stdout)
    else:
        log.basicConfig(level=log.INFO, stream=sys.stdout)

    log.info('\n' + tabulate(sorted(vars(args).items())))
    set_logger(os.path.join(args.output_dir, args.log_file))

    pick_device()
    data = load_instances(args.dataset, args.label_col)
    classes = list(sorted(set(data[args.label_col])))
    args.n_classes = len(classes)

    token_dict = load_vocabulary(args.vocab_file)
    tokenizer = Tokenizer(token_dict)

    if args.do_train:
        folds = [i for i in args.train_dataset.split(',')]
        train_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        train_generator = TextDataFrameIterator(
            dataframe=train_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=True,
            seq_len=args.max_seq_length,
            seed=args.seed,
            do_lower_case=args.do_lower_case
        )

        folds = [i for i in args.val_dataset.split(',')]
        val_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        val_generator = TextDataFrameIterator(
            dataframe=val_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case
        )

        total_steps, warmup_steps = calc_train_steps(
            num_example=len(train_df),
            batch_size=args.batch_size,
            epochs=args.epochs,
            warmup_proportion=args.warmup_proportion,
        )

        model = get_model(args)
        earlystop = callbacks.EarlyStopping(
            monitor='val_loss', min_delta=K.epsilon(), patience=args.earlystop,
            verbose=1, mode='auto')
        best_checkpoint = callbacks.ModelCheckpoint(
            os.path.join(args.output_dir, args.best_model),
            save_best_only=True, save_weights_only=False,
            monitor='val_loss', mode='min', verbose=1)
        csv_logger = callbacks.CSVLogger(os.path.join(args.output_dir, args.csv_logger))

        callbacks_list = [earlystop, best_checkpoint, csv_logger]
        optimizer = AdamWarmup(
            decay_steps=total_steps,
            warmup_steps=warmup_steps,
            lr=args.learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            min_lr=1e-5,
            weight_decay=0.01,
            weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo']
        )
        model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

        cw = get_class_weights(data, args.label_col, train_generator.class_indices)
        model.fit_generator(
            train_generator,
            class_weight=cw,
            use_multiprocessing=False,
            workers=args.workers,
            callbacks=callbacks_list,
            epochs=args.epochs,
            validation_data=val_generator,
            verbose=1)

    if args.do_test:
        folds = [i for i in args.test_dataset.split(',')]
        test_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case
        )

        print('Load from', os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model), custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(
            test_generator,
            use_multiprocessing=False,
            workers=args.workers,
            verbose=1)

        y_pred = np.argmax(y_score, axis=1)

        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])

        y_true = test_df.loc[:, args.label_col].values
        y_pred = pred_df['predictions'].values
        report = pmetrics.classification_report(y_true, y_pred, classes=classes)
        print(report.summary())
        # print('auc', pmetrics.auc(y_true, y_score, y_column=1)[0])

        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.test_predictions), index=False)

    if args.do_predict:
        test_df = load_instances(args.pred_dataset, args.label_col)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=None,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case
        )

        print('Load from', os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model), custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(
            test_generator,
            use_multiprocessing=False,
            workers=args.workers,
            verbose=1)
        y_pred = np.argmax(y_score, axis=1)

        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])
        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.pred_predictions), index=False)

    if args.do_debug:
        for dataset in [args.train_dataset, args.val_dataset, args.test_dataset]:
            folds = [i for i in dataset.split(',')]
            print('folds:', folds)
            sub_df = data[data['fold'].isin(folds)]
            generator = TextDataFrameIterator(
                dataframe=sub_df,
                tokenizer=tokenizer,
                x_col=args.text_col,
                y_col=args.label_col,
                batch_size=args.batch_size,
                shuffle=False,
                seq_len=args.max_seq_length,
            )
            for i, ([tokens, _], labels) in enumerate(generator):
                print(tokens.shape, type(tokens), labels.shape, type(labels))
                if i == 2:
                    break
def construct_model(paths, use_multi_gpus=True):
    token_dict = load_vocabulary(paths.vocab)
    query_tokenizer = QueryTokenizer(token_dict)

    num_sel_agg = len(SQL.agg_sql_dict) + 1
    num_cond_op = len(SQL.op_sql_dict) + 1
    num_cond_conn_op = len(SQL.conn_sql_dict)

    bert_model = load_trained_model_from_checkpoint(paths.config,
                                                    paths.checkpoint,
                                                    seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    inp_token_ids = Input(shape=(None, ),
                          name='input_token_ids',
                          dtype='int32')
    inp_segment_ids = Input(shape=(None, ),
                            name='input_segment_ids',
                            dtype='int32')
    inp_header_ids = Input(shape=(None, ),
                           name='input_header_ids',
                           dtype='int32')
    inp_header_mask = Input(shape=(None, ), name='input_header_mask')

    x = bert_model([inp_token_ids, inp_segment_ids])  # (None, seq_len, 768)

    x_for_cond_conn_op = Lambda(lambda x: x[:, 0])(x)  # (None, 768)
    p_cond_conn_op = Dense(num_cond_conn_op,
                           activation='softmax',
                           name='output_cond_conn_op')(x_for_cond_conn_op)

    x_for_header = Lambda(seq_gather,
                          name='header_seq_gather')([x, inp_header_ids
                                                     ])  # (None, h_len, 768)
    header_mask = Lambda(lambda x: K.expand_dims(x, axis=-1))(
        inp_header_mask)  # (None, h_len, 1)

    x_for_header = Multiply()([x_for_header, header_mask])
    x_for_header = Masking()(x_for_header)

    p_sel_agg = Dense(num_sel_agg, activation='softmax',
                      name='output_sel_agg')(x_for_header)

    x_for_cond_op = Concatenate(axis=-1)([x_for_header, p_sel_agg])
    p_cond_op = Dense(num_cond_op, activation='softmax',
                      name='output_cond_op')(x_for_cond_op)

    label_encoder = SqlLabelEncoder()

    model = Model(
        [inp_token_ids, inp_segment_ids, inp_header_ids, inp_header_mask],
        [p_cond_conn_op, p_sel_agg, p_cond_op])

    NUM_GPUS = 2
    learning_rate = 1e-5

    if use_multi_gpus:
        print('using {} gpus'.format(NUM_GPUS))
        model = multi_gpu_model(model, gpus=NUM_GPUS)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=RAdam(lr=learning_rate))

    return model, query_tokenizer
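
Finally, a hedged single-example prediction sketch for the NL2SQL model above; it reuses query_tokenizer.encode from Example #24 and skips batching and padding, so it is illustrative rather than production code.

import numpy as np

# Hypothetical single-query prediction (padding/batching omitted).
model, query_tokenizer = construct_model(paths, use_multi_gpus=False)
token_ids, segment_ids, header_ids = query_tokenizer.encode(sample_query)
header_mask = [1] * len(header_ids)
p_cond_conn_op, p_sel_agg, p_cond_op = model.predict([
    np.array([token_ids]), np.array([segment_ids]),
    np.array([header_ids]), np.array([header_mask])])
print(p_cond_conn_op.shape, p_sel_agg.shape, p_cond_op.shape)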