Example #1
def prepare_pretrained():
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        BERT_BASE_DIR + '/bert_model.ckpt',
        BERT_BASE_DIR + '/bert_config.json', WORK_DIR + 'pytorch_model.bin')

    shutil.copyfile(BERT_BASE_DIR + '/bert_config.json',
                    WORK_DIR + 'bert_config.json')
Example #2
def setting(BERT_MODEL_PATH, WORK_DIR):
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        BERT_MODEL_PATH + 'bert_model.ckpt',
        BERT_MODEL_PATH + 'bert_config.json',
        WORK_DIR + 'pytorch_model.bin')

    shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')
Example #3
def convert_tf_to_pytorch(model_path, output_path):
    os.makedirs(output_path, exist_ok=True)
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        os.path.join(model_path, 'bert_model.ckpt'),
        os.path.join(model_path, 'bert_config.json'),
        os.path.join(output_path, 'pytorch_model.bin'))

    shutil.copyfile(os.path.join(model_path, 'bert_config.json'),
                    os.path.join(output_path, 'bert_config.json'))
def setup_bert_model(path_to_pretrained_model,
                     epochs,
                     lrate,
                     lrate_clf,
                     batch_size,
                     accum_steps,
                     lin_dim,
                     lin_dropout_prob,
                     warmup,
                     apex_mixed_precision,
                     seed,
                     device,
                     train_loader,
                     clf_class=BertForSequencePairClassification):
    """

    :param path_to_pretrained_model:     Path to a folder with pretrained BERT model
    :param epochs:                       ...
    :param lrate:                        ...
    :param lrate_clf:                    ...
    :param batch_size:                   ...
    :param accum_steps:                  ...
    :param lin_dim:
    :param lin_dropout_prob:
    :param warmup:                       Percent of iterations to perform warmup
    :param apex_mixed_precision:         Whether to use nvidia apex mixed-precision training
    :param seed:                         ...
    :param device:                       ...
    :param train_loader:                 ...
    :param clf_class:                    ...
    :return: model, optimizer            PyTorch model and optimizer
    """

    path_to_pretrained_model = Path(path_to_pretrained_model)

    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        str(path_to_pretrained_model / 'bert_model.ckpt'),
        str(path_to_pretrained_model / 'bert_config.json'),
        str(path_to_pretrained_model / 'pytorch_model.bin'))

    seed_everything(seed)

    model = clf_class.from_pretrained(path_to_pretrained_model,
                                      lin_dim=lin_dim,
                                      lin_dropout_prob=lin_dropout_prob,
                                      cache_dir=None,
                                      num_labels=1)
    model.zero_grad()

    model = model.to(device)

    return setup_bert_optimizer_for_model(model, epochs, lrate, lrate_clf,
                                          batch_size, accum_steps, warmup,
                                          apex_mixed_precision, train_loader)
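
# A hypothetical call, for illustration only: every value below is a placeholder
# (not taken from the original source), and `train_loader` is assumed to be an
# existing torch.utils.data.DataLoader over the training set (torch imported).
model, optimizer = setup_bert_model(
    path_to_pretrained_model='uncased_L-12_H-768_A-12/',
    epochs=2, lrate=2e-5, lrate_clf=1e-3,
    batch_size=32, accum_steps=2,
    lin_dim=768, lin_dropout_prob=0.1,
    warmup=0.05, apex_mixed_precision=True,
    seed=1234, device=torch.device('cuda'),
    train_loader=train_loader)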
Example #5
    def __call__(self, base_dir, output_dir):
        config_file = os.path.join(base_dir, 'bert_config.json')
        vocab_file = os.path.join(base_dir, 'vocab.txt')

        config_file_dest = os.path.join(output_dir, 'bert_config.json')
        vocab_file_dest = os.path.join(output_dir, 'vocab.txt')

        # Convert
        convert_tf_checkpoint_to_pytorch(
            os.path.join(base_dir, 'model.ckpt-1000000'), config_file,
            os.path.join(output_dir, 'pytorch_model.bin'))

        # copy config file
        copyfile(config_file, config_file_dest)

        # copy vocab file
        copyfile(vocab_file, vocab_file_dest)
num_to_load = 1700000  #Train size to match time limit
valid_size = 100000  #Validation Size
TOXICITY_COLUMN = 'target'

# Add the BERT PyTorch repo to the PATH
# using files from: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam

# Convert the model from TensorFlow to PyTorch
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt', BERT_MODEL_PATH + 'bert_config.json',
    WORK_DIR + 'pytorch_model.bin')

shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json',
                WORK_DIR + 'bert_config.json')

os.listdir("../working")

# This is the BERT configuration file
from pytorch_pretrained_bert import BertConfig

bert_config = BertConfig(
    '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
    + 'bert_config.json')
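
# A minimal follow-up sketch (an assumption, not part of the original kernel):
# pytorch_pretrained_bert's from_pretrained() looks for pytorch_model.bin and
# bert_config.json in the directory it is given, which is exactly what was
# written and copied into WORK_DIR above.
model = BertForSequenceClassification.from_pretrained(WORK_DIR, cache_dir=None, num_labels=1)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)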

Example #7
 def train(self):
     if self.debug_mode: self.epochs = 1
     # load the dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # training
     self.seed_everything()
     lr = 2e-5
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # convert the pre-trained TF BERT checkpoint to PyTorch, if not done yet
     if not os.path.exists(self.bert_model_path + "pytorch_model.bin"):
         convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
             self.bert_model_path + 'bert_model.ckpt',
             self.bert_model_path + 'bert_config.json',
             self.bert_model_path + 'pytorch_model.bin')
     # load the pre-trained model
     model = BertNeuralNet.from_pretrained(self.bert_model_path,
                                           cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # use different weight_decay for different parameter groups
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [{
         'params': [
             p for n, p in param_optimizer
             if not any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         0.01
     }, {
         'params':
         [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay':
         0.0
     }]
     num_train_optimization_steps = int(self.epochs * self.train_len /
                                        self.base_batch_size /
                                        accumulation_steps)
     optimizer = BertAdam(optimizer_grouped_parameters,
                          lr=lr,
                          warmup=0.05,
                          t_total=num_train_optimization_steps)
     # learning-rate schedule
     #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
     model, optimizer = amp.initialize(model,
                                       optimizer,
                                       opt_level="O1",
                                       verbosity=0)
     # start training
     for epoch in range(self.epochs):
         start_time = time.time()
         model.train()
         optimizer.zero_grad()
         # load each batch and train
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             y_pred = model(x_batch.to(self.device),
                            attention_mask=(x_batch > 0).to(self.device),
                            labels=None)
             target_loss, aux_loss, identity_loss = self.custom_loss(
                 y_pred, y_batch, epoch, target_weight_batch,
                 aux_weight_batch, identity_weight_batch)
             loss = target_loss + aux_loss + identity_loss
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
         # evaluate on the validation set
         model.eval()
         y_pred = np.zeros((len(self.train_df) - self.train_len))
         for i, batch_data in enumerate(valid_loader):
             x_batch = batch_data[0]
             batch_y_pred = self.sigmoid(
                 model(x_batch.to(self.device),
                       attention_mask=(x_batch > 0).to(self.device),
                       labels=None).detach().cpu().numpy())[:, 0]
             y_pred[i * self.base_batch_size:(i + 1) *
                    self.base_batch_size] = batch_y_pred
         # compute the score
         auc_score = self.evaluator.get_final_metric(y_pred)
         print("epoch: %d duration: %d min auc_score: %.4f" %
               (epoch, int((time.time() - start_time) / 60), auc_score))
         if not self.debug_mode:
             state_dict = model.state_dict()
             torch.save(
                 state_dict,
                 os.path.join(
                     self.data_dir, "model/model_%d_%s_%d_%.5f" %
                     (self.seed, self.model_name, epoch, auc_score)))
     # delete training-related inputs and the model
     training_history = [
         train_loader, valid_loader, model, optimizer, param_optimizer,
         optimizer_grouped_parameters
     ]
     for variable in training_history:
         del variable
     gc.collect()
Example #8
 def __init__(self,
              data_dir,
              model_name,
              epochs=4,
              batch_size=64,
              part=1.,
              seed=1234,
              debug_mode=False):
     self.device = torch.device('cuda')
     self.data_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
     self.bert_model_path = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
     self.input_dir = "../input"
     self.word_dir = "../working/"
     self.debug_mode = debug_mode
     self.model_name = model_name
     self.seed = seed
     self.identity_list = [
         'male', 'female', 'homosexual_gay_or_lesbian', 'christian',
         'jewish', 'muslim', 'black', 'white',
         'psychiatric_or_mental_illness'
     ]
     self.toxicity_type_list = [
         'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
     ]
     self.weight_dict = {
         "severe_toxicity": 1000,
         "obscene": 234,
         "identity_attack": 235,
         "insult": 21,
         "threat": 645,
         "male": 44,
         "female": 34,
         "homosexual_gay_or_lesbian": 175,
         "christian": 49,
         "jewish": 248,
         "muslim": 90,
         "black": 129,
         "white": 74,
         "psychiatric_or_mental_illness": 441,
         "np": 12,
         "pn": 15
     }
     self.stopwords = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'
     self.seed_everything()
     self.max_len = 220
     self.epochs = epochs
     self.base_batch_size = 32
     self.batch_size = batch_size
     self.split_ratio = 0.95
     self.sample_num = 1804874
     if not self.debug_mode:
         self.train_df = pd.read_csv(
             os.path.join(self.data_dir,
                          "train.csv")).sample(int(self.sample_num * part),
                                               random_state=1234).fillna(0.)
         self.test_df = pd.read_csv(os.path.join(self.data_dir, "test.csv"))
     else:
         self.train_df = pd.read_csv(
             os.path.join(self.data_dir, "train.csv")).head(1000).fillna(0.)
         self.test_df = pd.read_csv(os.path.join(self.data_dir,
                                                 "test.csv")).head(1000)
     self.train_len = int(len(self.train_df) * self.split_ratio)
     self.evaluator = self.init_evaluator()
     self.bert_config = BertConfig(
         os.path.join(self.data_dir,
                      "uncased_L-12_H-768_A-12/bert_config.json"))
     convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
         self.bert_model_path + 'bert_model.ckpt',
         self.bert_model_path + 'bert_config.json',
         self.word_dir + 'pytorch_model.bin')
     shutil.copyfile(self.bert_model_path + 'bert_config.json',
                     self.word_dir + 'bert_config.json')
cache_dir = '/home/dpappas/bert_cache/'
if (not os.path.exists(cache_dir)):
    os.makedirs(cache_dir)

tokenizer = BertTokenizer.from_pretrained(
    # pretrained_model_name='bert-large-uncased',
    pretrained_model_name='/home/dpappas/bert_cache/bert-large-uncased-vocab.txt',
    cache_dir=cache_dir)

(test_data, test_docs, dev_data, dev_docs, train_data, train_docs,
 bioasq6_data) = load_all_data(dataloc=dataloc)

if (not os.path.exists(os.path.join(init_checkpoint_pt, 'pytorch_model.bin'))):
    convert_tf_checkpoint_to_pytorch(
        os.path.join(init_checkpoint_pt, 'bert_model.ckpt'),
        os.path.join(init_checkpoint_pt, 'bert_config.json'),
        os.path.join(init_checkpoint_pt, 'pytorch_model.bin'))

elmo = Elmo(options_file, weight_file, 1, dropout=0)
model = BertModel.from_pretrained(init_checkpoint_pt, cache_dir=cache_dir)
# model               = model.cuda()
model.eval()

nof_threads = 16

#######################################################

# test_data['queries']    = Parallel(n_jobs=nof_threads, verbose=0, backend="threading")(map(delayed(work), tqdm(test_data['queries'])))
# with open(dataloc + 'bioasq_bm25_top100_bert_elmo.test.pkl', 'wb') as f:
#     pickle.dump(test_data, f)
#     f.close()
Example #10
 def train(self):
     if self.debug_mode: self.epochs = 1
     # load the dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # training
     self.seed_everything()
     lr = 7e-6
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # convert the pre-trained TF BERT checkpoint to PyTorch, if not done yet
     if not os.path.exists(self.bert_model_path + "pytorch_model.bin"):
         convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
             self.bert_model_path + 'bert_model.ckpt',
             self.bert_model_path + 'bert_config.json',
             self.bert_model_path + 'pytorch_model.bin')
     # load the pre-trained model
     model = BertNeuralNet.from_pretrained(self.bert_model_path,
                                           cache_dir=None)
     #model.load_state_dict(torch.load("/root/nb/data/model/model[bert][1234][2][17][train2_simple_target][0.9419].bin"))
     model.load_state_dict(
         torch.load(
             "/root/nb/data/model/model[bert][1234][1][20][train2_simple_target][0.9395].bin"
         ))
     model.zero_grad()
     model = model.to(self.device)
     # use different weight_decay for different parameter groups
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [{
         'params': [
             p for n, p in param_optimizer
             if not any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         0.01
     }, {
         'params':
         [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay':
         0.0
     }]
     epoch_steps = int(self.train_len / self.base_batch_size /
                       accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     valid_every = math.floor(epoch_steps / 10)
     optimizer = BertAdam(optimizer_grouped_parameters,
                          lr=lr,
                          warmup=0.05,
                          t_total=num_train_optimization_steps)
     # learning-rate schedule
     #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
     model, optimizer = amp.initialize(model,
                                       optimizer,
                                       opt_level="O1",
                                       verbosity=0)
     # start training
     for epoch in range(self.epochs):
         train_start_time = time.time()
         model.train()
         optimizer.zero_grad()
         # load each batch and train
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             x_mask = batch_data[5]
             y_pred = model(x_batch, attention_mask=x_mask, labels=None)
             target_loss = self.custom_loss(y_pred, y_batch, epoch,
                                            target_weight_batch,
                                            aux_weight_batch,
                                            identity_weight_batch)
             loss = target_loss
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
             # validation
             if (i + 1) % valid_every == 0:
                 valid_start_time = time.time()
                 model.eval()
                 y_pred = np.zeros((len(self.train_df) - self.train_len))
                 for j, valid_batch_data in enumerate(valid_loader):
                     x_batch = valid_batch_data[0]
                     x_mask = valid_batch_data[2]
                     batch_y_pred = self.sigmoid(
                         model(x_batch, attention_mask=x_mask,
                               labels=None).detach().cpu().numpy())[:, 0]
                     y_pred[j * self.base_batch_size:(j + 1) *
                            self.base_batch_size] = batch_y_pred
                 # compute the score
                 auc_score = self.evaluator.get_final_metric(y_pred)
                 print(
                     "epoch: %d duration: %d min auc_score: %.4f" %
                     (epoch, int(
                         (time.time() - train_start_time) / 60), auc_score))
                 if not self.debug_mode:
                     state_dict = model.state_dict()
                     stage = int((i + 1) / valid_every)
                     train_duration = int(
                         (time.time() - train_start_time) / 60)
                     valid_duration = int(
                         (time.time() - valid_start_time) / 60)
                     if epoch == 0 and stage == 1:
                         # model[bert][seed][epoch][stage][model_name][stage_train_duration][valid_duration][score].bin
                         model_name = "model2/model_bert_%d_%d_%d_%s_%dmin_%dmin_%.4f.bin" % (
                             self.seed, epoch + 1, stage, self.model_name,
                             train_duration, valid_duration, auc_score)
                     else:
                         # model[bert][seed][epoch][stage][model_name][score].bin
                         model_name = "model2/model_bert_%d_%d_%d_%s_%.4f.bin" % (
                             self.seed, epoch + 1, stage, self.model_name,
                             auc_score)
                     torch.save(state_dict,
                                os.path.join(self.data_dir, model_name))
                 model.train()
     # delete training-related inputs and the model
     training_history = [
         train_loader, valid_loader, model, optimizer, param_optimizer,
         optimizer_grouped_parameters
     ]
     for variable in training_history:
         del variable
     gc.collect()
Example #11
import os
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch

model_path = "/root/nb/data/nl2sql_data/chinese_L-12_H-768_A-12/"

if not os.path.exists(model_path + "pytorch_model.bin"):
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        model_path + 'bert_model.ckpt', model_path + 'bert_config.json',
        model_path + 'pytorch_model.bin')
"""
get /root/nb/data/nl2sql_data/chinese_L-12_H-768_A-12/pytorch_model.bin /Users/hedongfeng/Desktop/
put /Users/hedongfeng/PycharmProjects/unintended_bias/data/nl2sql_data.zip  /root/nb/data/
"""
MAX_SEQUENCE_LENGTH = 300
SEED = 1234
EPOCHS = 10
data_dir = './data/text_data.csv'
working_dir = './'
valid_size = 100000
num_to_load = 100000
target_column = 'target'

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam

BERT_MODEL_PATH = './bert_model/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt', BERT_MODEL_PATH + 'bert_config.json',
    working_dir + 'pytorch_model.bin')

shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json',
                working_dir + 'bert_config.json')

# BERT config file
from pytorch_pretrained_bert import BertConfig

bert_config = BertConfig(
    '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
    + 'bert_config.json')


# convert text to BERT format
def convert_lines(example, max_seq_length, tokenizer):
Example #13
import torch
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

BERT_MODEL_PATH = '../models/rubert_cased_deeppavlov/'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=False)

convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    BERT_MODEL_PATH + 'bert_config.json',
    '../models/rubert_cased_torch/pytorch_model.bin')
import os
from pytorch_pretrained_bert.convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

# in_dir          = '/media/dpappas/dpappas_data/biobert/pubmed_pmc_470k/'
in_dir          = '/media/dpappas/dpappas_data/biobert/biobert_pubmed/'
out_bin_path    = os.path.join(in_dir, 'pytorch_model.bin')

if (not os.path.exists(out_bin_path)):
    convert_tf_checkpoint_to_pytorch(os.path.join(in_dir, 'biobert_model.ckpt'), os.path.join(in_dir, 'bert_config.json'), out_bin_path)


Example #15
BERT_MODEL_PATH = Path('uncased_L-12_H-768_A-12/')
#BERT_MODEL_PATH = Path('/content/BERT')

!pip install pytorch-pretrained-bert

# Add the BERT PyTorch repo to the PATH, using files from: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig

# Convert the model from TensorFlow to PyTorch
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    str(BERT_MODEL_PATH / 'bert_model.ckpt'),
    str(BERT_MODEL_PATH / 'bert_config.json'),
    str(WORK_DIR / 'pytorch_model.bin')
)

shutil.copyfile(BERT_MODEL_PATH / 'bert_config.json', WORK_DIR / 'bert_config.json')
bert_config = BertConfig(str(BERT_MODEL_PATH / 'bert_config.json'))

# Converting the lines to BERT format
def convert_lines(example, max_seq_length, tokenizer):
    max_seq_length -= 2
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
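            # NOTE: the example is truncated at this point; what follows is a
            # plausible completion modelled on the widely shared Jigsaw BERT
            # kernels (an assumption, not the original source; numpy is assumed
            # to be imported as np).
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + tokens_a + ["[SEP]"]) + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)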
Example #16
#encoding:utf-8
import os
from pybert.config.basic_config import configs as config
from pytorch_pretrained_bert.convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

if __name__ == "__main__":
    os.system('cp {config} {save_path}'.format(config = config['pretrained']['bert']['bert_config_file'],
                                               save_path =config['pretrained']['bert']['bert_model_dir']))
    convert_tf_checkpoint_to_pytorch(config['pretrained']['bert']['tf_checkpoint_path'],
                                     config['pretrained']['bert']['bert_config_file'],
                                     config['pretrained']['bert']['pytorch_model_path'])




Example #17
    tokenizer = BertTokenizer.from_pretrained(
        args['bert_model'], do_lower_case=args['do_lower_case'])

    train_examples = None
    num_train_steps = None

    if args['do_train']:
        train_examples = processor.get_train_examples(args['full_data_dir'],
                                                      size=args['train_size'])
        #     train_examples = processor.get_train_examples(args['data_dir'], size=args['train_size'])
        num_train_steps = int(
            len(train_examples) / args['train_batch_size'] /
            args['gradient_accumulation_steps'] * args['num_train_epochs'])

    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        '/scratch/cc5048/bert/uncased_L-12_H-768_A-12/bert_model.ckpt',
        '/scratch/cc5048/bert/uncased_L-12_H-768_A-12/bert_config.json',
        '/scratch/cc5048/bert/uncased_L-12_H-768_A-12/pytorch_model.bin')

    # Prepare model
    def get_model():
        #     pdb.set_trace()
        if model_state_dict:
            model = BertForMultiLabelSequenceClassification.from_pretrained(
                args['bert_model'],
                num_labels=num_labels,
                state_dict=model_state_dict)
        else:
            model = BertForMultiLabelSequenceClassification.from_pretrained(
                args['bert_model'], num_labels=num_labels)
        return model

    model = get_model()
import torch
import os, sys
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

if sys.argv[2] == 'cased':
    VOCAB = './bert_cased/vocab.txt'
    MODEL = './bert_cased'
else:
    VOCAB = './bert_uncased/vocab.txt'
    MODEL = './bert_uncased'

# convert the TF checkpoint to pytorch_model.bin if it does not exist yet
if not os.path.exists(MODEL + '/pytorch_model.bin'):
    from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        MODEL + '/bert_model.ckpt', MODEL + '/bert_config.json',
        MODEL + '/pytorch_model.bin')

#load data
train_data = pd.read_csv('./project2_data/olid-training-v1.0.tsv',
                         sep='\t',
                         index_col='id')
if sys.argv[1] == 'A':
    test = pd.read_csv('./project2_data/testset-levela.tsv',
                       sep='\t',
                       index_col='id')
    train = pd.DataFrame({
        'tweet': train_data['tweet'],
        'label': train_data['subtask_a']
    })
elif sys.argv[1] == 'B':
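    # NOTE: the example is cut off here; a hypothetical continuation for subtask B,
    # mirroring the subtask-A branch above (file name and label column are assumptions):
    test = pd.read_csv('./project2_data/testset-levelb.tsv',
                       sep='\t',
                       index_col='id')
    train = pd.DataFrame({
        'tweet': train_data['tweet'],
        'label': train_data['subtask_b']
    }).dropna(subset=['label'])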
Example #19
 def train(self):
     if self.debug_mode: self.epochs = 1
     # load the dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # training
     self.seed_everything()
     lr = 2e-5
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # convert the pre-trained TF BERT checkpoint to PyTorch, if not done yet
     if not os.path.exists(self.work_dir + 'pytorch_model.bin'):
         print("Convert pre-trained model")
         convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
             self.bert_model_path + 'bert_model.ckpt',
             self.bert_model_path + 'bert_config.json',
             self.work_dir + 'pytorch_model.bin')
     shutil.copyfile(self.bert_model_path + 'bert_config.json', self.work_dir + 'bert_config.json')
     # load the pre-trained model
     print("Load pre-trained model")
     model = BertNeuralNet.from_pretrained(self.work_dir, cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # use different weight_decay for different parameter groups
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     epoch_steps = int(self.train_len * 0.5 / self.base_batch_size / accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     valid_every = math.floor(epoch_steps * accumulation_steps / 5)
     optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps)
     # learning-rate schedule
     #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
     model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
     # start training
     print("Train")
     best_auc_score_1 = 0
     best_auc_score_2 = 0
     best_auc_score_3 = 0
     best_auc_score_4 = 0
     f_log = open("train_log.txt", "w")
     for epoch in range(self.epochs):
         model.train()
         optimizer.zero_grad()
         # load each batch and train
         train_start_time = time.time()
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             np_weight_batch = batch_data[5]
             np_identity_weight_batch = batch_data[6]
             y_pred = model(x_batch.to(self.device), attention_mask=(x_batch > 0).to(self.device), labels=None)
             target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch, identity_weight_batch, np_weight_batch)
             loss = target_loss + aux_loss + identity_loss + np_loss
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
             # validation
             if (i + 1) % valid_every == 0:
                 model.eval()
                 stage = int((i + 1) / valid_every)
                 train_stage_duration = int((time.time() - train_start_time) / 60)
                 valid_start_time = time.time()
                 y_pred = np.zeros((len(self.train_df) - self.train_len))
                 for j, valid_batch_data in enumerate(valid_loader):
                     x_batch = valid_batch_data[0]
                     batch_y_pred = self.sigmoid(model(x_batch.to(self.device), attention_mask=(x_batch > 0).to(self.device), labels=None).detach().cpu().numpy())[:, 0]
                     y_pred[j * self.base_batch_size: (j + 1) * self.base_batch_size] = batch_y_pred
                 # compute the score
                 auc_score = self.evaluator.get_final_metric(y_pred)
                 valid_duration = int((time.time() - valid_start_time) / 60)
                 train_start_time = time.time()
                 f_log.write("epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n" % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                 print("epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f" % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                 if auc_score > best_auc_score_4:
                     state_dict = model.state_dict()
                     if auc_score > best_auc_score_1:
                         best_auc_score_1 = auc_score
                         torch.save(state_dict, "model1.bin")
                     elif auc_score > best_auc_score_2:
                         best_auc_score_2 = auc_score
                         torch.save(state_dict, "model2.bin")
                     elif auc_score > best_auc_score_3:
                         best_auc_score_3 = auc_score
                         torch.save(state_dict, "model3.bin")
                     else:
                         best_auc_score_4 = auc_score
                         torch.save(state_dict, "model4.bin")
                     with open("model_score.txt", "w") as f:
                         f.write("model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4))
                     print("model1: %.4f model2: %.4f model3: %.4f model4: %.4f" % (best_auc_score_1, best_auc_score_2, best_auc_score_3, best_auc_score_4))
                 model.train()
         if self.last is True:
             state_dict = model.state_dict()
             torch.save(state_dict, "model_last.bin")
     # delete training-related inputs and the model
     training_history = [train_loader, valid_loader, model, optimizer, param_optimizer, optimizer_grouped_parameters]
     for variable in training_history:
         del variable
     gc.collect()
#!/usr/bin/python
# coding: utf-8

import shutil

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch

BERT_MODEL_PATH = "models/chinese_L-12_H-768_A-12/"

if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
        BERT_MODEL_PATH + "bert_model.ckpt",
        BERT_MODEL_PATH + "bert_config.json",
        "models/pytorch_pretrain/pytorch_model.bin",
    )
    shutil.copyfile(
        BERT_MODEL_PATH + "bert_config.json",
        "models/pytorch_pretrain/bert_config.json",
    )
    shutil.copyfile(BERT_MODEL_PATH + "vocab.txt",
                    "models/pytorch_pretrain/vocab.txt")