def train_bert(config: PipeLineConfig):
    logging.basicConfig(level=logging.INFO)

    logging.info("Reading data...")
    input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(input_folder, "train.csv"))

    logging.info("Tokenizing...")

    with multiprocessing.Pool(processes=32) as pool:
        text_list = train.comment_text.tolist()
        sequences = pool.map(convert_line_uncased, text_list)

    logging.info("Building ttensors for training...")
    sequences = np.array(sequences)
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    logging.info("Bulding target tesnor...")
    iden = train[IDENTITY_COLUMNS].fillna(0).values
    subgroup_target = np.hstack(
        [
            (iden >= 0.5).any(axis=1, keepdims=True).astype(int),
            iden,
            iden.max(axis=1, keepdims=True),
        ]
    )
    sub_target_weights = (
        ~train[IDENTITY_COLUMNS].isna().values.any(axis=1, keepdims=True)
    ).astype(int)

    weights = np.ones(len(train))
    weights += (iden >= 0.5).any(1)
    weights += (train["target"].values >= 0.5) & (iden < 0.5).any(1)
    weights += (train["target"].values < 0.5) & (iden >= 0.5).any(1)
    weights /= weights.mean()

    y_aux_train = train[AUX_TARGETS]
    y_train_torch = torch.tensor(
        np.hstack(
            [
                train.target.values[:, None],
                weights[:, None],
                y_aux_train,
                subgroup_target,
                sub_target_weights,
            ]
        )
    ).float()

    perfect_output = torch.tensor(
        np.hstack([train.target.values[:, None], y_aux_train, subgroup_target])
    ).float()

    logging.info("Seeding with seed %d ...", config.seed)
    seed_everything(config.seed)

    logging.info("Creating dataset...")
    dataset = data.TensorDataset(
        torch.from_numpy(sequences).long(), y_train_torch, torch.from_numpy(lengths)
    )
    train_loader = data.DataLoader(
        dataset, batch_size=BATCH_SIZE, collate_fn=clip_to_max_len, shuffle=True
    )

    logging.info("Creating a model...")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=18
    )
    model.zero_grad()
    model = model.cuda()
    model.classifier.bias = nn.Parameter(perfect_bias(perfect_output.mean(0)).cuda())

    logs_file = f"./tb_logs/final_{config.expname}"
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if should_decay(n)],
            "weight_decay": config.decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if not should_decay(n)],
            "weight_decay": 0.00,
        },
    ]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.lr,
        warmup=config.warmup,
        t_total=config.epochs * len(train_loader) // ACCUM_STEPS,
    )

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    model = model.train()

    writer = SummaryWriter(logs_file)
    agg = TensorboardAggregator(writer)
    custom_loss = prepare_loss(config)

    for _ in range(config.epochs):
        for j, (X, y) in enumerate(train_loader):

            X = X.cuda()
            y = y.cuda()

            y_pred = model(X, attention_mask=(X > 0))
            loss = custom_loss(y_pred, y)

            accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean()
            agg.log({"train_loss": loss.item(), "train_accuracy": accuracy.item()})

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (j + 1) % ACCUM_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

    torch.save(model.state_dict(), f"./models/final-pipe2-{config.expname}.bin")
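For context, a minimal sketch of how train_bert above could be driven, assuming PipeLineConfig is a plain dataclass. The field names are exactly the ones the function reads (expname, seed, lr, warmup, decay, epochs); the values are illustrative only, and prepare_loss(config) may read additional fields not shown here.

from dataclasses import dataclass

@dataclass
class PipeLineConfig:  # assumed shape: only the fields train_bert reads above
    expname: str
    seed: int
    lr: float
    warmup: float
    decay: float
    epochs: int

# illustrative values, not the author's settings
train_bert(PipeLineConfig(expname="baseline", seed=1234, lr=2e-5,
                          warmup=0.05, decay=0.01, epochs=2))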
Example #2
 def train(self):
     if self.debug_mode: self.epochs = 1
     # load the dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # training
     self.seed_everything()
     lr = 2e-5
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # convert the pre-trained TensorFlow BERT checkpoint to PyTorch
     if not os.path.exists(self.work_dir + 'pytorch_model.bin'):
         print("Convert pre-trained model")
         convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
             self.bert_model_path + 'bert_model.ckpt',
             self.bert_model_path + 'bert_config.json',
             self.work_dir + 'pytorch_model.bin')
     shutil.copyfile(self.bert_model_path + 'bert_config.json',
                     self.work_dir + 'bert_config.json')
     # load the pre-trained model
     print("Load pre-trained model")
     model = BertNeuralNet.from_pretrained(self.work_dir, cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # apply different weight_decay to different parameter groups
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {
             'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01
         },
         {
             'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0
         },
     ]
     epoch_steps = int(self.train_len * 0.5 / self.base_batch_size /
                       accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     valid_every = math.floor(epoch_steps * accumulation_steps / 5)
     optimizer = BertAdam(optimizer_grouped_parameters,
                          lr=lr,
                          warmup=0.05,
                          t_total=num_train_optimization_steps)
     # gradually decay the learning rate
     #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
     model, optimizer = amp.initialize(model,
                                       optimizer,
                                       opt_level="O1",
                                       verbosity=0)
     # start training
     print("Train")
     best_auc_score_1 = 0
     best_auc_score_2 = 0
     best_auc_score_3 = 0
     best_auc_score_4 = 0
     f_log = open("train_log.txt", "w")
     for epoch in range(self.epochs):
         model.train()
         optimizer.zero_grad()
         # load each batch and train
         train_start_time = time.time()
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             np_weight_batch = batch_data[5]
             np_identity_weight_batch = batch_data[6]
             y_pred = model(x_batch.to(self.device),
                            attention_mask=(x_batch > 0).to(self.device),
                            labels=None)
             target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                 y_pred, y_batch, epoch, target_weight_batch,
                 aux_weight_batch, identity_weight_batch, np_weight_batch)
             loss = target_loss + aux_loss + identity_loss + np_loss
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
             # validation
             if (i + 1) % valid_every == 0:
                 model.eval()
                 stage = int((i + 1) / valid_every)
                 train_stage_duration = int(
                     (time.time() - train_start_time) / 60)
                 valid_start_time = time.time()
                 y_pred = np.zeros((len(self.train_df) - self.train_len))
                 for j, valid_batch_data in enumerate(valid_loader):
                     x_batch = valid_batch_data[0]
                     batch_y_pred = self.sigmoid(
                         model(x_batch.to(self.device),
                               attention_mask=(x_batch > 0).to(self.device),
                               labels=None).detach().cpu().numpy())[:, 0]
                     y_pred[j * self.base_batch_size:(j + 1) *
                            self.base_batch_size] = batch_y_pred
                 # compute the score
                 auc_score = self.evaluator.get_final_metric(y_pred)
                 valid_duration = int((time.time() - valid_start_time) / 60)
                 train_start_time = time.time()
                 f_log.write(
                     "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                     % (epoch, stage, train_stage_duration, valid_duration,
                        auc_score))
                 print(
                     "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                     % (epoch, stage, train_stage_duration, valid_duration,
                        auc_score))
                 if auc_score > best_auc_score_4:
                     state_dict = model.state_dict()
                     if auc_score > best_auc_score_1:
                         best_auc_score_1 = auc_score
                         torch.save(state_dict, "model1.bin")
                     elif auc_score > best_auc_score_2:
                         best_auc_score_2 = auc_score
                         torch.save(state_dict, "model2.bin")
                     elif auc_score > best_auc_score_3:
                         best_auc_score_3 = auc_score
                         torch.save(state_dict, "model3.bin")
                     else:
                         best_auc_score_4 = auc_score
                         torch.save(state_dict, "model4.bin")
                     with open("model_score.txt", "w") as f:
                         f.write(
                             "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                             % (best_auc_score_1, best_auc_score_2,
                                best_auc_score_3, best_auc_score_4))
                     print(
                         "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                         % (best_auc_score_1, best_auc_score_2,
                            best_auc_score_3, best_auc_score_4))
                 model.train()
         if self.last:
             state_dict = model.state_dict()
             torch.save(state_dict, "model_last.bin")
     # free the training inputs and the model
     del train_loader, valid_loader, model, optimizer
     del param_optimizer, optimizer_grouped_parameters
     gc.collect()
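A note on the accumulation_steps computed above: it is the number of micro-batches whose gradients are summed before a single optimizer step, so the effective batch size is base_batch_size * accumulation_steps. A minimal standalone sketch of the same arithmetic (all values illustrative, not taken from this project):

import math

target_batch_size = 64  # assumed effective batch size per parameter update
base_batch_size = 8     # assumed micro-batch size that fits in GPU memory
accumulation_steps = math.ceil(target_batch_size / base_batch_size)  # -> 8
# the training loop then calls optimizer.step() and optimizer.zero_grad()
# only every accumulation_steps batches, exactly as in the loop above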
Example #3
File: train1.py  Project: Dongfeng-He/nb
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
model = model.train()

tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None
    tk0 = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i, (x_batch, y_batch) in tk0:
        # optimizer.zero_grad()
        y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
        loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i + 1) % accumulation_steps == 0:   # Wait for several backward steps
            optimizer.step()                    # Now we can do an optimizer step
            optimizer.zero_grad()
        # exponentially smoothed loss for the progress bar
        if lossf:
            lossf = 0.98 * lossf + 0.02 * loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss=lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(
            ((torch.sigmoid(y_pred[:, 0]) > 0.5) == (y_batch[:, 0] > 0.5).to(device)).to(torch.float)
        ).item() / len(train_loader)
    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)


torch.save(model.state_dict(), output_model_file)
Example #4
        #     break

        batch = tuple(t.to(device) for t in batch)
        X, S, X_MASK, X_SEG = batch
        pred_s = subject_model(X, X_SEG, X_MASK)

        active_loss = X_MASK.view(-1) == 1
        loss = loss_func(
            pred_s.view(-1, num_class)[active_loss],
            S.view(-1)[active_loss])
        if n_gpu > 1:
            loss = loss.mean()

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        tr_total_loss += loss.item()
        if batch_idx % 100 == 0:
            logger.info(
                f'Epoch:{epoch} - batch:{batch_idx}/{train_D.steps} - loss: {tr_total_loss / batch_idx:.8f}'
            )

    subject_model.eval()
    A, B, C = 1e-10, 1e-10, 1e-10
    err_dict = defaultdict(list)
    cat_dict = defaultdict(lambda: 1e-10)
    for eval_idx, d in enumerate(dev_data):
        tt, ll = d
        R = extract_items(tt)
Example #5
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(opt)

        if state_dict:
            new_state = set(self.network.state_dict().keys())
            for k in list(state_dict['state'].keys()):
                if k not in new_state:
                    del state_dict['state'][k]
            for k, v in list(self.network.state_dict().items()):
                if k not in state_dict['state']:
                    state_dict['state'][k] = v
            self.network.load_state_dict(state_dict['state'])
        self.mnetwork = nn.DataParallel(
            self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])

        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {
                'params': [p for n, p in self.network.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in self.network.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            },
        ]
        # note that Adamax is modified based on the BERT code
        if opt['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters,
                                       opt['learning_rate'],
                                       weight_decay=opt['weight_decay'])

        elif opt['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    opt['learning_rate'],
                                    warmup=opt['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=opt['grad_clipping'],
                                    schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        elif opt['optimizer'] == 'adadelta':
            self.optimizer = optim.Adadelta(optimizer_parameters,
                                            opt['learning_rate'],
                                            rho=0.95)
        elif opt['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=opt['learning_rate'],
                                  warmup=opt['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=opt['grad_clipping'],
                                  schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if opt.get('have_lr_scheduler', False):
            if opt.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode='max',
                                                   factor=opt['lr_gamma'],
                                                   patience=3)
            elif opt.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=opt.get('lr_gamma', 0.95))
            else:
                milestones = [
                    int(step)
                    for step in opt.get('multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=opt.get('lr_gamma'))
        else:
            self.scheduler = None
        self.ema = None
        if opt['ema_opt'] > 0:
            self.ema = EMA(self.config['ema_gamma'], self.network)
        self.para_swapped = False

    def setup_ema(self):
        if self.config['ema_opt']:
            self.ema.setup()

    def update_ema(self):
        if self.config['ema_opt']:
            self.ema.update()

    def eval(self):
        if self.config['ema_opt']:
            self.ema.swap_parameters()
            self.para_swapped = True

    def train(self):
        if self.para_swapped:
            self.ema.swap_parameters()
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        labels = batch_data[batch_meta['label']]
        if batch_meta['pairwise']:
            labels = labels.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0]
        if self.config['cuda']:
            y = Variable(labels.cuda(non_blocking=True), requires_grad=False)
        else:
            y = Variable(labels, requires_grad=False)
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        logits = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            logits = logits.view(-1, batch_meta['pairwise_size'])

        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = Variable(
                    batch_data[batch_meta['factor']].cuda(non_blocking=True))
            else:
                weight = Variable(batch_data[batch_meta['factor']])
            if task_type > 0:
                loss = torch.mean(
                    F.mse_loss(logits.squeeze(), y, reduction='none') * weight)
            else:
                loss = torch.mean(
                    F.cross_entropy(logits, y, reduction='none') * weight)
        else:
            if task_type > 0:
                loss = F.mse_loss(logits.squeeze(), y)
            else:
                loss = F.cross_entropy(logits, y)

        self.train_loss.update(loss.item(), logits.size(0))
        self.optimizer.zero_grad()

        loss.backward()
        if self.config['global_grad_clipping'] > 0:
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.config['global_grad_clipping'])
        self.optimizer.step()
        self.updates += 1
        self.update_ema()

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        else:
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta['label']

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        ema_state = dict([
            (k, v.cpu()) for k, v in self.ema.model.state_dict().items()
        ]) if self.ema is not None else dict()
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'ema': ema_state,
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def cuda(self):
        self.network.cuda()
        if self.config['ema_opt']:
            self.ema.cuda()
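A minimal sketch of constructing the class above, assuming SANBertNetwork and the MT-DNN batch pipeline are provided by the surrounding project. The opt keys listed are only the ones this class reads directly (SANBertNetwork will need more), and every value is a placeholder rather than a recommended setting.

opt = {
    'multi_gpu_on': False,
    'cuda': torch.cuda.is_available(),
    'optimizer': 'adamax',            # selects the BERT-style Adamax branch above
    'learning_rate': 5e-5,
    'warmup': 0.1,
    'grad_clipping': 1.0,
    'warmup_schedule': 'warmup_linear',
    'global_grad_clipping': 1.0,
    'have_lr_scheduler': False,
    'ema_opt': 0,                     # disable EMA
}
model = MTDNNModel(opt, num_train_step=10000)
if opt['cuda']:
    model.cuda()
# for batch_meta, batch_data in train_batches:  # batches come from the MT-DNN data pipeline
#     model.update(batch_meta, batch_data)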
Example #6
    model.train()
    classifier.train()
    total_loss = 0
    for i, batch in enumerate(train_dataloader):

        data, mask = tensorized(batch[:, 0], vocab)
        label = torch.tensor(list(batch[:, 1])).to(DEVICE)
        data, mask = data.to(DEVICE), mask.to(DEVICE)
        output = model(data, mask)
        logit, loss = classifier(output, label)
        loss = loss.mean() / accumulation_steps  # scale the loss for gradient accumulation
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optim.step()
            c_optim.step()
            optim.zero_grad()
            c_optim.zero_grad()

        total_loss += loss.item() * accumulation_steps

    model.eval()
    classifier.eval()
    with torch.no_grad():
        valid_loss = 0
        preds, labels = [], []
        for i, batch in enumerate(valid_dataloader):
            data, mask = tensorized(batch[:, 0], vocab)
            label = torch.tensor(list(batch[:, 1])).to(DEVICE)
            data, mask = data.to(DEVICE), mask.to(DEVICE)
Example #7
def train_and_validate(model,
                       train_loader,
                       eval_loader,
                       tokenizer,
                       processor,
                       max_eps,
                       lr,
                       batch_size,
                       num_train_examples,
                       warmup,
                       print_every=10,
                       use_bert_adam=False,
                       log_training_info=True):
    torch.cuda.empty_cache()
    tr_loss = 0
    nb_tr_steps = 1

    criterion = nn.CrossEntropyLoss()

    if use_bert_adam:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            },
        ]
        t_total = int((float(num_train_examples) / batch_size) * max_eps)
        opti = BertAdam(optimizer_grouped_parameters,
                        lr=lr,
                        warmup=warmup,
                        t_total=t_total)
    else:
        opti = optim.Adam(model.parameters(), lr=lr)

    categories = set(processor.categories.keys())

    # training
    if log_training_info:
        print("***** Running training *****")
        print(f"  Epochs = {max_eps}\n")
        print(f"  Num examples = {num_train_examples}")
        print(f"  Learning rate = {lr}")
        print(f"  Batch size = {batch_size}")
        print(
            f"  Categories = {categories if categories != CodahProcessor.get_all_categories() else 'all'}\n"
        )

    model.train()
    for ep in range(max_eps):
        tr_loss = 0
        for step, batch in enumerate(train_loader):
            # clear gradients
            model.zero_grad()

            # reshape and reduce the second dimension
            # pull label from training data, set aside for softmax
            n_batches = batch[0].shape[0]
            batch = [ids.view(ids.shape[0] * 4, -1) for ids in batch]

            # feedforward and loss calculation
            # batch = tuple(t.cuda() for t in batch)
            input_ids, input_mask, segment_ids, word_ids, word_lens, label_ids, _ = batch
            logits = model.forward(
                input_ids.cuda(), segment_ids.cuda(), input_mask.cuda(),
                word_ids.cuda(), word_lens.cuda(
                ))  #, label_ids) label removed to skip softmax in model
            logits = logits.view(-1, 4)  # reshape to (:, 4)
            loss = criterion(logits, label_ids.view(n_batches, 4)[:, 0].cuda())
            loss.backward()
            tr_loss += loss.item()
            nb_tr_steps += 1

            # optimization step
            opti.step()

            if (step + 1) % print_every == 0 and log_training_info:
                acc = accuracy(logits,
                               label_ids.view(n_batches, 4)[:, 0].cuda())
                print(
                    "Iteration {} of epoch {} complete. Loss : {} Accuracy : {}"
                    .format(step + 1, ep + 1, loss.item(), acc))

    # evaluation

    # eval_examples = processor.get_dev_examples()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_category_acc = {key: 0 for key in categories}
    nb_eval_category_steps = {key: 0 for key in categories}
    model.eval()
    for input_ids, input_mask, segment_ids, word_ids, word_lens, label_ids, category_ids in eval_loader:
        input_ids = input_ids.view(input_ids.shape[0] * 4, -1).cuda()
        input_mask = input_mask.view(input_mask.shape[0] * 4, -1).cuda()
        segment_ids = segment_ids.view(segment_ids.shape[0] * 4, -1).cuda()
        word_ids = word_ids.view(word_ids.shape[0] * 4, -1).cuda()
        word_lens = word_lens.view(word_lens.shape[0] * 4, -1).cuda()
        label_ids = label_ids[:, 0].cuda()
        category_ids = category_ids[:, 0]

        with torch.no_grad():
            logits = model.forward(input_ids, segment_ids, input_mask,
                                   word_ids, word_lens).view(-1, 4)

        tmp_eval_loss = criterion(logits, label_ids)
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = np_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        eval_category_acc[processor.id_to_category[
            category_ids[0]]] += tmp_eval_accuracy
        nb_eval_category_steps[processor.id_to_category[category_ids[0]]] += 1

        nb_eval_examples += label_ids.shape[0]
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'tr_loss': tr_loss / nb_tr_steps
    }

    print("\n***** Eval results *****")
    for key in sorted(result.keys()):
        print(f"{key} = {str(result[key])}")

    print("\nresults by question category")
    for key in categories:
        eval_category_acc[key] /= nb_eval_category_steps[key]
        print(f"{key} = {eval_category_acc[key]}")

    return result
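For the BertAdam optimizer used above, warmup is a fraction of t_total: with the default warmup_linear schedule the learning rate ramps up linearly over the first warmup * t_total optimizer steps and then decays linearly. A small sketch of that arithmetic (values illustrative, not taken from this project):

num_train_examples = 10000  # assumed
batch_size = 16             # assumed
max_eps = 3
t_total = int((float(num_train_examples) / batch_size) * max_eps)  # 1875 optimizer steps
warmup_steps = int(0.05 * t_total)                                 # ~93 steps of linear warmup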
Example #8
def train(args):

    #label_name = ['not related or not informative', 'other useful information', 'donations and volunteering','affected individuals', 'sympathy and support', 'infrastructure and utilities damage','caution and advice']

    label_name = ['Premise', 'Claim', 'None', 'MajorClaim']
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    prefix = args['MODEL'] + '_' + args['BERT_CONFIG']

    bert_size = args['BERT_CONFIG'].split('-')[1]

    start_time = time.time()
    print('Importing data...', file=sys.stderr)
    df_train = pd.read_csv(args['--train'], index_col=0)
    df_val = pd.read_csv(args['--dev'], index_col=0)
    train_label = dict(df_train.InformationType_label.value_counts())
    label_max = float(max(train_label.values()))
    train_label_weight = torch.tensor(
        [label_max / train_label[i] for i in range(len(train_label))],
        device=device)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Set up model...', file=sys.stderr)

    if args['MODEL'] == 'default':
        model = DefaultModel(args['BERT_CONFIG'], device, len(label_name))
        optimizer = BertAdam([{
            'params': model.bert.bert.parameters()
        }, {
            'params': model.bert.classifier.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'nonlinear':
        model = NonlinearModel(args['BERT_CONFIG'], device, len(label_name),
                               float(args['--dropout']))
        optimizer = BertAdam([{
            'params': model.bert.parameters()
        }, {
            'params': model.linear1.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.linear2.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.linear3.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'lstm':
        model = CustomBertLSTMModel(args['BERT_CONFIG'],
                                    device,
                                    float(args['--dropout']),
                                    len(label_name),
                                    lstm_hidden_size=int(
                                        args['--hidden-size']))
        optimizer = BertAdam([{
            'params': model.bert.parameters()
        }, {
            'params': model.lstm.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.hidden_to_softmax.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'cnn':
        model = CustomBertConvModel(args['BERT_CONFIG'],
                                    device,
                                    float(args['--dropout']),
                                    len(label_name),
                                    out_channel=int(args['--out-channel']))
        optimizer = BertAdam([{
            'params': model.bert.parameters()
        }, {
            'params': model.conv.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.hidden_to_softmax.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    else:
        print('please input a valid model')
        exit(0)

    model = model.to(device)
    print('Use device: %s' % device, file=sys.stderr)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    model.train()

    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight,
                                        reduction='mean')
    torch.save(cn_loss, 'loss_func')  # for later testing

    train_batch_size = int(args['--batch-size'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = prefix + '_model.bin'

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training...')

    while True:
        epoch += 1

        for sents, targets in batch_iter(df_train,
                                         batch_size=train_batch_size,
                                         shuffle=True,
                                         bert=bert_size):  # for each epoch
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(sents)

            pre_softmax = model(sents)

            loss = cn_loss(
                pre_softmax,
                torch.tensor(targets, dtype=torch.long, device=device))

            loss.backward()

            optimizer.step()

            batch_losses_val = loss.item() * batch_size
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'cum. examples %d, speed %.2f examples/sec, '
                      'time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       cum_examples, report_examples /
                       (time.time() - train_time), time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. examples %d' %
                    (epoch, train_iter, cum_loss / cum_examples, cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = 0.

                print('begin validation ...', file=sys.stderr)

                validation_loss = validation(
                    model, df_val, bert_size, cn_loss,
                    device)  # dev batch size can be a bit larger

                print('validation: iter %d, loss %f' %
                      (train_iter, validation_loss),
                      file=sys.stderr)

                is_better = (len(hist_valid_scores) == 0
                             or validation_loss < min(hist_valid_scores))
                hist_valid_scores.append(validation_loss)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)

                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        print(
                            'load previously best model and decay learning rate to %f%%'
                            % (float(args['--lr-decay']) * 100),
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= float(args['--lr-decay'])

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
Example #9
def single_train(config):
    start_time = time.asctime(time.localtime(time.time()))
    print(start_time)
    x_train = []
    y_train = []
    head_train = []
    relations = [
        "unknown", "Create", "Use", "Near", "Social", "Located", "Ownership",
        "General-Special", "Family", "Part-Whole"
    ]
    with open(config.train_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            # label = config.label_list.index(line.split(" ")[0].strip("__label__"))
            # sentence = "".join(line.lower().strip("\n").split(" ")[1:])
            _, head, tail, label, sentence = line.split("\t")
            if sentence:
                head_pos = (re.search(r'\[E11\]', sentence).span()[1],
                            re.search(r'\[E12\]', sentence).span()[0])
                # tail_pos = (re.search('\[E21\]', line).span()[1], re.search('\[E22\]', line).span()[0])
                sentence = sentence.replace('[E21]', '')
                sentence = sentence.replace('[E22]', '')
                x_train.append(sentence)
                y_train.append(relations.index(label))
                head_train.append(head_pos)
    print('handle data over.')
    torch_dataset = ListDataset(x=x_train, y=y_train, head=head_train)
    loader = Data.DataLoader(
        dataset=torch_dataset,
        batch_size=config.batch_size,  # batch size
        shuffle=True,                  # shuffle the data
        num_workers=4,                 # number of data-loading worker processes
    )
    # model = Model(config).to(config.device)
    model = SingleModel(config).to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=0.05,
                         t_total=len(torch_dataset) * config.num_epoches)
    loss_func = torch.nn.CrossEntropyLoss()
    loss_li = []
    print_loss = 0
    for epoch in range(config.num_epoches):
        model.train()
        for step, (batch_texts, batch_span,
                   batch_head_pos) in enumerate(loader):
            max_len = max([len(i) for i in batch_texts])
            x = config.tokenizer.batch_encode_plus(batch_texts,
                                                   add_special_tokens=True,
                                                   return_tensors="pt",
                                                   max_length=max_len,
                                                   pad_to_max_length=True)
            x["input_ids"] = x["input_ids"].to(config.device)
            x["attention_mask"] = torch.abs(
                torch.ones(x["token_type_ids"].size(), dtype=torch.long).to(
                    config.device) - x["token_type_ids"].to(config.device))
            x["token_type_ids"] = x["token_type_ids"].to(config.device)
            out = model(input_ids=x["input_ids"],
                        attention_mask=x["attention_mask"],
                        token_type_ids=x["token_type_ids"],
                        batch_head_pos=batch_head_pos)
            # print(loss)
            # print(torch.argmax(start[0]), torch.argmax(end[0]))
            optimizer.zero_grad()
            loss = loss_func(out,
                             batch_span.to(config.device)).to(config.device)
            print_loss += loss.item()
            loss.backward()
            optimizer.step()
            if (step + 1) % 10 == 0:
                print("epoch:", epoch, "step:", step, "loss", print_loss / 10)
                # print(config.tokenizer.decode(x["input_ids"][1]))
                # print(x["input_ids"][1])
                # print(x["attention_mask"][1])
                # print(x["token_type_ids"][1])
                # print(batch_question_doc[0][0])
                # print(torch.argmax(start[0]), torch.argmax(end[0]))
                # print(config.tokenizer.decode(x["input_ids"][0][torch.argmax(start[0]):torch.argmax(end[0])]))
                # print('real', batch_span[0][0], batch_span[1][0])
                # print(config.tokenizer.decode(x["input_ids"][0][batch_span[0][0]: batch_span[1][0]]))
                loss_li.append(print_loss / 10)
                print_loss = 0
    model_path = '/usr/tdq/models/re/aliBert-Sanwen-10'
    torch.save(model, model_path)
    end_time = time.asctime(time.localtime(time.time()))
    print("start time:{}, end time:{}".format(start_time, end_time))
    return model_path
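Since single_train saves the whole module object (torch.save(model, model_path)) rather than a state_dict, a later script would reload it with torch.load. A minimal sketch, assuming the SingleModel class definition is importable at load time and reusing the path returned above:

import torch

model_path = single_train(config)                   # config as constructed elsewhere
model = torch.load(model_path, map_location='cpu')  # full module, not a state_dict
model.eval()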