Example #1
def test():
    # Load the config file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)

    # Model
    config = BertConfig("./output/pytorch_bert_config.json")
    model = BertForSequenceClassification(config, num_labels=cf.num_labels)
    model.load_state_dict(torch.load("./output/pytorch_model.bin"))

    # Move the model to the selected device
    model.to(device)

    # Run the model in parallel across GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Evaluation
    start_time = time.time()

    data_len = len(test_dataloader)

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    # for step,batch in enumerate(tqdm(test_dataloader,"batch",total=len(test_dataloader))):
    for step, batch in enumerate(test_dataloader):
        label_id = batch['label_id'].squeeze(1).to(device)
        word_ids = batch['word_ids'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        word_mask = batch['word_mask'].to(device)

        with torch.no_grad():
            # Forward pass with labels returns the loss (not used for the metrics below)
            loss = model(word_ids, segment_ids, word_mask, label_id)
            pred = get_model_labels(model, word_ids, segment_ids, word_mask)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # Evaluation metrics
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test,
                                      y_pred,
                                      target_names=get_labels('./data/label')))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
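This example calls get_model_labels without defining it. A minimal sketch of what such a helper might look like, assuming that calling the model without labels returns the raw logits (as pytorch-pretrained-bert's BertForSequenceClassification does):

def get_model_labels(model, word_ids, segment_ids, word_mask):
    # Hypothetical helper assumed by the example above: a forward pass without
    # labels yields logits; argmax over the class dimension gives predicted ids.
    logits = model(word_ids, segment_ids, word_mask)
    return torch.argmax(logits, dim=-1).cpu().numpy()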
Example #2
def create_bert_for_sequence_classification(self, config, input_ids,
                                            token_type_ids, input_mask,
                                            sequence_labels, token_labels,
                                            choice_labels):
    model = BertForSequenceClassification(config=config,
                                          num_labels=self.num_labels)
    model.eval()
    # With labels the call returns the loss; without labels it returns the logits
    loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    outputs = {
        "loss": loss,
        "logits": logits,
    }
    return outputs
    torch.save(model.state_dict(),
               output_model_file + '_epoch_' + str(epoch) + '.bin')

    # Validate
    test_model = BertForSequenceClassification(bert_config,
                                               num_labels=len(y_columns))

    # Parallelism
    test_model = nn.DataParallel(test_model)

    test_model.load_state_dict(
        torch.load(output_model_file + '_epoch_' + str(epoch) + '.bin'))
    test_model.to(device)
    for param in test_model.parameters():
        param.requires_grad = False
    test_model.eval()
    valid_preds = np.zeros((len(X_val)))
    print(valid_preds.size)
    valid = torch.utils.data.TensorDataset(
        torch.tensor(X_val, dtype=torch.long))
    valid_loader = torch.utils.data.DataLoader(valid,
                                               batch_size=256,
                                               shuffle=False)

    tk0 = tqdm(valid_loader)
    for i, (x_batch, ) in enumerate(tk0):
        pred = test_model(x_batch.to(device),
                          attention_mask=(x_batch > 0).to(device),
                          labels=None)
        valid_preds[i * 256:(i + 1) *
                    256] = pred[:, 0].detach().cpu().squeeze().numpy()
Example #4
class TransformersClassifierHandler(BaseHandler, ABC):
    """
    Transformers text classifier handler class. This handler takes a text (string)
    as input and returns the predicted class label based on the serialized transformers checkpoint.
    """
    def __init__(self):
        super(TransformersClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        properties = ctx.system_properties
        MODEL_DIR = properties.get("model_dir")
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")
        self.labelencoder = preprocessing.LabelEncoder()
        self.labelencoder.classes_ = np.load(
            os.path.join(MODEL_DIR, 'classes.npy'))
        config = BertConfig(os.path.join(MODEL_DIR, 'bert_config.json'))
        self.model = BertForSequenceClassification(
            config, num_labels=len(self.labelencoder.classes_))
        self.model.load_state_dict(
            torch.load(os.path.join(MODEL_DIR, 'pytorch_model.bin'),
                       map_location="cpu"))
        self.model.to(self.device)
        self.model.eval()

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.softmax = torch.nn.Softmax(dim=-1)
        # preprocess() builds its DataLoader with this worker count
        self.dataloader_num_workers = 0
        # self.batch_size = batch_size

        logger.debug(
            'Transformer model from path {0} loaded successfully'.format(
                MODEL_DIR))
        self.manifest = ctx.manifest
        self.initialized = True

    def preprocess(self, data):
        ids = []
        segment_ids = []
        input_masks = []
        MAX_LEN = 128

        for sen in data:
            text_tokens = self.tokenizer.tokenize(sen)
            # Truncate so that [CLS] + tokens + [SEP] still fits within MAX_LEN
            text_tokens = text_tokens[:MAX_LEN - 2]
            tokens = ["[CLS]"] + text_tokens + ["[SEP]"]
            temp_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(temp_ids)
            segment_id = [0] * len(temp_ids)
            padding = [0] * (MAX_LEN - len(temp_ids))

            temp_ids += padding
            input_mask += padding
            segment_id += padding

            ids.append(temp_ids)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        ## Convert input list to Torch Tensors
        ids = torch.tensor(ids)
        segment_ids = torch.tensor(segment_ids)
        input_masks = torch.tensor(input_masks)
        validation_data = TensorDataset(ids, input_masks, segment_ids)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(
            validation_data,
            sampler=validation_sampler,
            batch_size=len(data),
            num_workers=self.dataloader_num_workers)

        return validation_dataloader

    def inference(self, validation_dataloader):
        """
        Predict the class of a text using a trained transformer model.
        """
        # NOTE: This makes the assumption that your model expects text to be tokenized
        # with "input_ids" and "token_type_ids" - which is true for some popular transformer models, e.g. bert.
        # If your transformer model expects different tokenization, adapt this code to suit
        # its expected input format.
        responses = []
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(self.device) for t in batch)
            # Unpack the inputs from our dataloader (ids, attention masks, segment ids)
            b_input_ids, b_input_mask, b_segment_ids = batch
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                logits = self.model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask)
                for i in range(logits.size(0)):
                    label_idx = [
                        self.softmax(
                            logits[i]).detach().cpu().numpy().argmax()
                    ]
                    label_str = self.labelencoder.inverse_transform(
                        label_idx)[0]
                    responses.append(label_str)

        return responses

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output
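Outside a serving framework, the handler's life cycle can be exercised directly. A rough, purely illustrative sketch; the _FakeContext class, its model_dir value, and the sample sentence are assumptions, since the real context object is supplied by the serving runtime:

class _FakeContext:
    # Illustrative stand-in for the context object a serving runtime would provide
    system_properties = {"model_dir": "./model_store", "gpu_id": 0}
    manifest = {}

handler = TransformersClassifierHandler()
handler.initialize(_FakeContext())                  # load model, tokenizer, label encoder
dataloader = handler.preprocess(["example sentence to classify"])
predictions = handler.inference(dataloader)
print(handler.postprocess(predictions))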
Example #5
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)


torch.save(model.state_dict(), output_model_file)

# Run validation
# The following lines are not needed here, but show how to reload the saved model for prediction
model = BertForSequenceClassification(bert_config,num_labels=len(y_columns))
model.load_state_dict(torch.load(output_model_file ))
model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
for i,(x_batch,)  in enumerate(tk0):
    pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    valid_preds[i*32:(i+1)*32]=pred[:,0].detach().cpu().squeeze().numpy()
Example #6
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH)
x_test = convert_lines(x_test,MAX_LEN,tokenizer)


x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test_data = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

## load fine-tuned model
bert_config = BertConfig('../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/bert_config.json')
net = BertForSequenceClassification(bert_config,num_labels=6)
net.load_state_dict(torch.load("../input/bert-model3/bert_pytorch_v3.pt"))
net.cuda()

## inference
net.eval()
result_1 = list()
with torch.no_grad():
    for (x_batch,) in test_loader:
        y_pred = net(x_batch)
        y_pred = torch.sigmoid(y_pred.cpu()).numpy()[:, 0]
        result_1.extend(y_pred)
result_1 = np.array(result_1)



net = BertForSequenceClassification(bert_config,num_labels=6)
net.load_state_dict(torch.load("../input/bert-model4/bert_pytorch_v4.pt"))
net.cuda()

## inference
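The example is cut off after this second "## inference" comment. Presumably the same inference loop is repeated for the second checkpoint; a hedged reconstruction (result_2 is an assumed name):

net.eval()
result_2 = list()
with torch.no_grad():
    for (x_batch,) in test_loader:
        y_pred = net(x_batch)
        y_pred = torch.sigmoid(y_pred.cpu()).numpy()[:, 0]
        result_2.extend(y_pred)
result_2 = np.array(result_2)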
Example #7
def train_unfixed():
    # Load the config file
    cf = Config('./config.yaml')
    # Use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data,
                                  batch_size=cf.batch_size,
                                  shuffle=True)
    # Test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)

    # Model
    config = BertConfig("./output/pytorch_bert_config.json")
    model = BertForSequenceClassification(config, num_labels=cf.num_labels)
    model.load_state_dict(torch.load("./output/pytorch_model.bin"))

    # Use Adam (BertAdam) as the optimizer
    for param in model.parameters():
        param.requires_grad = True
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_optimization_steps = int(
        len(train_data) / cf.batch_size) * cf.epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=cf.lr,
                         t_total=num_train_optimization_steps)

    # Move the model to the selected device
    model.to(device)

    # Run the model in parallel across GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Training
    start_time = time.time()

    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1500  # stop training early if no improvement for more than 1500 batches

    # Get the current validation-set accuracy
    model.eval()
    _, best_acc_val = evaluate(model, test_dataloader, device)

    flag = False
    model.train()
    for epoch_id in range(cf.epoch):
        print("Epoch %d" % epoch_id)
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc="batch",
                     total=len(train_dataloader))):
            # for step,batch in enumerate(train_dataloader):

            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)

            loss = model(word_ids, segment_ids, word_mask, label_id)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = get_model_loss_acc(
                        model, word_ids, segment_ids, word_mask, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)

                if acc_val > best_acc_val:
                    # Save the best model so far
                    best_acc_val = acc_val
                    last_improved = total_batch

                    torch.save(model.state_dict(),
                               "./output/pytorch_model.bin")
                    with open("./output/pytorch_bert_config.json", 'w') as f:
                        f.write(model.config.to_json_string())

                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))

                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
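train_unfixed relies on an evaluate helper that is not shown here. A minimal sketch under the same assumptions as the snippet (batches keyed by 'label_id', 'word_ids', 'segment_ids', 'word_mask'; the model returns a loss when labels are passed and logits otherwise):

def evaluate(model, dataloader, device):
    # Hypothetical helper: average loss and accuracy over a dataloader.
    total_loss, total_correct, total_examples = 0.0, 0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)

            loss = model(word_ids, segment_ids, word_mask, label_id)
            logits = model(word_ids, segment_ids, word_mask)
            preds = torch.argmax(logits, dim=-1)

            # mean() also covers the per-GPU losses returned under DataParallel
            total_loss += loss.mean().item() * label_id.size(0)
            total_correct += (preds == label_id).sum().item()
            total_examples += label_id.size(0)
    return total_loss / total_examples, total_correct / total_examples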