Example #1
def bert_baseline(arg):
    from bert_config import bert_parameter_dict
    version = arg.model

    parameters = bert_parameter_dict[version]
    batch_size = parameters["batch_size"]
    epoch_num = parameters["epoch_num"]
    learning_rate = parameters["learning_rate"]
    device = parameters["device"]
    early_stop_epoch = parameters["early_stop_epoch"]

    dl_model_dir = os.path.join(model_dir, version)
    create_dir(dl_model_dir)

    data_cached_path = os.path.join(cache_dir, version + ".h5")
    if os.path.isfile(data_cached_path):
        x_train, y_train, x_test, y_test, x_dev, y_dev = h5_load(
            data_cached_path,
            ["x_train", "y_train", "x_test", "y_test", "x_dev", "y_dev"],
            dtype=np.int32,
            verbose=True)

    else:
        # load data
        x_train, y_train = load_data(phrase="train", verbose=True)
        x_test, y_test = load_data(phrase="test", verbose=True)
        x_dev, y_dev = load_data(phrase="dev", verbose=True)

        # turn text into ids
        if version == "bert":
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        elif version == "sci-bert":
            tokenizer = AutoTokenizer.from_pretrained(
                'allenai/scibert_scivocab_uncased')
        tokenizer.save_pretrained(dl_model_dir)

        feature = Feature(tokenizer=tokenizer)
        x_train = feature.extract(x_train[:])
        x_test = feature.extract(x_test[:])
        x_dev = feature.extract(x_dev[:])

        # turn label into vector
        y_train = np.array([label_mapping[y] for y in y_train])
        y_test = np.array([label_mapping[y] for y in y_test])
        y_dev = np.array([label_mapping[y] for y in y_dev])

        # cache data
        with h5py.File(data_cached_path, 'w') as outfile:
            outfile.create_dataset("x_train", data=x_train)
            outfile.create_dataset("y_train", data=y_train)
            outfile.create_dataset("x_test", data=x_test)
            outfile.create_dataset("y_test", data=y_test)
            outfile.create_dataset("x_dev", data=x_dev)
            outfile.create_dataset("y_dev", data=y_dev)

    print("Train", x_train.shape, y_train.shape)
    print("Test", x_test.shape, y_test.shape)
    print("Valid", x_dev.shape, y_dev.shape)

    #subset_num = 1000
    #x_train, y_train = x_train[:subset_num], y_train[:subset_num]
    #x_dev, y_dev = x_dev[:subset_num], y_dev[:subset_num]
    #x_test, y_test = x_test[:subset_num], y_test[:subset_num]

    train_dataset = CovidDataset(x_train, y_train)
    test_dataset = CovidDataset(x_test, y_test)
    dev_dataset = CovidDataset(x_dev, y_dev)
    training = data.DataLoader(train_dataset,
                               batch_size=batch_size,
                               shuffle=True,
                               num_workers=4)
    testing = data.DataLoader(test_dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=4)
    dev = data.DataLoader(dev_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=4)

    # model
    if version == "bert":
        print("Using Bert!!!")
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=5).to(device)
    elif version == "sci-bert":
        print("Using SCI-Bert!!!")
        #config = BertConfig(vocab_size=31090, num_labels=5)
        config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased')
        config.num_labels = 5
        model = AutoModelForSequenceClassification.from_pretrained(
            'allenai/scibert_scivocab_uncased', config=config).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    acc, _, _ = evaluate(model, dev, device=device)
    best_model = None
    best_accuracy = 0.0
    best_epoch = 0
    stopper = EarlyStop(mode="max", history=early_stop_epoch)

    for epoch in range(1, epoch_num + 1):
        model.train()
        total_loss = 0
        total_acc = 0
        total_count = len(train_dataset) // batch_size
        for count, (x_batch, y_batch) in enumerate(training, 1):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(x_batch, labels=y_batch)
            loss, y_pred = outputs[0:2]
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # compute accuracy
            y_pred = torch.argmax(y_pred, dim=1)
            correct_num = torch.sum(y_pred == y_batch).double()
            total_acc += correct_num / y_pred.shape[0]

            print("\x1b[2K\rEpoch: {} / {} [{:.2f}%] Loss: {:.5f} Acc: {:.5f}".
                  format(epoch, epoch_num, 100.0 * count / total_count,
                         total_loss / count, total_acc / count),
                  end="")

        print()
        if epoch % 1 == 0:
            acc, _, _ = evaluate(model, dev, device=device)

            if acc > best_accuracy:
                best_model = copy.deepcopy(model.state_dict())
                best_accuracy = acc
                best_epoch = epoch

            # check early stopping
            if stopper.check(acc):
                print("Early Stopping at Epoch = ", epoch)
                break

    # load best model & test & save
    print("loading model from epoch {}".format(best_epoch))
    #torch.save(best_model, os.path.join(dl_model_dir, "best_model.pt"))
    model.load_state_dict(best_model)
    model.save_pretrained(dl_model_dir)
    acc, predict, true_label = evaluate(model, testing, device=device)
    score = precision_recall_fscore_support(true_label, predict)
    table = output_score(score)
    print(table)

    # output result
    with open(os.path.join(result_dir, "{}.result".format(version)),
              'w',
              encoding='utf-8') as outfile:
        outfile.write(table.to_csv(path_or_buf=None) + "\n")
        outfile.write("acc = {}\n".format(acc))
Example #2
def train_process(config, train_load, valid_load, test_load, k, train_sampler):

    # load source bert weights
    # model_config = BertConfig.from_pretrained(pretrained_model_name_or_path="../user_data/bert_source/{}/config.json".format(config.model_name))
    model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    if os.path.isfile('save_model/{}_best_model_v1.pth.tar'.format(
            config.model_name)):
        checkpoint = torch.load('save_model/{}_best_model_v1.pth.tar'.format(
            config.model_name),
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load best model weight*************')
    else:
        checkpoint = torch.load(
            '../user_data/save_bert/{}_checkpoint.pth.tar'.format(
                config.model_name),
            map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load pretrained mlm model weight*************')

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to the corresponding GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    #     t_total = len(train_load) * config.num_train_epochs
    #     scheduler = get_linear_schedule_with_warmup(
    #         optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    #     )

    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model for distributed training
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank])

    model.train()
    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        is_best = False
        torch.cuda.empty_cache()

        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)

            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            #             torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add adversarial perturbation to the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                loss_adv.backward()  # backpropagate, accumulating the adversarial gradients on top of the normal ones
                fgm.restore()  # restore the original embedding parameters

            optimizer.step()
        #             scheduler.step()

        dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes and compute the reduced (distributed) metric
        torch.distributed.barrier()
        reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        if reduce_dev_auc > best_dev_auc:
            best_dev_auc = reduce_dev_auc
            is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'number {} fold,time:{},epoch:{}/{},reduce_dev_auc:{},best_dev_auc:{}'

        if config.local_rank in [0, -1]:
            print(
                msg.format(k, now, epoch + 1, config.num_train_epochs,
                           reduce_dev_auc, best_dev_auc))
            checkpoint = {
                "status": model.state_dict(),
                "epoch": epoch + 1,
                'reduce_dev_auc': reduce_dev_auc
            }
            if is_best:
                torch.save(
                    checkpoint, '../user_data/save_model' + os.sep +
                    '{}_best_model.pth.tar'.format(config.model_name))
            torch.save(
                checkpoint, '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(config.model_name))
            del checkpoint

    torch.distributed.barrier()
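The `FGM` object driving the adversarial step above (fgm.attack() / fgm.restore()) is also defined elsewhere in the project. A minimal sketch of the usual fast-gradient-method perturbation on the word-embedding weights, assuming the embedding parameter names contain "word_embeddings":

import torch


class FGM:
    """Fast Gradient Method on the embedding weights (assumed interface: attack()/restore())."""

    def __init__(self, model, epsilon=1.0, emb_name="word_embeddings"):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name  # substring identifying the embedding parameters
        self.backup = {}

    def attack(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    # Step along the gradient direction to maximize the loss.
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}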
Example #3
def test_24_hour_model(all_examples):
    input_ids, attention_masks, labels = tokenize_all_examples(all_examples)

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Combine the training inputs into a TensorDataset.
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # The DataLoader needs to know our batch size for training, so we specify it
    # here. For fine-tuning BERT on a specific task, the authors recommend a batch
    # size of 16 or 32.
    batch_size = 32

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloader = DataLoader(
        dataset,  # The validation samples.
        sampler=SequentialSampler(dataset),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        # Use the 12-layer BERT model, with an uncased vocab.
        "./bert_models/full_24_hour_model/",
        num_labels=24,  # 24 class model for us
        # Whether the model returns attention weights.
        output_attentions=False,
        # Whether the model returns all hidden-states.
        output_hidden_states=False,
    )

    # Tell pytorch to run this model on the GPU.
    model.cuda()

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
    return predictions, true_labels
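The returned `predictions` and `true_labels` are lists of per-batch NumPy arrays; a short, illustrative way to flatten them into class predictions and an overall accuracy (the helper name here is not from the original code):

import numpy as np


def summarize_predictions(predictions, true_labels):
    logits = np.concatenate(predictions, axis=0)   # (num_examples, num_labels)
    labels = np.concatenate(true_labels, axis=0)   # (num_examples,)
    pred_classes = np.argmax(logits, axis=1)
    accuracy = float(np.mean(pred_classes == labels))
    return pred_classes, accuracy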
Example #4
def predict(text):
    device = setup()

    preprocessed_text = text_cleansing(text)
    print('Preprocessed text:', preprocessed_text)

    print('Loading model...')
    model = BertForSequenceClassification.from_pretrained(
        app.config["MODEL_2_PATH"])
    tokenizer = BertTokenizer.from_pretrained(app.config["MODEL_2_PATH"])

    # Copy the model to the GPU.
    model.to(device)
    print('Model has loaded.')

    encoded_dict = tokenizer.encode_plus(
        preprocessed_text,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=128,  # Pad & truncate all sentences.
        return_attention_mask=True,  # Construct attn. masks.
        return_tensors='pt',  # Return pytorch tensors.
        truncation=True,
        padding='max_length')

    # encoded_dict = encoded_dict.to(device)

    # Add the encoded sentence to the list.
    input_ids = encoded_dict['input_ids']
    input_ids = input_ids.to(device)

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks = encoded_dict['attention_mask']
    attention_masks = attention_masks.to(device)

    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        # outputs.to(device)

    print('Outputs:', outputs)

    logits = outputs[0]
    softmax = torch.nn.functional.softmax(logits, dim=1)

    logits = logits.detach().cpu().numpy()
    softmax = softmax.detach().cpu().numpy()

    print('Logits:', logits)
    print('Softmax:', softmax)

    label_id = np.argmax(logits, axis=1).flatten()[0]
    percentage = np.max(softmax * 100)

    if label_id == 0:
        label_name = 'Non-Kekerasan'
    elif label_id == 1:
        label_name = 'Kekerasan'

    prediction = 'Konten ini adalah {} ({:.0f}%)'.format(
        label_name, percentage)
    print(prediction)

    return prediction
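`setup()` and `text_cleansing()` belong to the surrounding Flask app and are not shown; as a rough assumption, `setup()` simply selects the compute device:

import torch


def setup():
    # Use the GPU when available, otherwise fall back to the CPU.
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")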
Example #5
def main(args):
    """
    """
    # Create output dir if none mentioned.
    if args.output_dir is None:
        model_name = os.path.splitext(os.path.basename(
            args.model_name_or_path))[0]
        args.output_dir = "./output/" + model_name + '/'
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    print("\n========================================")
    print('                  MODEL                   ')
    print("========================================")
    print("Loading BertForSequenceClassification model...")
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path,  # Use the 12-layer BERT model, with a cased vocab.
        num_labels=args.num_labels,  # The number of output labels.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
        cache_dir=args.cache_dir,
    )
    print('Loading BertTokenizer...')
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=False)

    print("Setting up CUDA & GPU...")
    if torch.cuda.is_available():
        if args.gpu_id is not None:
            torch.cuda.set_device(args.gpu_id)
            args.n_gpu = 1
            print("  - GPU {} {} will be used.".format(
                torch.cuda.get_device_name(args.gpu_id), args.gpu_id))
        else:
            args.n_gpu = torch.cuda.device_count()
            gpu_ids = list(range(0, args.n_gpu))
            if args.n_gpu > 1:
                model = torch.nn.DataParallel(model,
                                              device_ids=gpu_ids,
                                              output_device=gpu_ids[-1])
            print("  - GPU(s) {} will be used.".format(str(gpu_ids)))
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")
        args.n_gpu = 0
        print("  - No GPU available, using the CPU instead.")
    model.to(args.device)

    # Set the seed value all over the place to make this reproducible.
    set_seed(args.seed)

    print("\n========================================")
    print('                  DATA                    ')
    print("========================================")
    print("Loading data...")
    classes_of_interest = [
        'Data Sheets', 'Configuration (Guides, Examples & TechNotes)',
        'Install & Upgrade Guides', 'Release Notes', 'End User Guides'
    ]
    df, categories = load_data(args, classes_of_interest)
    sentences = df.Sentence.values
    classes = df.Class.values
    class_ids = df.Class_id.values
    print('  - Number of sentences: {:,}'.format(df.shape[0]))
    print('  - Number of doc types: {:,}'.format(len(categories)))
    for i, cat in enumerate(categories):
        print("     * {} : {}".format(cat, i))

    print("Tokenizing sentences...")
    tokenized = tokenize_sentences(tokenizer, df)
    attention_masks = create_masks(tokenized)

    print("Splitting dataset...")
    dataset = (tokenized, class_ids, attention_masks, sentences)
    train_set, val_set, test_set = split_data(args, dataset)
    print("  - Samples in train set: {}".format(len(train_set[0])))
    train_ids = Counter(train_set[1]).keys()
    train_ids_freq = Counter(train_set[1]).values()
    for i, freq in zip(train_ids, train_ids_freq):
        print("     * {} : {}".format(i, freq))
    print("  - Samples in val set: {}".format(len(val_set[0])))
    val_ids = Counter(val_set[1]).keys()
    val_ids_freq = Counter(val_set[1]).values()
    for i, freq in zip(val_ids, val_ids_freq):
        print("     * {} : {}".format(i, freq))
    print("  - Samples in test set: {}".format(len(test_set[0])))
    test_ids = Counter(test_set[1]).keys()
    test_ids_freq = Counter(test_set[1]).values()
    for i, freq in zip(test_ids, test_ids_freq):
        print("     * {} : {}".format(i, freq))

    if args.do_train:
        print("\n========================================")
        print('               TRAINING                   ')
        print("========================================")
        model = train(args, model, tokenizer, categories, train_set, val_set)

    if args.do_test:
        print("\n========================================")
        print('                TESTING                   ')
        print("========================================")
        print("Evaluation on entire test set...")
        result, df_wrong, df_right = evaluate(args, model, categories,
                                              test_set)
        plot_confusion_matrix(result['conf_matrix'], categories,
                              args.output_dir)
        df_wrong.to_csv(os.path.join(args.output_dir, 'preds_wrong.csv'))
        df_right.to_csv(os.path.join(args.output_dir, 'preds_right.csv'))
        with open(os.path.join(args.output_dir, 'test_set_scores.json'),
                  'w+') as f:
            json.dump(result, f)
        print("  * Accuracy: {0:.6f}".format(result['Accuracy']))
        print("  * MCC: {0:.6f}".format(result['MCC']))
        print("  Macro Average")
        print("  * Recall: {0:.6f}".format(result['Macro_Average']['Recall']))
        print("  * Precision: {0:.6f}".format(
            result['Macro_Average']['Precision']))
        print("  * F1 score: {0:.6f}".format(result['Macro_Average']['F1']))
        print("  Weighted Average")
        print("  * Recall: {0:.6f}".format(
            result['Weighted_Average']['Recall']))
        print("  * Precision: {0:.6f}".format(
            result['Weighted_Average']['Precision']))
        print("  * F1 score: {0:.6f}".format(result['Weighted_Average']['F1']))

        print("Evaluation on bootstrap samples from test set...")
        stats = bootstrap_evaluation(args, model, categories, test_set, 100)
        with open(os.path.join(args.output_dir, 'bootstrap_scores.json'),
                  'w+') as f:
            json.dump(stats, f)

        if args.do_compare:
            print("Evaluation on BERT predictions...")
            evaluate_bert_preds(args, model, tokenizer, categories)
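Helpers such as `tokenize_sentences`, `create_masks`, and `split_data` are project-specific. A plausible sketch of `create_masks`, assuming the tokenized sentences are already padded with token id 0:

def create_masks(tokenized_sentences):
    # Attention mask: 1 for real tokens, 0 for padding (assumes pad id == 0).
    return [[int(token_id > 0) for token_id in sentence]
            for sentence in tokenized_sentences]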
Example #6


# =============================================================================
# Define model
# =============================================================================


from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=len(set(labels.numpy())),  # Number of output labels--2 for binary classification; increase for multi-class tasks.
    output_attentions=False,  # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()


# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())



# =============================================================================
# Optimizer & Learning Rate Scheduler
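The excerpt breaks off at the optimizer banner. A typical continuation (an assumption following the common AdamW + linear-schedule recipe, not this script's exact code; train_dataloader is assumed to exist):

from transformers import AdamW, get_linear_schedule_with_warmup

# AdamW is Adam with decoupled weight decay, the usual choice for BERT fine-tuning.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs  # assumes train_dataloader is defined

# Linear decay of the learning rate, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)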
Example #7
model.compile(optimizer=opt, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
train_steps = num_train // 32
valid_steps = num_valid // 32

history = model.fit(Xtrain,
                    epochs=2,
                    steps_per_epoch=train_steps,
                    validation_data=Xvalid,
                    validation_steps=valid_steps)

model.save_pretrained(FINE_TUNED_MODEL_DIR)

# load saved model
saved_model = BertForSequenceClassification.from_pretrained(
    FINE_TUNED_MODEL_DIR, from_tf=True)

# predict sentence paraphrase
sentence_0 = "At least 12 people were killed in the battle last week."
sentence_1 = "At least 12 people lost their lives in last weeks fighting."
sentence_2 = "The fires burnt down the houses on the street."

inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, return_tensors="pt")
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, return_tensors="pt")

pred_1 = saved_model(**inputs_1)[0].argmax().item()
pred_2 = saved_model(**inputs_2)[0].argmax().item()


def print_result(id1, id2, pred):
    if pred == 1:
        print("sentence_{} is a paraphrase of sentence_{}".format(id2, id1))
    else:
        print("sentence_{} is not a paraphrase of sentence_{}".format(id2, id1))
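The two predictions computed above could then be reported with the helper, for example:

print_result(0, 1, pred_1)  # expected to be a paraphrase
print_result(0, 2, pred_2)  # expected not to be a paraphrase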
Example #8
File: bert.py Project: ghajduk3/COLI
def train_classifier(model: BertForSequenceClassification,
                     dataset: TensorDataset, validation_ratio: float,
                     batch_size: int, freeze_embeddings_layer: bool,
                     freeze_encoder_layers: int,
                     epochs: int) -> (BertForSequenceClassification, list):

    device = select_device()

    train_size = int(validation_ratio * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)

    validation_dataloader = DataLoader(val_dataset,
                                       sampler=SequentialSampler(val_dataset),
                                       batch_size=batch_size)

    modules = []

    if freeze_embeddings_layer:
        modules.append(model.bert.embeddings)

    for i in range(freeze_encoder_layers):
        modules.append(model.bert.encoder.layer[i])

    for module in modules:
        for param in module.parameters():
            param.requires_grad = False

    model.to(device)

    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=5e-5)

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    training_stats = []

    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()

            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():

                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

                loss = outputs.loss
                logits = outputs.logits

            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()

            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    return model, training_stats
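`flat_accuracy` and `format_time` come from elsewhere in the project; minimal sketches consistent with how they are called above (an assumption mirroring the widely used BERT fine-tuning tutorial helpers):

import datetime

import numpy as np


def flat_accuracy(logits, labels):
    # Fraction of rows whose argmax matches the gold label.
    preds = np.argmax(logits, axis=1).flatten()
    labels = labels.flatten()
    return np.sum(preds == labels) / len(labels)


def format_time(elapsed):
    # Round to whole seconds and format as h:mm:ss.
    return str(datetime.timedelta(seconds=int(round(elapsed))))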
Example #9
def get_model(opt):
    #model = BertForSequenceClassification.from_pretrained('./bert-base-cased',num_labels=opt.num_labels)
    model = BertForSequenceClassification.from_pretrained(
        './bert-base-uncased', num_labels=opt.num_labels)
    return model
Example #10
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)
    json.dump(args.__dict__,
              open(
                  os.path.join(args.output_dir,
                               'opt_{}.json'.format(args.task_name)), 'w'),
              sort_keys=True,
              indent=2)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)

    amp_handle = None
    if args.fp16:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)

    # Prepare model
    if (args.model_recover_path is None) or len(args.model_recover_path) == 0:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    else:
        if not os.path.exists(args.model_recover_path):
            logger.info("Path does not exist: {0}".format(
                args.model_recover_path))
            sys.exit(0)
        logger.info("***** Recover model: {0} *****".format(
            args.model_recover_path))
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=torch.load(args.model_recover_path),
            num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # note: args.train_batch_size has already been divided by args.gradient_accumulation_steps above
    if args.do_train:
        t_total = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
    else:
        t_total = 1
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    if args.task_name == 'sts-b':
        if args.fp16:
            lbl_type = torch.half
        else:
            lbl_type = torch.float
    else:
        lbl_type = torch.long

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", t_total)
        train_data = convert_features_to_dataset(train_features, lbl_type)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        best_result = 0.0

        for i_epoch in trange(1, args.num_train_epochs + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
            for step, batch in enumerate(iter_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                labels=label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                    if amp_handle:
                        amp_handle._clear_cache()
                else:
                    loss.backward()

                tr_loss += loss.item()
                iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Perform validation
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            eval_segment = processor.get_dev_segments()[0]
            logger.info("***** Running evaluation: {0}-{1} *****".format(
                eval_segment, i_epoch))
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)

                eval_loss += tmp_eval_loss.mean().item()

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
            if eval_result > best_result:
                best_result = eval_result
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(
                    args.output_dir, "{0}.pt".format(args.task_name))
                torch.save(model_to_save.state_dict(), output_model_file)
                logger.info(
                    "  Saved best model to {0}".format(output_model_file))

    # delete unused variables
    del optimizer
    del param_optimizer
    del optimizer_grouped_parameters

    # Load a trained model that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        logger.info("***** CUDA.empty_cache() *****")
        torch.cuda.empty_cache()
        del model

        output_model_file = os.path.join(args.output_dir,
                                         "{0}.pt".format(args.task_name))
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=model_state_dict,
            num_labels=num_labels)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_set_list = []
        for eval_segment in processor.get_dev_segments():
            eval_examples = processor.get_dev_examples(args.data_dir,
                                                       segment=eval_segment)
            eval_set_list.append((eval_segment, eval_examples))
            break

        for eval_segment, eval_examples in eval_set_list:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            logger.info("***** Running evaluation: %s *****", eval_segment)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)

                eval_loss += tmp_eval_loss.mean().item()

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
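`convert_features_to_dataset` is not shown; a sketch consistent with how it is called here, assuming each feature object exposes input_ids, input_mask, segment_ids, and label_id:

import torch
from torch.utils.data import TensorDataset


def convert_features_to_dataset(features, lbl_type):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=lbl_type)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)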
Example #11
def npoclass(inputs,
             gpu_core=True,
             model_path=None,
             ntee_type='bc',
             n_jobs=4,
             backend='multiprocessing',
             batch_size_dl=64,
             verbose=1):

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Check model files.
    if ntee_type == 'bc' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_bc.zip, unzip, and specify model_path (default set to None)."
        )
    if ntee_type == 'mg' and model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_mg.zip, unzip, and specify model_path (default set to None)."
        )

    # Check ntee type.
    if ntee_type == 'bc':
        le_file_name = 'le_broad_cat.pkl'
    elif ntee_type == 'mg':
        le_file_name = 'le_major_group.pkl'
    else:
        raise ValueError(
            "ntee_type must be 'bc' (broad category) or 'mg' (major group)")

    # Read model and label encoder, if not read.
    global model_loaded, tokenizer_loaded, label_encoder
    try:
        assert model_loaded
        assert tokenizer_loaded
        assert label_encoder
    except (NameError, AssertionError):
        # Load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(
            model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder.
        with open(model_path + le_file_name, 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)

    # Select acceleration method.
    if gpu_core == True and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(),
              'Using GPU:', torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()
    print('Encoding inputs ...')
    sleep(.5)  # Pause a second for better printing results.

    # Encode inputs.
    global func_encode_string, func_encode_string_batch  # Define as global, otherwise cannot pickle or very slow.

    def func_encode_string(text_string):
        encoded_dict = tokenizer_loaded.encode_plus(
            text_string,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            truncation='longest_first',
            padding='max_length',  # Max length accepted by model.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        return encoded_dict

    def func_encode_string_batch(text_strings):
        encoded_dicts = []
        for text_string in text_strings:
            encoded_dicts += [func_encode_string(text_string)]
        return encoded_dicts

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []
    # Encode input string(s).
    if type(inputs) == list:
        if backend == 'multiprocessing':  # Multiprocessing is faster than loky in processing large objects.
            encoded_outputs = Parallel(
                n_jobs=n_jobs,
                backend="multiprocessing",
                batch_size='auto',
                verbose=verbose)(delayed(func_encode_string)(text_string)
                                 for text_string in inputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'sequential':
            for text_string in tqdm(inputs):
                encoded_output = func_encode_string(text_string)
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'dask':
            with joblib.parallel_backend('dask'):
                n_jobs = len(
                    client.scheduler_info()['workers'])  # Get the number of workers.
                string_chunks = partition_all(
                    math.ceil(len(inputs) / n_jobs),
                    inputs)  # Collect into groups of size by worker numbers.
                encoded_outputs = Parallel(
                    n_jobs=-1, batch_size='auto', verbose=verbose)(
                        delayed(func_encode_string_batch)(text_strings)
                        for text_strings in string_chunks)
                encoded_outputs = itertools.chain(*encoded_outputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
    if type(inputs) == str:
        encoded_output = func_encode_string(inputs)
        input_ids = [encoded_output['input_ids']]
        attention_masks = [encoded_output['attention_mask']]

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Prepare dataloader for efficient calculation.
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data,
                                 sampler=pred_sampler,
                                 batch_size=batch_size_dl)

    # Start prediction.
    model_loaded.eval()
    logits_all = []
    print('Predicting categories ...')
    sleep(.5)  # Pause a second for better printing results.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model_loaded(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
        logits_all += outputs[0].tolist()

    # Calculate probabilities from the logits.
    logits_prob = tf.nn.sigmoid(logits_all).numpy().tolist()
    # Find the positions of max values in logits.
    logits_max = np.argmax(logits_prob, axis=1)
    # Transfer to labels.
    logits_labels = label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    result_list = []
    for list_index in range(0, len(logits_labels)):
        result_dict = {}
        result_dict['recommended'] = logits_labels[list_index]
        conf_prob = logits_prob[list_index][logits_max[list_index]]
        if conf_prob >= .99:
            result_dict['confidence'] = 'high (>=.99)'
        elif conf_prob >= .95:
            result_dict['confidence'] = 'medium (<.99|>=.95)'
        else:
            result_dict['confidence'] = 'low (<.95)'
        prob_dict = {}
        for label_index in range(0, len(label_encoder.classes_)):
            prob_dict[label_encoder.classes_[label_index]] = logits_prob[
                list_index][label_index]
        result_dict['probabilities'] = prob_dict
        result_list += [result_dict]

    return result_list
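An illustrative call (the input string and model_path here are placeholders, not from the original project):

results = npoclass(
    ["We provide free meals and shelter to homeless families."],
    gpu_core=False,
    model_path="./npoclass_model_bc/",
    ntee_type="bc",
)
print(results[0]["recommended"], results[0]["confidence"])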
Example #12
def evaluate(args):
    #set up model and device (hopefully cuda)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    if args.cont:
        print(args.cont_path)
        model = BertForSequenceClassification.from_pretrained(args.cont_path)
        tokenizer = BertTokenizer.from_pretrained(args.cont_path)
    else:
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(device)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    h = 0
    s = 0
    n = 0
    for f in os.listdir(args.datapath):
        if f.endswith("ham"):
            h += 1
        elif f.endswith("spam"):
            s += 1
        else:
            n += 1

    print(f"{h} {s} {n}")

    TP = 0
    TN = 0
    T1 = 0
    T2 = 0

    batch_list = getBatch(args.datapath, 1, tokenizer)
    for _ in trange(len(os.listdir(args.datapath))):

        batch, labels, masks = next(batch_list)
        inputs = torch.tensor(batch, dtype=torch.long, device=device)
        labels = torch.tensor(labels, dtype=torch.long, device=device)
        masks = torch.tensor(masks, dtype=torch.long, device=device)
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        with torch.no_grad():
            model.eval()
            outputs = model(inputs)
            pred = select(outputs[0])

            # check for errors
            if labels[0] == 0:  # expect ham
                if pred == 0:
                    TP += 1
                else:
                    T2 += 1
            if labels[0] == 1:  # expect spam
                if pred == 1:
                    TN += 1
                else:
                    T1 += 1

            # print(f"expected : produced -- {labels[0]} : {pred}")
            # print("message:\n" + tokenizer.decode(inputs[0].tolist()))

    print(f"TP: {TP}\tTN:{TN}\tT1: {T1}\tT2: {T2}")
Example #13
def train(datapath,
          outpath,
          seed,
          batch_size,
          epochs,
          save_steps,
          args,
          use_cuda=True):
    #set up model and device (hopefully cuda)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and use_cuda else "cpu")

    # if use_gpt:
    #     model = GPT2LMHeadModel.from_pretrained('gpt2')
    #     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # else:
    #     model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    #     tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    if args.cont:
        model = BertForSequenceClassification.from_pretrained(args.cont_path)
        tokenizer = BertTokenizer.from_pretrained(args.cont_path)
    else:
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(.9, .999),
                                 lr=2e-05)

    #setup rng seeds on all devices to ensure repeatable results
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    num_batches = len(os.listdir(datapath)) / batch_size
    # batch_list = getBatch(datapath, batch_size, tokenizer)
    batch_list = spam_file_man(datapath, batch_size, tokenizer)
    next(batch_list)

    avg_losses = []
    avg_loss = 0

    model.zero_grad()
    timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')

    for _ in trange(epochs, desc="Epochs"):
        for batch_num in tqdm(range(0 if not args.cont else args.cont_step,
                                    int(num_batches)),
                              desc="Batches"):
            #setup this batch.
            batch, labels, masks = next(batch_list)
            inputs = torch.tensor(batch, dtype=torch.long, device=device)
            labels = torch.tensor(labels, dtype=torch.long, device=device)
            masks = torch.tensor(masks, dtype=torch.long, device=device)
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            #feed input to model to train
            model.train()
            outputs = model(input_ids=inputs,
                            labels=labels,
                            attention_mask=masks)

            # if not use_gpt:
            #     # loss returned from transfoXL was broken
            #     first_pad = get_first_occ(inputs[0], -1)
            #     loss = outputs[0][0][:first_pad].mean()

            loss = outputs[0]
            avg_loss += loss.item()  # accumulate a detached float rather than the full graph

            #update parameters
            loss.backward()
            optimizer.step()
            model.zero_grad()

            if batch_num % save_steps == 0:
                print('CHECKPOINT')
                checkpoint_path = f"{fixpath(outpath)}{timestamp}/e{epochs}-num{batch_num}-size{batch_size}"
                if not os.path.exists(checkpoint_path):
                    os.makedirs(checkpoint_path)
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)

                avg = avg_loss / save_steps
                print(f"average loss: {avg}")
                avg_losses += [avg]
                avg_loss = 0  # reset the running loss for the next interval
                print('finished')

    print(avg_losses)
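
# A minimal sketch of the argument namespace that train() above expects; the flag
# names below are inferred from the attribute accesses (args.cont, args.cont_path,
# args.cont_step) and are assumptions, not the original CLI definition.
import argparse

def build_args():
    parser = argparse.ArgumentParser(description="fine-tune BERT for spam classification")
    parser.add_argument("--cont", action="store_true",
                        help="resume from a saved checkpoint instead of bert-base-uncased")
    parser.add_argument("--cont-path", dest="cont_path", type=str, default=None,
                        help="directory holding the checkpointed model and tokenizer")
    parser.add_argument("--cont-step", dest="cont_step", type=int, default=0,
                        help="batch index to resume from")
    return parser.parse_args()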
model_path = "E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\\epie_models"
model_path = "E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\\building_emotional_embeddings\models\idiomatic_dataset_with_sentiments\checkpoint-500\\"
# model_path = 'bert-base-uncased'
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertModel, BertForSequenceClassification
import torch
print('hi')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = TFBertModel.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
sentence = "Hello developmentation"
tokens = tokenizer.tokenize(sentence)
print(tokens)

# input_ids = tf.constant(tokenizer.encode(sentence))[None, :]  # Batch size 1
input_ids = torch.tensor(tokenizer.encode(sentence))[None, :]  # Batch size 1
outputs = model(input_ids)
logits = outputs[0]  # for a *ForSequenceClassification model, the first element is the classification logits
print(logits)
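
# An added follow-up (not part of the original snippet): turning the classification
# logits above into probabilities and a predicted label id.
import torch.nn.functional as F

with torch.no_grad():
    probs = F.softmax(model(input_ids)[0], dim=-1)  # shape: (1, num_labels)
    pred_id = int(torch.argmax(probs, dim=-1))
print(probs, pred_id)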
Example #15
def main(config):
    # Get pretrained tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_name)
    # Get dataloaders using tokenizer from untokenized corpus.
    train_loader, valid_loader, index_to_label = get_loaders(
        config.train_fn, tokenizer)

    print(
        '|train| =',
        len(train_loader) * config.batch_size,
        '|valid| =',
        len(valid_loader) * config.batch_size,
    )

    # Get pretrained model with specified softmax layer.
    model = BertForSequenceClassification.from_pretrained(
        config.pretrained_model_name, num_labels=len(index_to_label))

    if config.use_radam:
        optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr)
    else:
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = optim.AdamW(optimizer_grouped_parameters,
                                lr=config.lr,
                                eps=config.adam_epsilon)

    # By default, the model returns unnormalized logits (no softmax applied).
    # Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
    crit = nn.CrossEntropyLoss()

    n_total_iterations = len(train_loader) * config.n_epochs
    n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps,
                                                n_total_iterations)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        crit.cuda(config.gpu_id)

    # Start training.
    trainer = Trainer(config)
    model = trainer.train(
        model,
        crit,
        optimizer,
        scheduler,
        train_loader,
        valid_loader,
    )

    torch.save(
        {
            'rnn': None,
            'cnn': None,
            'bert': model.state_dict(),
            'config': config,
            'vocab': None,
            'classes': index_to_label,
            'tokenizer': tokenizer,
        }, config.model_fn)
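
# A quick numerical check, added to illustrate the CrossEntropyLoss comment above:
# CrossEntropyLoss on raw logits equals NLLLoss applied to log-softmaxed logits.
import torch
import torch.nn as nn

_logits = torch.randn(4, 3)             # batch of 4 samples, 3 classes
_target = torch.tensor([0, 2, 1, 2])
_ce = nn.CrossEntropyLoss()(_logits, _target)
_nll = nn.NLLLoss()(torch.log_softmax(_logits, dim=-1), _target)
assert torch.allclose(_ce, _nll)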
Example #16
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default="/home/jqu/Documents/data/XNLI/",
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument("--model_type",
                        type=str,
                        required=True,
                        help="distilbert|bert")
    parser.add_argument("--model_dir",
                        type=str,
                        required=True,
                        help="where the trained model locates")
    args = parser.parse_args()
    # load test dataset
    processor = processors["xnli"](language="en", train_language="en")
    examples = processor.get_test_examples(args.data_dir)

    if args.model_type == "bert":
        # prepare tokenizer
        tokenizer = BertTokenizer.from_pretrained(args.model_dir,
                                                  do_lower_case=False)

        model = BertForSequenceClassification.from_pretrained(args.model_dir)

    elif args.model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(args.model_dir,
                                                        do_lower_case=False)

        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_dir)

    elif args.model_type == "albert":
        tokenizer = AlbertTokenizer.from_pretrained(args.model_dir,
                                                    do_lower_case=False)

        model = AlbertForSequenceClassification.from_pretrained(args.model_dir)

    model.to("cuda:0")
    model.eval()

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=processor.get_labels(),
        max_length=128,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        mask_padding_with_zero=True)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)

    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=512)

    overall_preds = [[], []]
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            batch = tuple(t.to("cuda:0") for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert"] else None
                )  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            _, logits = outputs[:2]
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            overall_preds[0] += preds.tolist()

            out_label_ids = inputs["labels"].detach().cpu().numpy()
            overall_preds[1] += out_label_ids.tolist()
    # compute scores (sklearn expects y_true first, then y_pred)
    result = accuracy_score(overall_preds[1], overall_preds[0])
    print(f"Overall accuracy: {result}")
    confusion_score = confusion_matrix(overall_preds[1], overall_preds[0])
    print("confusion matrix:\n")
    print(confusion_score)
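    # An added follow-up (not in the original): per-class precision/recall/F1 for the
    # same predictions via sklearn's classification_report.
    from sklearn.metrics import classification_report
    print(classification_report(overall_preds[1], overall_preds[0], digits=4))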
Example #17
        labels_test,
        num_labels,
    ) = get_data(config)

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        val=raw_dev,
        val_labels=labels_dev,
        test=raw_test,
        test_labels=labels_test,
        collate_fn=collate_fn,
        **config.data,
    )

    model = BertForSequenceClassification.from_pretrained(
        config.hugging_face_model, num_labels=num_labels)

    logger.info(model)

    # Leave this hardcoded for now.
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad],
                      lr=1e-5)
    criterion = nn.CrossEntropyLoss()

    lm = BertPLModule(
        model,
        optimizer,
        criterion,
        metrics={"acc": FromLogits(pl.metrics.classification.Accuracy())},
    )
                data.self_train_prop) != 0 else [0] * len(data.self_train_prop)
    return data


if __name__ == '__main__':
    args = create_args()

    # load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           piece=args.piece,
                                           piece_model=args.piece_model)

    # load bert model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForSequenceClassification(config)
    model_state_dict = model.state_dict()
    print('Model parameter: {}'.format(
        sum(p.numel() for k, p in model_state_dict.items())))
    pre_state_dict = torch.load(args.pretrained_file)
    pre_state_dict = {
        k: v
        for k, v in pre_state_dict.items() if k in model_state_dict
    }
    model_state_dict.update(pre_state_dict)
    model.load_state_dict(model_state_dict)
    if args.cuda:
        model.cuda()

    # load data
    data = BERTCLDCDataReader(args, tokenizer)
def train(trainloader, valloader, model_name, num_label, epochs):
    model = BertForSequenceClassification.from_pretrained(model_name,
                                                          num_labels=num_label)
    clear_output()

    # run the model on the GPU and get the classification accuracy on the training set
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    model = model.to(device)
    pred, acc = get_predictions(model, trainloader, compute_acc=True)

    # use the Adam optimizer to update the parameters of the whole classification model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(epochs):

        running_loss = 0.0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        # training mode
        model.train()

        for data in trainloader:  # trainloader is an iterator over each batch
            tokens_tensors, segments_tensors, \
            masks_tensors, labels = [t.to(device) for t in data]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)

            loss = outputs[0]
            # backward
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # record the loss of the current batch
            running_loss += loss.item()

        # compute the classification accuracy
        logit, acc = get_predictions(model, trainloader, compute_acc=True)

        print('loss: %.3f, acc: %.3f' % (running_loss, acc))
        print("")
        print("Running Validation...")

        # # Put the model in evaluation mode--the dropout layers behave differently
        # # during evaluation.
        # model.eval()

        # # Evaluate data for one epoch
        # for data in valloader:
        #     tokens_tensors, segments_tensors, \
        #     masks_tensors, labels = [t.to(device) for t in data]

        #     # Telling the model not to compute or store gradients, saving memory and
        #     # speeding up validation
        #     with torch.no_grad():
        #         # Forward pass, calculate logit predictions.
        #         # This will return the logits rather than the loss because we have
        #         # not provided labels.
        #         # token_type_ids is the same as the "segment ids", which
        #         # differentiates sentence 1 and 2 in 2-sentence tasks.
        #         outputs = model(input_ids=tokens_tensors,
        #                     token_type_ids=segments_tensors,
        #                     attention_mask=masks_tensors,
        #                     labels=labels)

        #     # Get the "logits" output by the model. The "logits" are the output
        #     # values prior to applying an activation function like the softmax.
        #     logits = outputs[0]

        _, acc = get_predictions(model, valloader, compute_acc=True)
        # Move logits and labels to CPU
        #logits = logits.detach().cpu().numpy()
        #label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        # tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total accuracy.
        #eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        #nb_eval_steps += 1

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".format(acc))
    return model
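
# Side note (an added illustration, not part of the example above): clip_grad_norm_
# returns the total gradient norm *before* clipping, which is useful to log when
# watching for exploding gradients.
import torch

_tiny = torch.nn.Linear(4, 2)
torch.nn.functional.mse_loss(_tiny(torch.randn(8, 4)), torch.zeros(8, 2)).backward()
_pre_clip_norm = torch.nn.utils.clip_grad_norm_(_tiny.parameters(), max_norm=1.0)
print(f"gradient norm before clipping: {float(_pre_clip_norm):.4f}")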
Example #20
def train(args, states=None):

    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_tokens(
        input_id_path=config['input_id'],
        token_type_id_path=config['token_type_id'],
        attention_mask_path=config['attention_mask'],
        label_path=config['labels'],
    )

    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )

    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    # optimizer = AdamW(model.parameters(), lr=config['lr'])
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])

    total_train_steps = config['num_epochs'] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_train_steps,
    )

    best_metric = 0

    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(
            f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_ids, token_type_ids, attention_mask, labels = data

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            _, logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # probs = F.softmax(logits, dim=1)

            # backprop
            loss = loss_function(logits, labels)
            loss.backward()

            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # update/optimize
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir,
                                     step=i,
                                     states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
Example #21
#qscd
d_lab = dict()
d_lab["questioning"] = 0
d_lab["support"] = 1
d_lab["commenting"] = 2
d_lab["denying"] = 3

train = pd.read_csv("../Fine-Tuning/CSV_Stance/train_semeval_raw.csv")
val = pd.read_csv("../Fine-Tuning/CSV_Stance/dev_semeval_raw.csv")
test = pd.read_csv("../Fine-Tuning/CSV_Stance/test_semeval_raw.csv")

#dir_path='../../model_save/Dos-Fases-all_Stance_4epochs/'
dir_path = '../../model_save/Dos-Fases-all_Stance/'
tokenizer_loaded = BertTokenizer.from_pretrained(
    dir_path)  #'bert-base-uncased')
model_loaded = BertForSequenceClassification.from_pretrained(
    dir_path, num_labels=4)  #'bert-base-uncased',  num_labels=4)

idx_2_token = tokenizer_loaded.ids_to_tokens

archivo = open(dir_path + 'vocab.txt', 'r')
Word2Index = {word.strip(): i for i, word in enumerate(archivo.readlines())}

M_BERT_space = model_loaded.bert.embeddings.word_embeddings.weight.detach(
).cpu().numpy()
transformer = Normalizer().fit(M_BERT_space)  # fit does nothing.
M_BERT_space = transformer.transform(M_BERT_space)


def my_normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
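
# An added usage sketch: with the L2-normalized embedding matrix and Word2Index above,
# cosine similarity between two vocabulary items reduces to a dot product.
def cosine_sim(w1, w2):
    v1 = M_BERT_space[Word2Index[w1]]
    v2 = M_BERT_space[Word2Index[w2]]
    return float(np.dot(v1, v2))  # rows are already unit-length after Normalizer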
Example #22
def main():
    # training settings
    def get_args():
        parser = ArgumentParser(description='QQPairs')
        parser.add_argument('--name', type=str,
                            default='QQPairs', metavar='S',
                            help="Model name")
        parser.add_argument('--checkpoint', type=str,
                            default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--model', type=str,
                            default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--batch-size', type=int, default=32, metavar='N',
                             help='input batch size for training (default: 32)')
        parser.add_argument('--epochs', type=int, default=1, metavar='N',
                             help='number of epochs to train (default: 1)')
        parser.add_argument('--lr', type=float, default=1e-5, metavar='LR',
                             help='learning rate (default: 1e-5)')
        parser.add_argument('--seed', type=int, default=1, metavar='S',
                             help='random seed (default: 1)')
        parser.add_argument('--num-workers', type=int, default=0, metavar='N',
                             help='number of CPU cores (default: 0)')
        parser.add_argument('--num-labels', type=int, default=2, metavar='N',
                             help='number of labels to classify (default: 2)')
        parser.add_argument('--l2', type=float, default=0.01, metavar='LR',
                             help='l2 regularization weight (default: 0.01)')
        parser.add_argument('--max-seq-length', type=int, default=84, metavar='N',
                             help='max sequence length for encoding (default: 84)')
        parser.add_argument('--warmup-proportion', type=float, default=0.1, metavar='N',
                             help='Warmup proportion (default: 0.1)')
        parser.add_argument('--embed-batch-size', type=int, default=1, metavar='N',
                             help='Embedding batch size emission; (default: 1)')
        args = parser.parse_args()
        return args

    args = get_args()

    # set seeds and determinism
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.amp.autocast(enabled=True)  # note: autocast is a context manager; a bare call here has no lasting effect

    # set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # build ds
    train_ds = QQP(type='train', transform=Tokenize_Transform(args, logger))

    # build ds
    dev_ds = QQP(type='dev', transform=Tokenize_Transform(args, logger))

    # create training dataloader
    train_dataloader = DataLoader(train_ds,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=False)

    # create embed dataloader
    train_embed_dataloader = DataLoader(train_ds,
                                batch_size=args.embed_batch_size,
                                shuffle=True,
                                num_workers=args.num_workers,
                                drop_last=False)

    # create embed dataloader
    dev_embed_dataloader = DataLoader(dev_ds,
                                batch_size=args.embed_batch_size,
                                shuffle=True,
                                num_workers=args.num_workers,
                                drop_last=False)

    # load the model
    model = BertForSequenceClassification.from_pretrained(args.checkpoint,
                                                          num_labels=args.num_labels).to(device)

    # create gradient scaler for mixed precision
    scaler = GradScaler()

    # set optimizer
    param_optimizer = list(model.named_parameters())

    # exclude these from regularization
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # give l2 regularization to any parameter that is not named after no_decay list
    # give no l2 regulariation to any bias parameter or layernorm bias/weight
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.l2},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # set optimizer
    optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.lr,
                              correct_bias=False,
                              weight_decay=args.l2)

    num_train_optimization_steps = int(len(train_ds) / args.batch_size) * args.epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=num_train_optimization_steps,
                                                num_warmup_steps=args.warmup_proportion * num_train_optimization_steps)

    # set epochs
    epochs = args.epochs

    # set location and make if necessary
    if args.checkpoint == 'bert-base-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\'
    elif args.checkpoint == 'bert-large-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\bert_large\\'
    os.makedirs(checkpoint_location, exist_ok=True)

    # train
    best_loss = np.inf
    for epoch in range(1, epochs + 1):
        train_log = train(model, train_dataloader, scaler, optimizer, scheduler, device, args)
        logs = dict(train_log)
        if logs['loss'] < best_loss:
            # torch save
            torch.save(model.state_dict(), checkpoint_location + args.name + '_epoch_{}.pt'.format(epoch))
            best_loss = logs['loss']
        show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
        print(show_info)

    # now proceed to emit embeddings
    model = BertForSequenceClassification.from_pretrained(args.checkpoint,
                                                          num_labels=args.num_labels,
                                                          output_hidden_states=True).to(device)
    # load weights from 1 epoch
    model.load_state_dict(torch.load(checkpoint_location + args.name + '_epoch_1.pt'))

    # export embeddings
    emit_train_embeddings(train_embed_dataloader, train_ds, model, device, args)
    emit_dev_embeddings(dev_embed_dataloader, dev_ds, model, device, args)
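
# A hedged sketch (the train() used above is not shown here) of how the GradScaler is
# typically paired with autocast inside the loop; the batch dict keys are assumptions.
from torch.cuda.amp import autocast

def amp_train_step(model, batch, optimizer, scaler, device):
    optimizer.zero_grad()
    with autocast():                      # run the forward pass in mixed precision
        out = model(input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    labels=batch['labels'].to(device))
        loss = out[0]                     # first element is the loss when labels are given
    scaler.scale(loss).backward()         # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)                # unscales gradients, then calls optimizer.step()
    scaler.update()
    return loss.item()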
Example #23
textCNN.to(device)

config = BertConfig.from_json_file('./dataset/bert_config.json')
config.output_hidden_states = True
model = BertModel.from_pretrained(
    './model/bert_pre58_4/pytorch_model.bin', config=config)

model.cuda()
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3]).cuda()
model.to(device)

save_offset = 12

supreme_config = BertConfig.from_json_file('./dataset/bert_config.json')
supreme_config.num_labels = len(myDataset.cls_label_2_id)
model_ = BertForSequenceClassification(config=supreme_config)

model_.cuda()
model_ = torch.nn.DataParallel(model_, device_ids=[0, 1, 2, 3]).cuda()
model_.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam([{'params': model.parameters(), 'lr': 5e-5},
                        {'params': textCNN.parameters(), 'lr': 1e-3}], lr=1e-3, weight_decay=0.)

# %%
losses = []

num_epochs = 30
for epoch in range(num_epochs):
    train_count = 0
Example #24
dev_texts = [data[0] for data in dev_data]
dev_labels = [data[1] for data in dev_data]
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
dev_dataset = Dataset(dev_encodings, dev_labels)

test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = Dataset(test_encodings, test_labels)

# We keep the label of unlabeled data to track for accuracy of pseudo-labeling
unlabeled_texts = [data[0] for data in unlabeled_data]
unlabeled_labels = [data[1] for data in unlabeled_data]
unlabeled_encodings = tokenizer(unlabeled_texts, truncation=True, padding=True)
unlabeled_dataset = Dataset(unlabeled_encodings, unlabeled_labels)

# Build model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=config.class_num)

# Criterion & optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  #or AdamW

# Init Trainer
trainer = Trainer(config, model, loss_function, optimizer, args.save_path,
                  dev_dataset, test_dataset)

# Initial training (supervised learning)
trainer.initial_train(labeled_dataset)

# load checkpoint
checkpoint_path = trainer.sup_path + '/checkpoint.pt'
checkpoint = torch.load(checkpoint_path)
Example #25
    if( ifLIMIT ):
        X_train = X_train[:100]
        y_train = y_train[:100]
        X_test = X_test[:100]
        y_test = y_test[:100]

    with open('./module/label_preprocess.pkl' , 'rb') as input:
        label_preprocessing = pickle.load(input)

    print("create data loader...")
    train_loader = create_data_loader(X_train, y_train, batch_size_ = BATCH_SIZE)
    train_loader_1 = create_data_loader(X_train, y_train, batch_size_ = 1)
    test_loader = create_data_loader(X_test, y_test, batch_size_ = 1)

    print("create model...")
    model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, 
                                                          num_labels = len(y_train[0]))
    model.to(DEVICE)
    clear_output()

    optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

    loss_fn = nn.MSELoss(reduction='sum')

    print("start training...")

    state_of_the_art = 0
    for epoch in range(EPOCHS):
        running_loss = 0.0

        for data in train_loader:    
            input_ids, token_type_ids, attention_mask, labels = [t.to(DEVICE) for t in data]
Example #26
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    input_ids = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    attention_masks = torch.tensor(attention_masks)

    batch_size = 128  #256
    train_data = TensorDataset(input_ids, attention_masks, labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size,
                                  num_workers=4)

    model = BertForSequenceClassification.from_pretrained(bert_name,
                                                          num_labels=len(i2l))
    model = model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
Example #27
def crossvalidation_front_back():
    parser = argparse.ArgumentParser()

    parser.add_argument("--train_data_path", required=True, type=str)
    parser.add_argument("--output_dir", required=True, type=str)
    parser.add_argument("--cro_test_data_path", type=str)

    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--split_num", default=2, type=int)
    parser.add_argument("--config_file", type=str)
    parser.add_argument("--model_file", type=str)
    parser.add_argument("--eval_split", default=0.2, type=float)
    parser.add_argument("--test_split", default=0.1, type=float)
    parser.add_argument("--max_len", default=512, type=int)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--num_epochs", default=3, type=int)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)

    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    print("Setting the random seed...")
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    log_path = os.path.join(args.output_dir, "log")

    print("Reading data...")
    df_data = pd.read_csv(args.train_data_path, sep="\t")
    data = df_data['data'].tolist()
    label_set = sorted(list(set(df_data['label'].values)))
    labels = encode_labels(df_data['label'].tolist(), label_set)

    if args.cro_test_data_path is not None:
        print("Preparing the croatian test data...")
        cro_test_data, cro_test_labels = read_croatian_data(
            args.cro_test_data_path)
        cro_test_labels = encode_labels(cro_test_labels, label_set)

    print("Training model on the split number " + str(args.split_num) + "...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=args.do_lower_case)
    if args.config_file is not None and args.model_file is not None:
        config = BertConfig.from_pretrained(args.config_file,
                                            num_labels=len(label_set))
        model = BertForSequenceClassification.from_pretrained(args.model_file,
                                                              config=config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-cased', num_labels=len(label_set))

    test_data = data[(floor(len(data) * args.split_num * 0.1)):(
        floor(len(data) * (args.split_num + 1) * 0.1))]
    test_labels = labels[floor((len(labels) * args.split_num *
                                0.1)):floor((len(labels) *
                                             (args.split_num + 1) * 0.1))]
    train_data = data[:floor((len(data) * args.split_num * 0.1))] + data[floor(
        (len(data) * (args.split_num + 1) * 0.1)):]
    train_labels = labels[:floor((len(labels) * args.split_num *
                                  0.1))] + labels[floor((len(labels) *
                                                         (args.split_num + 1) *
                                                         0.1)):]
    train_data, eval_data, train_labels, eval_labels = train_test_split(
        train_data, train_labels, test_size=args.eval_split, random_state=42)
    print("Train label:")
    print(train_labels[0])
    print("Train data:")
    print(train_data[0])
    train_dataloader = cut_at_front_and_back(train_data, train_labels,
                                             tokenizer, args.max_len,
                                             args.batch_size)
    eval_dataloader = cut_at_front_and_back(eval_data, eval_labels, tokenizer,
                                            args.max_len, args.batch_size)
    test_dataloader = cut_at_front_and_back(test_data, test_labels, tokenizer,
                                            args.max_len, args.batch_size)
    if args.cro_test_data_path is not None:
        cro_test_dataloader = cut_at_front_and_back(cro_test_data,
                                                    cro_test_labels, tokenizer,
                                                    args.max_len,
                                                    args.batch_size)
    _, __ = bert_train(model,
                       device,
                       train_dataloader,
                       eval_dataloader,
                       args.output_dir,
                       args.num_epochs,
                       args.warmup_proportion,
                       args.weight_decay,
                       args.learning_rate,
                       args.adam_epsilon,
                       save_best=True)

    print("Testing the trained model on the current test split...")
    metrics = bert_evaluate(model, test_dataloader, device)
    with open(log_path, 'a') as f:
        f.write("Results for split nr. " + str(args.split_num) +
                " on current slo test:\n")
        f.write("Acc: " + str(metrics['accuracy']) + "\n")
        f.write("Recall: " + str(metrics['recall']) + "\n")
        f.write("Precision: " + str(metrics['precision']) + "\n")
        f.write("F1: " + str(metrics['f1']) + "\n")
        f.write("\n")

    if args.cro_test_data_path is not None:
        print("Testing the trained model on the croatian test set...")
        cro_metrics = bert_evaluate(model, cro_test_dataloader, device)
        with open(log_path, 'a') as f:
            f.write("Results for split nr. " + str(args.split_num) +
                    " on cro test set:\n")
            f.write("Acc: " + str(cro_metrics['accuracy']) + "\n")
            f.write("Recall: " + str(cro_metrics['recall']) + "\n")
            f.write("Precision: " + str(cro_metrics['precision']) + "\n")
            f.write("F1: " + str(cro_metrics['f1']) + "\n")
            f.write("\n")
    print("Done.")
Example #28
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    s_dir = args.model + str(
        args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(
            args.epochs
        ) + '-' + args.scheduler + '-' + args.tokenize + '-' + str(
            args.max_len) + '-' + str(args.seed)

    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)

    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers

    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)

    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # make dataset for pytorch.
    # train, val split

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Other options are available beyond the ones used here.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,  # output directory
        save_total_limit=2,  # number of total save model.
        save_steps=args.save_steps,  # model saving step.
        num_train_epochs=args.epochs,  # total number of training epochs
        learning_rate=args.lr,  # learning_rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=log_dir,  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=100,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
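
# A minimal compute_metrics the Trainer above could use (an assumption; the original
# implementation is not shown): the Trainer passes (logits, labels) at evaluation time.
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}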
Example #29
# In[38]:

test_inputs = torch.tensor(input_ids_test)
test_labels = torch.tensor(labels_test)
test_masks = torch.tensor(attention_masks_test)

# In[39]:

test_data = TensorDataset(test_inputs, test_masks, test_labels)
#test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# # Model and Parameters

# In[40]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3)
#model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)
model = nn.DataParallel(model)
model.to(device)

# In[41]:

logging.info("Model Loaded!")

# In[42]:

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
Example #30
def train_bert_uncased(t_config, p_config, s_config):

    device = torch.device('cuda')
    seed_everything(s_config.seed)

    train = pd.read_csv('../input/train.csv').sample(
        t_config.num_to_load + t_config.valid_size, random_state=s_config.seed)
    train = prepare_train_text(train, p_config)
    train = train.fillna(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    train_processed = get_tokenized_samples(t_config.MAX_SEQUENCE_LENGTH,
                                            tokenizer, train['text_proc'])

    sequences = train_processed
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    MyModel = BertForSequenceClassification.from_pretrained(
        'bert-base-cased', num_labels=t_config.num_labels)
    MyModel.to(device)

    # Prepare target
    target_train = train['target'].values[:t_config.num_to_load]
    target_train_aux = train[[
        'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
    ]].values[:t_config.num_to_load]
    target_train_identity = train[identity_columns].values[:t_config.
                                                           num_to_load]

    # Prepare training data
    inputs_train = train_processed[:t_config.num_to_load]
    weight_train = train['weight'].values[:t_config.num_to_load]
    lengths_train = lengths[:t_config.num_to_load]

    inputs_train = torch.tensor(inputs_train, dtype=torch.int64)
    Target_train = torch.Tensor(target_train)
    Target_train_aux = torch.Tensor(target_train_aux)
    Target_train_identity = torch.Tensor(target_train_identity)
    weight_train = torch.Tensor(weight_train)
    Lengths_train = torch.tensor(lengths_train, dtype=torch.int64)

    # Prepare dataset
    train_dataset = data.TensorDataset(inputs_train, Target_train,
                                       Target_train_aux, Target_train_identity,
                                       weight_train, Lengths_train)

    ids_train = lengths_train.argsort(kind="stable")
    ids_train_new = resort_index(ids_train, t_config.num_of_bucket,
                                 s_config.seed)

    train_loader = torch.utils.data.DataLoader(data.Subset(
        train_dataset, ids_train_new),
                                               batch_size=t_config.batch_size,
                                               collate_fn=clip_to_max_len,
                                               shuffle=False)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in list(MyModel.named_parameters())
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in list(MyModel.named_parameters())
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=t_config.learning_rate,
                         betas=[0.9, 0.999],
                         warmup=t_config.warmup,
                         t_total=t_config.num_epoch * len(train_loader) //
                         t_config.accumulation_steps)

    i = 0
    for n, p in list(MyModel.named_parameters()):
        if i < 10:
            p.requires_grad = False
        i += 1

    p = train['target'].mean()
    likelihood = np.log(p / (1 - p))
    model_bias = torch.tensor(likelihood).type(torch.float)
    MyModel.classifier.bias = nn.Parameter(model_bias.to(device))

    MyModel, optimizer = amp.initialize(MyModel,
                                        optimizer,
                                        opt_level="O1",
                                        verbosity=0)

    for epoch in range(t_config.num_epoch):
        i = 0

        print('Training start')

        optimizer.zero_grad()
        MyModel.train()
        for batch_idx, (input, target, target_aux, target_identity,
                        sample_weight) in tqdm_notebook(
                            enumerate(train_loader), total=len(train_loader)):

            y_pred = MyModel(
                input.to(device),
                attention_mask=(input > 0).to(device),
            )
            loss = F.binary_cross_entropy_with_logits(y_pred[0][:, 0],
                                                      target.to(device),
                                                      reduction='none')
            loss = (loss * sample_weight.to(device)).sum() / (
                sample_weight.sum().to(device))
            loss_aux = F.binary_cross_entropy_with_logits(
                y_pred[0][:, 1:6], target_aux.to(device),
                reduction='none').mean(axis=1)
            loss_aux = (loss_aux * sample_weight.to(device)).sum() / (
                sample_weight.sum().to(device))
            loss += loss_aux
            if t_config.num_labels == 15:
                loss_identity = F.binary_cross_entropy_with_logits(
                    y_pred[0][:, 6:],
                    target_identity.to(device),
                    reduction='none').mean(axis=1)
                loss_identity = (loss_identity * sample_weight.to(device)
                                 ).sum() / (sample_weight.sum().to(device))
                loss += loss_identity

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (i + 1) % t_config.accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            i += 1

        torch.save(
            {
                'model_state_dict': MyModel.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{t_config.PATH}_{s_config.seed}')