import os
import sys
import json
import time
import random

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# These imports are inferred from the usage below: pad_sequences follows the
# Keras padding API, and the ELECTRA classes, AdamW, and the scheduler come
# from Hugging Face transformers.
from keras.preprocessing.sequence import pad_sequences
from transformers import (ElectraTokenizer, ElectraForMultipleChoice, AdamW,
                          get_linear_schedule_with_warmup)

# MAXLEN, get_default_device, and format_time are project-level helpers
# (maximum padded sequence length, device selection, and elapsed-time
# formatting) that are not defined in this snippet.


def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
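    # Optional addition, not part of the original script: for stricter
    # run-to-run reproducibility one could also set
    # torch.backends.cudnn.deterministic = True, at some cost in speed.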
    # Choose device
    device = get_default_device()

    with open(args.train_data_path) as f:
        train_data = json.load(f)

    electra_base = "google/electra-base-discriminator"
    electra_large = "google/electra-large-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(electra_large,
                                                 do_lower_case=True)

    labels = []
    input_ids = []
    token_type_ids = []

    for item in train_data:
        context = item["context"]
        question = item["question"]
        lab = item["label"]
        if lab == 3:
            # Remove unanswerable examples at training time
            continue
        labels.append(lab)
        three_inp_ids = []
        three_tok_type_ids = []
        three_answer_options = item["answers"][:3]
        for ans in three_answer_options:
            combo = context + " [SEP] " + question + " " + ans
            inp_ids = tokenizer.encode(combo)
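            # 102 is the [SEP] token id in the ELECTRA/BERT WordPiece vocabulary;
            # tokens up to and including the first [SEP] (the context) get segment
            # id 0, and the question plus answer option get segment id 1.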
            tok_type_ids = [
                0 if i <= inp_ids.index(102) else 1
                for i in range(len(inp_ids))
            ]
            three_inp_ids.append(inp_ids)
            three_tok_type_ids.append(tok_type_ids)
        three_inp_ids = pad_sequences(three_inp_ids,
                                      maxlen=MAXLEN,
                                      dtype="long",
                                      value=0,
                                      truncating="post",
                                      padding="post")
        three_tok_type_ids = pad_sequences(three_tok_type_ids,
                                           maxlen=MAXLEN,
                                           dtype="long",
                                           value=0,
                                           truncating="post",
                                           padding="post")
        input_ids.append(three_inp_ids)
        token_type_ids.append(three_tok_type_ids)
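    # Each training example is now a (3, MAXLEN) block of ids, one row per
    # answer option, which matches the (batch_size, num_choices, seq_len)
    # layout ElectraForMultipleChoice expects after batching.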
    # Create attention masks
    attention_masks = []
    for sen in input_ids:
        sen_attention_masks = []
        for opt in sen:
            att_mask = [int(token_id > 0) for token_id in opt]
            sen_attention_masks.append(att_mask)
        attention_masks.append(sen_attention_masks)
    # Convert to torch tensors
    labels = torch.tensor(labels)
    labels = labels.long().to(device)
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.long().to(device)
    token_type_ids = torch.tensor(token_type_ids)
    token_type_ids = token_type_ids.long().to(device)
    attention_masks = torch.tensor(attention_masks)
    attention_masks = attention_masks.long().to(device)

    # Create the DataLoader for training set.
    train_data = TensorDataset(input_ids, token_type_ids, attention_masks,
                               labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    model = ElectraForMultipleChoice.from_pretrained(electra_large).to(device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon
                      # weight_decay = 0.01
                      )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps)
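    # Warm the learning rate up over the first 10% of steps, then decay it
    # linearly to zero over the remainder of training.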

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(
            epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        model.zero_grad()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and step != 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_tok_typ_ids = batch[1].to(device)
            b_att_msks = batch[2].to(device)
            b_labs = batch[3].to(device)
            model.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_att_msks,
                            token_type_ids=b_tok_typ_ids,
                            labels=b_labs)
            loss = outputs[0]
            total_loss += loss.item()
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # model.zero_grad()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() -
                                                              t0)))

    # Save the model to a file
    file_path = args.save_path + 'electra_QA_MC_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)

# Example 2: the same ELECTRA fine-tuning loop, adapted to four-option
# multiple-choice data read from middle.json and high.json.

def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Choose device
    device = get_default_device()

    with open(args.train_data_path + "middle.json") as f:
        middle_data = json.load(f)
    with open(args.train_data_path + "high.json") as f:
        high_data = json.load(f)
    train_data = middle_data + high_data
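    # middle.json and high.json are presumably the middle- and high-school
    # splits of a RACE-style multiple-choice dataset; the two lists are
    # simply concatenated for training.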

    electra_base = "google/electra-base-discriminator"
    electra_large = "google/electra-large-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(electra_large,
                                                 do_lower_case=True)

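    # Map the answer letters "A"-"D" to 0-indexed class labels.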
    def asNum(x):
        if x == "A":
            return 0
        if x == "B":
            return 1
        if x == "C":
            return 2
        if x == "D":
            return 3

    labels = []
    input_ids = []
    token_type_ids = []

    for item in train_data:
        context = item["article"]
        questions = item["questions"]
        answers = item["answers"]
        options = item["options"]
        for qu_num in range(len(questions)):
            lab = asNum(answers[qu_num])
            labels.append(lab)
            four_inp_ids = []
            four_tok_type_ids = []
            question = questions[qu_num]
            opts = options[qu_num]
            for opt in opts:
                combo = context + " [SEP] " + question + " " + opt
                inp_ids = tokenizer.encode(combo)
                if len(inp_ids) > 512:
                    inp_ids = inp_ids[-512:]
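                # ELECTRA's position embeddings only cover 512 tokens, so long
                # articles are truncated from the front; note this also drops
                # the leading [CLS] token for such examples.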
                tok_type_ids = [
                    0 if i <= inp_ids.index(102) else 1
                    for i in range(len(inp_ids))
                ]
                four_inp_ids.append(inp_ids)
                four_tok_type_ids.append(tok_type_ids)
            four_inp_ids = pad_sequences(four_inp_ids,
                                         maxlen=MAXLEN,
                                         dtype="long",
                                         value=0,
                                         truncating="post",
                                         padding="post")
            four_tok_type_ids = pad_sequences(four_tok_type_ids,
                                              maxlen=MAXLEN,
                                              dtype="long",
                                              value=0,
                                              truncating="post",
                                              padding="post")
            input_ids.append(four_inp_ids)
            token_type_ids.append(four_tok_type_ids)

    # Create attention masks
    attention_masks = []
    for sen in input_ids:
        sen_attention_masks = []
        for opt in sen:
            att_mask = [int(token_id > 0) for token_id in opt]
            sen_attention_masks.append(att_mask)
        attention_masks.append(sen_attention_masks)
    # Convert to torch tensors
    labels = torch.tensor(labels)
    labels = labels.long().to(device)
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.long().to(device)
    token_type_ids = torch.tensor(token_type_ids)
    token_type_ids = token_type_ids.long().to(device)
    attention_masks = torch.tensor(attention_masks)
    attention_masks = attention_masks.long().to(device)

    # Create the DataLoader for training set.
    train_data = TensorDataset(input_ids, token_type_ids, attention_masks,
                               labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    model = ElectraForMultipleChoice.from_pretrained(electra_large).to(device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon
                      # weight_decay = 0.01
                      )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(
            epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        model.zero_grad()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and step != 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_tok_typ_ids = batch[1].to(device)
            b_att_msks = batch[2].to(device)
            b_labs = batch[3].to(device)
            model.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_att_msks,
                            token_type_ids=b_tok_typ_ids,
                            labels=b_labs)
            loss = outputs[0]
            total_loss += loss.item()
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # if (step+1) % accumulation_steps == 0:
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # model.zero_grad()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() -
                                                              t0)))

    # Save the model to a file
    file_path = args.save_path + 'electra_QA_MC_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)
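    # A model saved this way can be restored in full with
    # torch.load(file_path, map_location=device); saving model.state_dict()
    # instead would be the more portable convention, but this script
    # serialises the whole module.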