示例#1
0
def train(args, data):
    """Train a BiDAF model on ``data.train_iter`` and return the snapshot
    with the best dev F1.

    Args:
        args: namespace with ``gpu``, ``exp_decay_rate``, ``learning_rate``,
            ``epoch`` and ``print_freq``.
        data: dataset object exposing ``WORD.vocab.vectors`` and
            ``train_iter`` (a torchtext-style iterator with an ``epoch``
            attribute — TODO confirm).

    Returns:
        A deep copy of the model with the highest dev F1 seen, or the
        initial model if no evaluation ran.
    """
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    # Keep exponential-moving-average shadow copies of every trainable
    # parameter; presumably `test` evaluates with the EMA weights — confirm.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    # Fix: initialize best_model so `return best_model` cannot raise
    # UnboundLocalError when no evaluation ever improves on the initial F1.
    best_model = copy.deepcopy(model)

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        # Span loss: cross-entropy over answer start and end indices.
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        # Refresh the EMA shadows after each optimizer step.
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)

            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()  # `test` likely switched to eval mode; restore it

    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    return best_model
示例#2
0
def train(args, data):
    """Train a BiDAF model with TensorBoard logging and return the snapshot
    with the best dev F1.

    Args:
        args: namespace with ``gpu``, ``exp_decay_rate``, ``learning_rate``,
            ``epoch``, ``print_freq`` and ``model_time`` (log-dir suffix).
        data: dataset object exposing ``WORD.vocab.vectors`` and
            ``train_iter``.

    Returns:
        A deep copy of the model with the highest dev F1 seen, or the
        initial model if no evaluation ran.
    """
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    # Exponential-moving-average shadows of the trainable parameters.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    # Fix: initialize best_model so `return best_model` cannot raise
    # UnboundLocalError when no evaluation ever improves on the initial F1.
    best_model = copy.deepcopy(model)

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        # Span loss: cross-entropy over answer start and end indices.
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        # Refresh the EMA shadows after each optimizer step.
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq  # evaluation step index

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print('train loss: {} / dev loss: {}'.format(loss, dev_loss) +
                  ' / dev EM: {} / dev F1: {}'.format(dev_exact, dev_f1))

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()  # `test` likely switched to eval mode; restore it

    writer.close()
    print('max dev EM: {} / max dev F1: {}'.format(max_dev_exact, max_dev_f1))

    return best_model
示例#3
0
def train(args, data):
    """Sequentially train one BiDAF model over several training sets (one
    per dev file), evaluate periodically, then run the final test set on
    the best snapshot.

    Args:
        args: namespace with ``load_model``, ``gpu``, ``exp_decay_rate``,
            ``model_name``, and parallel per-dataset lists ``dev_files``,
            ``print_freq``, ``learning_rate``, ``epoch``, plus
            ``test_files``.
        data: dataset object exposing lists ``train_iter``, ``dev_iter``,
            ``train``, ``test_iter`` and ``WORD.vocab.vectors``.

    Returns:
        The model snapshot with the best dev F1 (falls back to the final
        trained weights if no evaluation improved).
    """
    # Fix: the two branches built the same model; construct once and
    # optionally warm-start from a checkpoint.
    model = BiDAF(args, data.WORD.vocab.vectors)
    if args.load_model != "":
        model.load_state_dict(torch.load(args.load_model))
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Exponential-moving-average shadows of the trainable parameters.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    writer = SummaryWriter(log_dir='runs/' + args.model_name)
    best_model = None

    # One training stage per dataset; epoch count, print frequency and
    # learning rate come from parallel per-dataset lists in args.
    # Fix: removed leftover debug code (`embed(); exit(0)` aborted the
    # function before any training, and a loop printed non-leaf params).
    for iterator, dev_iter, dev_file_name, index, print_freq, lr in zip(
            data.train_iter, data.dev_iter, args.dev_files,
            range(len(data.train)), args.print_freq, args.learning_rate):
        optimizer = optim.Adadelta(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        model.train()
        loss, last_epoch = 0, 0
        max_dev_exact, max_dev_f1 = -1, -1
        print(f"Training with {dev_file_name}")
        print()
        for i, batch in tqdm(enumerate(iterator), total=len(iterator) * args.epoch[index], ncols=100):
            present_epoch = int(iterator.epoch)
            eva = False  # force an evaluation at each epoch boundary
            if present_epoch == args.epoch[index]:
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
                eva = True
            last_epoch = present_epoch

            p1, p2 = model(batch)

            optimizer.zero_grad()
            # Span loss: cross-entropy over answer start and end indices.
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            # Refresh the EMA shadows after each optimizer step.
            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)

            torch.cuda.empty_cache()
            if (i + 1) % print_freq == 0 or eva:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data, dev_iter, dev_file_name)
                c = (i + 1) // print_freq  # evaluation step index

                writer.add_scalar('loss/train', loss, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print()
                print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()  # `test` likely switched to eval mode

    # Fix: best_model stays None if no evaluation improved; fall back to
    # the final trained weights so the test below cannot crash on None.
    if best_model is None:
        best_model = copy.deepcopy(model)

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    print("testing with test batch on best model")
    test_loss, test_exact, test_f1 = test(best_model, ema, args, data, list(data.test_iter)[-1], args.test_files[-1])

    print(f'test loss: {test_loss:.3f}'
          f' / test EM: {test_exact:.3f} / test F1: {test_f1:.3f}')
    return best_model
示例#4
0
def train(args, data):
    """Train a BiDAF model (SQuAD-2.0-style variant with has-answer /
    no-answer dev metrics) and return the snapshot with the best dev F1.

    Args:
        args: namespace with ``gpu``, ``exp_decay_rate``, ``learning_rate``,
            ``epoch``, ``print_freq`` and ``model_time``; ``args.max_f1``
            is written as a side effect.
        data: dataset object exposing ``CONTEXT_WORD.vocab.vectors`` and
            ``train_iter``.

    Returns:
        A deep copy of the model with the highest dev F1 seen, or the
        initial model if no evaluation ran.
    """
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.CONTEXT_WORD.vocab.vectors).to(device)

    num = count_parameters(model)
    # Fix: corrected "paramter" typo in the log message.
    print(f'parameter {num}')

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    # Exponential-moving-average shadows of the trainable parameters.
    # NOTE(review): registered after the DataParallel wrap, so names carry
    # a "module." prefix — verify `test` expects that.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    # Fix: initialize best_model so `return best_model` cannot raise
    # UnboundLocalError when no evaluation ever improves on the initial F1.
    best_model = copy.deepcopy(model)
    print('totally {} epoch'.format(args.epoch))

    sys.stdout.flush()
    iterator = data.train_iter
    iterator.repeat = True
    for i, batch in enumerate(iterator):

        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            print('present_epoch value:', present_epoch)
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        # Model takes char- and word-level inputs plus lengths explicitly.
        p1, p2 = model(batch.c_char, batch.q_char, batch.c_word[0], batch.q_word[0], batch.c_word[1], batch.q_word[1])
        optimizer.zero_grad()
        # Span loss: cross-entropy over answer start and end indices.
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        # Refresh the EMA shadows after each optimizer step.
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1, dev_hasans_exact, dev_hasans_f1, dev_noans_exact, dev_noans_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq  # evaluation step index

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}'
                  f' / dev hasans EM: {dev_hasans_exact} / dev hasans F1: {dev_hasans_f1}'
                  f' / dev noans EM: {dev_noans_exact} / dev noans F1: {dev_noans_f1}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()  # `test` likely switched to eval mode
        sys.stdout.flush()
    writer.close()
    args.max_f1 = max_dev_f1  # side effect consumed by the caller — confirm
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
示例#5
0
def train(args, data):
    """Train a BiDAF model for a fixed number of epochs, evaluating at the
    end of each epoch, and return the snapshot with the best dev F1.

    Args:
        args: namespace with ``gpu``, ``exp_decay_rate``, ``learning_rate``,
            ``epoch``, ``print_freq`` and ``model_time``.
        data: dataset object exposing ``WORD.vocab.vectors`` and
            ``train_iter`` (with a usable ``len``).

    Returns:
        A deep copy of the model with the highest dev F1 seen, or the
        initial model if no evaluation improved.
    """
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    # Exponential-moving-average shadows of the trainable parameters.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss = 0
    max_dev_exact, max_dev_f1 = -1, -1
    # Fix: the original deep-copied the whole model on EVERY batch; one
    # initial fallback copy here, plus a copy per dev-F1 improvement below,
    # preserves the "never return an unbound best_model" guarantee cheaply.
    best_model = copy.deepcopy(model)

    iterator = data.train_iter
    num_batch = len(iterator)
    for present_epoch in range(args.epoch):
        print('epoch', present_epoch + 1)
        for i, batch in enumerate(iterator):
            p1, p2 = model(batch)

            optimizer.zero_grad()

            # A batch of size 1 can yield 1-D logits; CrossEntropyLoss
            # expects (N, C), so restore the batch dimension.
            if len(p1.size()) == 1:
                p1 = p1.reshape(1, -1)
            if len(p2.size()) == 1:
                p2 = p2.reshape(1, -1)
            # Span loss: cross-entropy over answer start and end indices.
            batch_loss = criterion(p1, batch.s_idx) + criterion(
                p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            # Refresh the EMA shadows after each optimizer step.
            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)

            # Evaluate once per epoch, on the last batch.
            if i + 1 == num_batch:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
                c = (i + 1) // args.print_freq  # logging step index

                writer.add_scalar('loss/train', loss / num_batch, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print(
                    f'train loss: {loss/num_batch:.3f} / dev loss: {dev_loss:.3f}'
                    f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()  # `test` likely switched to eval mode

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
示例#6
0
def train(args, data):
    """Train a BiDAF model over ``data.data`` in fixed-size index windows
    (dev evaluation is disabled in this variant) and return the trained
    model.

    Args:
        args: namespace with ``gpu``, ``train_batch_size``,
            ``exp_decay_rate`` and ``learning_rate``.
        data: dataset object exposing ``data`` (indexable examples) and
            ``get_targ(b_id, e_id)`` returning start/end index targets.

    Returns:
        The trained model (no dev-based selection happens here).
    """
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)

    D_batch = args.train_batch_size
    # Exponential-moving-average shadows of the trainable parameters.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss = 0
    max_dev_exact, max_dev_f1 = -1, -1

    # Walk the dataset in contiguous [b_id, e_id) windows of D_batch
    # examples; a trailing partial window is skipped.
    i = 0
    while i + D_batch < len(data.data):
        b_id = i
        e_id = i + D_batch

        p1, p2 = model(data, b_id, e_id)

        optimizer.zero_grad()
        s_idx, e_idx = data.get_targ(b_id, e_id)
        # Span loss: cross-entropy over answer start and end indices.
        batch_loss = criterion(p1, s_idx) + criterion(p2, e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        # Refresh the EMA shadows after each optimizer step.
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        i += D_batch

    # Fix: best_model was never assigned anywhere (the dev-evaluation code
    # was commented out), so the original raised NameError on return.
    # Return the trained model instead.
    best_model = model
    # Dev metrics are never computed in this variant, so these stay at -1.
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
示例#7
0
def train(args, data):
    """Train a BiDAF model with Adam (EMA and dev evaluation are disabled
    in this variant) and return the trained model.

    Args:
        args: namespace with ``gpu``, ``learning_rate``, ``epoch`` and
            ``model_time``.
        data: dataset object exposing ``train_iter``.

    Returns:
        The trained model (no dev-based selection happens here).
    """
    device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)

    # If some loaded parameters are meant to stay fixed (not trained),
    # their requires_grad must be set False by the caller; they are
    # filtered out of the optimizer here.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(logdir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch
        try:
            p1, p2 = model(batch)
        except OSError:
            # Fix: the original fell through with `pass` and then used a
            # stale or undefined p1/p2; skip the failed batch instead.
            continue
        optimizer.zero_grad()
        # Span loss: cross-entropy over answer start and end indices.
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss = batch_loss.item()
        batch_loss.backward()
        optimizer.step()
        print("loss", loss)

    # Fix: close the writer (the original leaked it) and return the trained
    # model — best_model was never assigned (dev evaluation commented out),
    # so `return best_model` raised NameError.
    writer.close()
    best_model = model
    return best_model