Example No. 1
def test_lr_decays(model, dataset):
    _opt = optim.SGD(model.parameters(), lr=1)
    # dampingdelay=3, dampingfactor=2: the damping doubles every 3 epochs; once the
    # batch size reaches max_batch_size=8, further damping halves the learning rate instead
    opt = GeoDamp(
        model,
        dataset,
        _opt,
        dwell=1,
        initial_batch_size=4,
        dampingdelay=3,
        dampingfactor=2,
        max_batch_size=8,
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    damping_factor = df.damping / 4

    # Batch size and damping never decrease; the learning rate never increases
    assert (np.diff(df.batch_size) >= 0).all()
    assert (np.diff(df.lr_) <= 0).all()
    assert (np.diff(damping_factor) >= 0).all()

    # Make sure the values change by the correct amounts
    assert set(damping_factor.unique()) == {1, 2, 4, 8, 16, 32}
    assert set(df.batch_size) == {4, 8}
    assert set(df.lr_) == {1, 1/2, 1/4, 1/8, 1/16}
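These examples omit their shared scaffolding: the damper classes and the experiment module come from the adadamp package, and model, dataset, and large_dataset are pytest fixtures. The sketch below shows what that scaffolding could look like; the fixture bodies and the exact import locations are assumptions for illustration, not the project's actual conftest.

# Hypothetical conftest.py sketch -- the fixture bodies are assumptions.
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import pytest
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset

from adadamp import AdaDamp, BaseDamper, GeoDamp, GradientDescent, PadaDamp
from adadamp import experiment


@pytest.fixture
def model():
    # A tiny classifier keeps the tests fast.
    return nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))


@pytest.fixture
def dataset():
    # Small random dataset standing in for MNIST-shaped inputs.
    X = torch.randn(128, 1, 28, 28)
    y = torch.randint(0, 10, (128,))
    return TensorDataset(X, y)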
Example No. 2
def test_padadamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = PadaDamp(
        model, dataset, _opt, batch_growth_rate=1, initial_batch_size=4, dwell=1
    )
    data: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    assert (df.damping >= 1).all()
Example No. 3
def test_gradient_descent(model, dataset):
    init_bs = 8
    _opt = optim.SGD(model.parameters(), lr=0.500)
    opt = GradientDescent(model, dataset, _opt)
    data: List[Dict[str, Any]] = []
    initial_loss = opt._get_loss()
    for epoch in range(5):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    assert (df.batch_loss.diff().dropna() < 0).all()
    assert (df.len_dataset == df.batch_size).all()
    assert np.allclose(df.epochs.diff().dropna(), 1)
Example No. 4
def test_basics(model, dataset, epochs=14):
    optimizer = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, dataset, optimizer, initial_batch_size=8)

    data: List[Dict[str, Any]] = []
    for epoch in range(1, epochs + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data

    df = pd.DataFrame(data)
    assert (df.model_updates * df.batch_size == df.num_examples).all()
    assert df.epochs.max() <= epochs + 2
    eg_per_epoch = df.num_examples.diff().iloc[1:]
    len_dataset = df.len_dataset.iloc[1:]
    assert all((eg_per_epoch - len_dataset) <= df.batch_size.iloc[1:])
Example No. 5
def test_adadamp(model, dataset):
    init_bs = 8
    _opt = optim.SGD(model.parameters(), lr=0.500)
    opt = AdaDamp(model, dataset, _opt, initial_batch_size=init_bs)
    data: List[Dict[str, Any]] = []
    initial_loss = opt._get_loss()
    for epoch in range(5):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)

    bs_hat = init_bs * df.loc[0, "_complete_loss"] / df._complete_loss
    bs_hat = bs_hat.values.astype(int) + 1
    bs = df.batch_size.values
    assert (bs == bs_hat).all()
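The final assertion encodes the rule AdaDamp is expected to follow here: the batch size grows in inverse proportion to how much loss remains, B_k = floor(B_0 * L_0 / L_k) + 1. A minimal restatement of the test's own computation (an illustration, not the library internals):

import numpy as np

def expected_adadamp_batch_size(initial_bs, initial_loss, losses):
    # B_k = floor(B_0 * L_0 / L_k) + 1, exactly as the assertion above computes it.
    bs_hat = initial_bs * initial_loss / np.asarray(losses)
    return bs_hat.astype(int) + 1

# Example: each time the loss halves, the expected batch size roughly doubles.
print(expected_adadamp_batch_size(8, 2.0, [2.0, 1.0, 0.5]))  # [ 9 17 33]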
Example No. 6
def test_large_batch_size(model, large_dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, large_dataset, _opt, initial_batch_size=1024)
    data: List[Dict[str, Any]] = []
    data2: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
        data.append(opt.meta)
        data2.append(meta)
    df = pd.DataFrame(data)

    # Make sure the loss is decreasing
    assert df.batch_loss.diff().median() < -0.01
    assert df.batch_loss.diff().mean() < -0.01
    assert 2.25 < df.loc[0, "batch_loss"]
    assert df.loc[15, "batch_loss"] < 2.06
Example No. 7
def test_geodamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = GeoDamp(model,
                  dataset,
                  _opt,
                  initial_batch_size=1,
                  dampingdelay=4,
                  dampingfactor=2)
    data: List[Dict[str, Any]] = []
    # Let GeoDamp train for at least one full epoch per call
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
        data.append(opt.meta)
    df = pd.DataFrame(data)
    # Each call to train should complete a whole number of epochs
    assert np.allclose(df.epochs, np.floor(df.epochs))
    counts = df.damping.value_counts()
    assert set(counts.index.astype(int)) == {1, 2, 4, 8}
    assert np.allclose(counts.unique(), 4)
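The value counts follow from GeoDamp's schedule: with dampingfactor=2 and dampingdelay=4, the damping is multiplied by 2 once every 4 epochs. A small sketch of the schedule the assertions above imply (an illustration of the expected behavior, not the library code):

dampingdelay, dampingfactor, n_epochs = 4, 2, 16
schedule = [dampingfactor ** (epoch // dampingdelay) for epoch in range(n_epochs)]
# Four epochs at damping=1, then four each at 2, 4, and 8 -- matching
# the value_counts assertions above.
assert sorted(set(schedule)) == [1, 2, 4, 8]
assert all(schedule.count(d) == 4 for d in (1, 2, 4, 8))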
Example No. 8
def test_dwell_init_geo_increase(model, dataset):
    dwell = 512
    _opt = optim.Adagrad(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    cbs = np.arange(64) + 1  # continuous batch size
    dbs = [[cbs[2 ** i]] * 2 ** i for i in range(4)]  # discrete bs
    dbs = sum(dbs, [])
    assert len(dbs) == 15
    # Because of exponential increase initially for geodamp
    assert (df.batch_size.iloc[1 : 1 + len(dbs)] <= np.array(dbs)).all()
Example No. 9
def test_avg_loss(model, dataset):
    """
    Test that BaseDamper._get_loss returns mean loss regardless of how many
    points are sampled.
    """
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, dataset, _opt)
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
    loss = [{
        "loss": opt._get_loss(frac=frac),
        "frac": frac,
        "repeat": repeat
    } for frac in np.linspace(0.5, 0.99, num=5) for repeat in range(5)]
    total_loss = opt._get_loss(frac=1)
    df = pd.DataFrame(loss)
    summary = df.pivot(index="frac", columns="repeat", values="loss")

    abs_error = np.abs(df.loss - total_loss)
    rel_error = abs_error / total_loss
    assert rel_error.max() <= 0.125
    assert np.percentile(rel_error, 50) <= 0.12
    assert 1.5 <= total_loss <= 2.2
    assert abs_error.max() <= 0.17
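The tolerances above are driven by sampling noise: per the docstring, _get_loss(frac=...) returns the mean per-example loss computed from a random fraction of the dataset, so the estimate tightens as frac approaches 1. A self-contained illustration of that effect in plain NumPy (independent of adadamp, with made-up per-example losses):

import numpy as np

rng = np.random.default_rng(0)
losses = rng.exponential(scale=2.0, size=1_000)  # stand-in per-example losses
full_mean = losses.mean()
for frac in np.linspace(0.5, 0.99, num=5):
    n = int(frac * len(losses))
    estimate = rng.choice(losses, size=n, replace=False).mean()
    rel_error = abs(estimate - full_mean) / full_mean
    # Sampling at least half of the data keeps the estimate close to the full mean.
    assert rel_error < 0.25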
Example No. 10
def test_dwell(dwell, model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)

    # Skip the initial geometric increase (covered by test_dwell_init_geo_increase)
    damping = df.damping.iloc[dwell:]

    chunks = [
        damping.iloc[dwell * k : dwell * (k + 1)].values
        for k in range(len(df) // dwell)
    ]
    chunks = [c for c in chunks if len(c)]
    if dwell > 1:
        assert all(np.allclose(np.diff(c), 0) for c in chunks[1:])
    else:
        assert all(len(c) <= 1 for c in chunks)
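The chunk check captures what dwell controls: the damping (and hence the batch size) is only recomputed every dwell model updates, so it is piecewise constant over windows of dwell updates. A minimal sketch of that hold-and-update behavior (an illustration under that reading of the test, not the library implementation):

import numpy as np

def dwell_quantize(damping, dwell):
    # Hold each damping value fixed for `dwell` consecutive model updates.
    damping = np.asarray(damping, dtype=float)
    idx = (np.arange(len(damping)) // dwell) * dwell
    return damping[idx]

raw = np.arange(1, 17)              # damping that would otherwise grow every update
held = dwell_quantize(raw, dwell=4)
# Within each window of 4 updates the damping does not change.
assert all(np.allclose(np.diff(chunk), 0) for chunk in held.reshape(-1, 4))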
Example No. 11
def test_main():
    from adadamp.experiment import train, test

    # Training settings
    args = SimpleNamespace(
        batch_size=1024,
        epochs=2,
        log_interval=10,
        lr=0.1,
        no_cuda=False,
        save_model=False,
        seed=1,
        test_batch_size=1000,
    )

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_set = datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    )
    test_set = datasets.MNIST(
        "../data",
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    )

    # Only for making tests run faster
    dataset, _ = torch.utils.data.random_split(
        train_set, [2000, len(train_set) - 2000])
    train_set, test_set = torch.utils.data.random_split(dataset, [1000, 1000])

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    model = Net().to(device)
    _optimizer = optim.SGD(model.parameters(), lr=args.lr)
    loss = F.nll_loss
    optimizer = PadaDamp(
        model=model,
        dataset=train_set,
        opt=_optimizer,
        loss=loss,
        device="cpu",
        batch_growth_rate=0.1,
        initial_batch_size=32,
        max_batch_size=1024,
    )

    print("Starting...")
    for epoch in range(1, args.epochs + 1):
        train(model=model, opt=optimizer, verbose=10)
        data = test(model=model, loss=loss, dataset=test_set)
        print(data)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")