def test_padadamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = PadaDamp(
        model, dataset, _opt, batch_growth_rate=1, initial_batch_size=4, dwell=1
    )
    data: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    assert (df.damping >= 1).all()
def test_dwell_init_geo_increase(model, dataset):
    dwell = 512
    _opt = optim.Adagrad(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    cbs = np.arange(64) + 1  # continuous batch sizes
    dbs = [[cbs[2 ** i]] * 2 ** i for i in range(4)]  # discrete batch sizes
    dbs = sum(dbs, [])
    assert len(dbs) == 15
    # Because of the exponential increase initially for geodamp
    assert (df.batch_size.iloc[1 : 1 + len(dbs)] <= np.array(dbs)).all()
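# Worked out by hand, the bound asserted above evaluates to: cbs[2**i] for
# i = 0..3 gives 2, 3, 5, 9, each repeated 2**i times, so
#   dbs == [2, 3, 3, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9]  (len(dbs) == 15),
# i.e. the batch size may at most double over each doubling window at startup.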
def test_dwell(dwell, model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    # Skip the first `dwell` updates because of the geometric increase at
    # initialization (tested in test_dwell_init_geo_increase)
    damping = df.damping.iloc[dwell:]
    chunks = [
        damping.iloc[dwell * k : dwell * (k + 1)].values
        for k in range(len(df) // dwell)
    ]
    chunks = [c for c in chunks if len(c)]
    if dwell > 1:
        # The damping must stay constant within each dwell-sized chunk
        assert all(np.allclose(np.diff(c), 0) for c in chunks[1:])
    else:
        assert all(len(c) <= 1 for c in chunks)
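# Illustration of the invariant checked above: with dwell=4, a damping trace
#   [d0, d0, d0, d0, d1, d1, d1, d1, d2, ...]
# passes, since np.diff within each 4-element chunk is all zeros; any trace
# whose damping changes mid-chunk fails. (The d0/d1/d2 values are illustrative.)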
def test_main():
    from adadamp.experiment import train, test

    # Training settings
    args = SimpleNamespace(
        batch_size=1024,
        epochs=2,
        log_interval=10,
        lr=0.1,
        no_cuda=False,
        save_model=False,
        seed=1,
        test_batch_size=1000,
    )
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    train_set = datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )
    test_set = datasets.MNIST(
        "../data",
        train=False,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )
    # Only for making tests run faster
    dataset, _ = torch.utils.data.random_split(
        train_set, [2000, len(train_set) - 2000]
    )
    train_set, test_set = torch.utils.data.random_split(dataset, [1000, 1000])

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    model = Net().to(device)
    _optimizer = optim.SGD(model.parameters(), lr=args.lr)
    loss = F.nll_loss
    optimizer = PadaDamp(
        model=model,
        dataset=train_set,
        opt=_optimizer,
        loss=loss,
        device="cpu",
        batch_growth_rate=0.1,
        initial_batch_size=32,
        max_batch_size=1024,
    )
    print("Starting...")
    for epoch in range(1, args.epochs + 1):
        train(model=model, opt=optimizer, verbose=10)
        data = test(model=model, loss=loss, dataset=test_set)
        print(data)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main(
    dataset: str = "fashionmnist",
    initial_batch_size: int = 64,
    epochs: int = 6,
    verbose: Union[int, bool] = False,
    lr: float = 1.0,
    cuda: bool = False,
    random_state: Optional[int] = None,  # seed to pass to BaseDamper
    init_seed: Optional[int] = None,  # seed for initialization
    tuning: bool = True,  # tuning seed
    damper: str = "geodamp",
    batch_growth_rate: float = 0.01,
    dampingfactor: Number = 5.0,
    dampingdelay: int = 5,
    max_batch_size: Optional[int] = None,
    test_freq: float = 1,
    approx_loss: bool = False,
    rho: float = 0.9,
    dwell: int = 1,
    approx_rate: bool = False,
    model: Optional[str] = None,
    momentum: Optional[Union[float, int]] = 0,
    nesterov: bool = False,
    weight_decay: float = 0,
) -> Tuple[List[Dict], List[Dict]]:
    # Get (tuning, random_state, init_seed)
    assert int(tuning) or isinstance(tuning, bool)
    assert isinstance(random_state, int)
    assert isinstance(init_seed, int)

    if "NUM_THREADS" in os.environ:
        v = os.environ["NUM_THREADS"]
        if v:
            print(f"NUM_THREADS={v} (int(v)={int(v)})")
            torch.set_num_threads(int(v))

    args: Dict[str, Any] = {
        "initial_batch_size": initial_batch_size,
        "max_batch_size": max_batch_size,
        "batch_growth_rate": batch_growth_rate,
        "dampingfactor": dampingfactor,
        "dampingdelay": dampingdelay,
        "epochs": epochs,
        "verbose": verbose,
        "lr": lr,
        "no_cuda": not cuda,
        "random_state": random_state,
        "init_seed": init_seed,
        "damper": damper,
        "dataset": dataset,
        "approx_loss": approx_loss,
        "test_freq": test_freq,
        "rho": rho,
        "dwell": dwell,
        "approx_rate": approx_rate,
        "nesterov": nesterov,
        "momentum": momentum,
        "weight_decay": weight_decay,
    }
    pprint(args)
    no_cuda = not cuda
    args["ident"] = ident(args)
    args["tuning"] = tuning

    use_cuda = not args["no_cuda"] and torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    _device = torch.device(device)

    _set_seed(args["init_seed"])
    transform_train = [
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
    transform_test = [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
    assert dataset in ["fashionmnist", "cifar10", "synthetic"]
    if dataset == "fashionmnist":
        _dir = "_traindata/fashionmnist/"
        train_set = FashionMNIST(
            _dir, train=True, transform=Compose(transform_train), download=True,
        )
        test_set = FashionMNIST(_dir, train=False, transform=Compose(transform_test))
        model = Net()
    elif dataset == "cifar10":
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
        transform_test = [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
        _dir = "_traindata/cifar10/"
        train_set = CIFAR10(
            _dir, train=True, transform=Compose(transform_train), download=True,
        )
        test_set = CIFAR10(_dir, train=False, transform=Compose(transform_test))
        if model == "wideresnet":
            model = WideResNet(16, 4, 0.3, 10)
        else:
            model = _get_resnet18()
    elif dataset == "synthetic":
        data_kwargs = {"n": 10_000, "d": 100}
        args.update(data_kwargs)
        train_set, test_set, data_stats = synth_dataset(**data_kwargs)
        args.update(data_stats)
        model = LinearNet(data_kwargs["d"])
    else:
        raise ValueError(
            f"dataset={dataset} not in ['fashionmnist', 'cifar10', 'synthetic']"
        )

    if tuning:
        train_size = int(0.8 * len(train_set))
        test_size = len(train_set) - train_size
        train_set, test_set = random_split(
            train_set, [train_size, test_size], random_state=int(tuning),
        )

    train_x = [x.abs().sum().item() for x, _ in train_set]
    train_y = [y for _, y in train_set]
    test_x = [x.abs().sum().item() for x, _ in test_set]
    test_y = [y for _, y in test_set]
    data_stats = {
        "train_x_sum": sum(train_x),
        "train_y_sum": sum(train_y),
        "test_x_sum": sum(test_x),
        "test_y_sum": sum(test_y),
        "len_train_x": len(train_x),
        "len_train_y": len(train_y),
        "len_test_x": len(test_x),
        "len_test_y": len(test_y),
        "tuning": int(tuning),
    }
    args.update(data_stats)
    pprint(data_stats)

    model = model.to(_device)
    _set_seed(args["random_state"])

    if args["damper"] == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=args.get("lr", 0.01))
    elif args["damper"] == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), rho=rho)
    else:
        if not args["nesterov"]:
            assert args["momentum"] == 0
        optimizer = optim.SGD(
            model.parameters(),
            lr=args["lr"],
            nesterov=args["nesterov"],
            momentum=args["momentum"],
            weight_decay=args["weight_decay"],
        )

    n_data = len(train_set)
    opt_args = [model, train_set, optimizer]
    opt_kwargs = {
        k: args[k] for k in ["initial_batch_size", "max_batch_size", "random_state"]
    }
    opt_kwargs["device"] = device
    if dataset == "synthetic":
        opt_kwargs["loss"] = F.mse_loss
    if dataset == "cifar10":
        opt_kwargs["loss"] = F.cross_entropy

    if args["damper"].lower() == "padadamp":
        if approx_rate:
            assert isinstance(max_batch_size, int)
            BM = max_batch_size
            B0 = initial_batch_size
            e = epochs
            n = n_data
            r_hat = 4 / 3 * (BM - B0) * (B0 + 2 * BM + 3)
            r_hat /= 2 * BM - 2 * B0 + 3 * e * n
            args["batch_growth_rate"] = r_hat
        opt = PadaDamp(
            *opt_args,
            batch_growth_rate=args["batch_growth_rate"],
            dwell=args["dwell"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "geodamp":
        opt = GeoDamp(
            *opt_args,
            dampingdelay=args["dampingdelay"],
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "geodamplr":
        opt = GeoDampLR(
            *opt_args,
            dampingdelay=args["dampingdelay"],
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "cntsdamplr":
        opt = CntsDampLR(
            *opt_args,
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "adadamp":
        opt = AdaDamp(
            *opt_args, approx_loss=approx_loss, dwell=args["dwell"], **opt_kwargs
        )
    elif args["damper"].lower() == "gd":
        opt = GradientDescent(*opt_args, **opt_kwargs)
    elif (
        args["damper"].lower() in ["adagrad", "adadelta", "sgd", "gd"]
        or args["damper"] is None
    ):
        opt = BaseDamper(*opt_args, **opt_kwargs)
    else:
        raise ValueError("argument damper not recognized")

    if dataset == "synthetic":
        pprint(data_stats)
        opt._meta["best_train_loss"] = data_stats["best_train_loss"]

    data, train_data = experiment.run(
        model=model,
        opt=opt,
        train_set=train_set,
        test_set=test_set,
        args=args,
        test_freq=test_freq,
        train_stats=dataset == "synthetic",
        verbose=verbose,
        device="cuda" if use_cuda else "cpu",
    )
    return data, train_data
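# A minimal usage sketch (an assumption, not part of the original script):
# run a short PadaDamp experiment by calling main() directly. All argument
# values below are illustrative; main() requires integer seeds, per its asserts.
if __name__ == "__main__":
    data, train_data = main(
        dataset="fashionmnist",
        damper="padadamp",
        epochs=2,
        random_state=42,  # hypothetical seed choice
        init_seed=42,  # hypothetical seed choice
        tuning=False,  # use the real train/test split, skip the 80/20 tuning split
    )
    print(f"{len(data)} test records, {len(train_data)} train records")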