Example #1
0
 def detect(self, img, min_distance=10):
     fns = [
         timeit(self.syms[i].transform,
                logger=self.log,
                name_fn=self.gpu_name) for i in xrange(self.levels)
     ]
     return self._detect(fns, img, min_distance)
Example #2
0
def gen_data_for(opts, task):
    bbs = BlockBlobService(account_name=auth.store_name(),
                           account_key=auth.store_key())
    sdimacs = from_blob(opts, bbs, task.bcnf, delete=False)
    ctx = solver.Context()
    s = solver.deserialize(ctx, sutil.mk_opts(opts), sdimacs)

    btfds = []
    new_bcnfs = []

    def push_tfd(tfd):
        btfds.append(to_blob(opts, bbs, sutil.tfd_to_py(tfd)))

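    # Only attempt a full check (and possibly cube splitting) when unit propagation alone is inconclusive.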
    propagate_status = s.propagate()
    if propagate_status == solver.Status.UNKNOWN:
        s_pre_check = s.clone(ctx)
        assert (s_pre_check.propagate() == solver.Status.UNKNOWN)

        check_status, check_time = util.timeit(s.check)

        if check_status == solver.Status.SAT:
            pass
        elif check_status == solver.Status.UNSAT:
            push_tfd(s_pre_check.to_tf_data_with_core())
            for tfd in s_pre_check.get_more_cores(
                    ctx=ctx,
                    max_tries=opts.find_max_tries,
                    percent_to_keep=opts.find_percent_to_keep):
                push_tfd(tfd)
        else:
            assert (check_status == solver.Status.UNKNOWN)
            s_pre_cube = s.clone(ctx)
            assert (s_pre_cube.propagate() == solver.Status.UNKNOWN)
            (cube_status, cubes), cube_time = util.timeit(s.cube)

            if cube_status == solver.Status.UNKNOWN:
                assert (len(cubes) in [1, 2])
                random.shuffle(cubes)
                for cube in cubes:
                    s_child = s.clone(ctx)
                    s_child.add(cube)
                    new_bcnfs.append(to_blob(opts, bbs, s_child.serialize()))

    return TaskResult(btfds=btfds, new_bcnfs=new_bcnfs)
Example #3
0
 def advance_batch():
   print("Step for model(%s) is %s"%(model.name, u.eval(model.step)))
   sess = u.get_default_session()
   # TODO: get rid of _sampled_labels
   sessrun([sampled_labels.initializer, _sampled_labels.initializer])
   if args.advance_batch:
     with u.timeit("advance_batch"):
       sessrun(advance_batch_op)
   sessrun(advance_step_op)
Example #4
0
def main() -> None:
    cmd = sys.argv[1]

    if cmd == "call":
        rpc.call(*sys.argv[2:])

    elif cmd == "rpc-step":
        payload = sys.argv[2]
        encoded = run_step("rpc-step", payload)
        sys.stdout.write(encoded)

    elif cmd == "stdin-rpc":
        payload = sys.argv[2]
        decoded = b64decode(payload)
        steps = pickle.loads(decoded)

        results = []

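        # Run each step as its target user: re-invoke this script via sudo when the step's user differs from the current one, otherwise run it in-process.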
        for step in steps:
            user = step['user']
            msg = step['payload']
            current_user = check_output(["id", "--user",
                                         "--name"]).decode().strip()

            step_payload = b64encode(pickle.dumps(msg, 0)).decode()

            args = ["python3", sys.argv[0], "rpc-step", step_payload]

            if user is not None and user != current_user:
                args = ["sudo", "-u", user] + args
                with timeit("rpc-step"):
                    result = check_output(args).decode()
                results.append(result)

            else:
                with timeit("rpc-int"):
                    result = run_step("rpc-int", step_payload)
                results.append(result)

        sys.stdout.write(",".join(results))

    else:
        assert 0, "unknown command"
Example #5
0
def _build(name: str, repo: str, commit: str):
    # This is responsible for creating a build/ directory and building in it

    exe = find_program("git", "/usr/bin")

    def git(cmd):
        log(f"{exe} {cmd}")
        return check_call([exe] + cmd.split())

    git_home = os.path.expanduser("~/git")
    ensure_dir(git_home, 0o700)

    dst = repo.split("/")[-1]

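    # Keep a bare mirror of the remote under ~/git, then fetch the requested commit from that mirror into a shallow build/ checkout.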
    with cd(git_home):
        local_repo = os.path.abspath(dst)

        if not os.path.exists(dst):
            git(f"init -q --bare {dst}")
        with cd(dst):
            git("config --local uploadpack.allowreachablesha1inwant true")
            rc, existing_repo = subprocess.getstatusoutput(f"{exe} remote get-url origin")
            if rc != 0:
                git(f"remote add origin {repo}")
            elif existing_repo.strip() != repo:
                git(f"remote set-url origin {repo}")

            with timeit("remote fetch"):
                git("fetch")

    cmd = "init -q build"
    git(cmd)

    with cd("build"):
        git(f"remote add origin {local_repo}")
        with timeit("local fetch"):
            git(f"fetch -q origin {commit} --depth 1")
        git(f"reset -q --hard {commit}")
        with timeit("build"):
            subprocess.check_call(f"./services/{name}/build.sh", stdout=sys.stderr)
Example #6
0
def backtrack(image, rows, cols, all_rows, all_cols, filter_interval=25):
    points = product(range(len(cols)), range(len(rows)))
    points = sorted(points,
                    key=lambda p: min(-abs(p[0] - len(cols) / 2),
                                      -abs(p[1] - len(rows) / 2)))
    counter = 0
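    # For each unknown cell, tentatively try '#' and then '.'; the cell is fixed when only one of the two choices survives the consequences check.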
    for x, y in points:
        if image[y][x] == '?':
            image[y][x] = '#'
            _, possible = consequences(image, rows, cols, all_rows, all_cols)
            image[y][x] = '.'
            if possible:
                _, possible = consequences(image, rows, cols, all_rows,
                                           all_cols)
                if not possible:
                    image[y][x] = '#'
                else:
                    image[y][x] = '?'
        if counter >= filter_interval:
            all_rows, all_cols = filter_domains(image, all_rows, all_cols,
                                                True)
            image, possible = consequences(image,
                                           rows,
                                           cols,
                                           all_rows,
                                           all_cols,
                                           safe=True)
            timeit('SHOW')
            draw(image)
            counter = -1
        counter += 1

    image, possible = consequences(image,
                                   rows,
                                   cols,
                                   all_rows,
                                   all_cols,
                                   safe=True)
    return image, possible
Example #7
0
  def update_stats(self):  # todo: split into separate stats/svd updates
    """Updates all covariance/SVD info of correctable factors."""
    s = self
    ops = []

    # update covariances
    #    s.grad.update()   # TODO: not needed
    #    s.grad2.update()
    
    for var in s:
      ops.append(s[var].A.cov_update_op)
      ops.append(s[var].B2.cov_update_op)

    with u.timeit("covariances"):
      u.run(ops)

    # update SVDs
    corrected_vars = list(s)
    with u.timeit("svd"):
      with s.write_lock():
        for var in s:
          if not dont_update_first_layer or s[var].A.svd.update_counter==0:
            s[var].A.svd.update()
          s[var].B2.svd.update()
Example #8
0
 def query(self, args):
     try:
         guesses, n_secs_gpu = util.timeit(self.sess.run,
                                           self.guesses,
                                           feed_dict={
                                               self.n_vars: args.n_vars,
                                               self.n_clauses:
                                               args.n_clauses,
                                               self.CL_idxs: args.CL_idxs
                                           })
         return {
             "n_secs_gpu": n_secs_gpu,
             "pi_core_var_logits": guesses.pi_core_var_logits
         }
     except tf.errors.OpError as e:
         util.log(kind='error', author='pyro-server', msg=str(e))
         return None
Example #9
0
def main():

    u.install_pdb_handler()
    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {logdir}")

    loss_type = 'CrossEntropy'

    d1 = args.data_width ** 2
    args.stats_batch_size = min(args.stats_batch_size, args.dataset_size)
    args.train_batch_size = min(args.train_batch_size, args.dataset_size)
    n = args.stats_batch_size
    o = 10
    d = [d1, 60, 60, 60, o]
    # dataset_size = args.dataset_size

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin, last_layer_linear=True)
    model = model.to(gl.device)
    u.mark_expensive(model.layers[0])    # to stop grad1/hess calculations on this layer
    print(model)

    try:
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name, dir='/tmp/wandb.runs')
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    #  optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width, dataset_size=args.dataset_size, loss_type=loss_type)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True)
    stats_iter = u.infinite_iter(stats_loader)
    stats_data, stats_targets = next(stats_iter)

    test_dataset = u.TinyMNIST(data_width=args.data_width, train=False, dataset_size=args.dataset_size, loss_type=loss_type)
    test_batch_size = min(args.dataset_size, 1000)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    else:   # loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        with u.timeit("validate"):
            if loss_type == 'CrossEntropy':
                val_accuracy, val_loss = validate(model, test_loader, f'test (stats_step {step})')
                # train_accuracy, train_loss = validate(model, train_loader, f'train (stats_step {step})')

                metrics = {'stats_step': step, 'val_accuracy': val_accuracy, 'val_loss': val_loss}
                u.log_scalars(metrics)

        data, targets = stats_data, stats_targets

        if not args.skip_stats:
            # Capture Hessian and gradient stats
            autograd_lib.enable_hooks()
            autograd_lib.clear_backprops(model)
            autograd_lib.clear_hess_backprops(model)
            with u.timeit("backprop_g"):
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward(retain_graph=True)
            with u.timeit("backprop_H"):
                autograd_lib.backprop_hess(output, hess_type=loss_type)
            autograd_lib.disable_hooks()   # TODO(y): use remove_hooks

            with u.timeit("compute_grad1"):
                autograd_lib.compute_grad1(model)
            with u.timeit("compute_hess"):
                autograd_lib.compute_hess(model)

            for (i, layer) in enumerate(model.layers):

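                # layers tagged via u.mark_expensive above are skipped for per-parameter stats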
                if hasattr(layer, 'expensive'):
                    continue

                param_names = {layer.weight: "weight", layer.bias: "bias"}
                for param in [layer.weight, layer.bias]:
                    # input/output layers are unreasonably expensive if not using Kronecker factoring
                    if d[i]*d[i+1] > 8000:
                        print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats')
                        continue

                    s = AttrDefault(str, {})  # dictionary-like object for layer stats

                    #############################
                    # Gradient stats
                    #############################
                    A_t = layer.activations
                    B_t = layer.backprops_list[0] * n
                    s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
                    s.mean_activation = torch.mean(A_t)
                    s.mean_backprop = torch.mean(B_t)

                    # empirical Fisher
                    G = param.grad1.reshape((n, -1))
                    g = G.mean(dim=0, keepdim=True)

                    u.nan_check(G)
                    with u.timeit(f'sigma-{i}'):
                        efisher = G.t() @ G / n
                        sigma = efisher - g.t() @ g
                        # sigma_spectrum =
                        s.sigma_l2 = u.sym_l2_norm(sigma)
                        s.sigma_erank = torch.trace(sigma)/s.sigma_l2

                    H = param.hess
                    lambda_regularizer = args.lmb * torch.eye(H.shape[0]).to(gl.device)
                    u.nan_check(H)

                    with u.timeit(f"invH-{i}"):
                        invH = torch.cholesky_inverse(H+lambda_regularizer)

                    with u.timeit(f"H_l2-{i}"):
                        s.H_l2 = u.sym_l2_norm(H)
                        s.iH_l2 = u.sym_l2_norm(invH)

                    with u.timeit(f"norms-{i}"):
                        s.H_fro = H.flatten().norm()
                        s.iH_fro = invH.flatten().norm()
                        s.grad_fro = g.flatten().norm()
                        s.param_fro = param.data.flatten().norm()

                    def loss_direction(dd: torch.Tensor, eps):
                        """loss improvement if we take step eps in direction dd"""
                        return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

                    def curv_direction(dd: torch.Tensor):
                        """Curvature in direction dd"""
                        return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

                    with u.timeit(f"pinvH-{i}"):
                        pinvH = u.pinv(H)

                    with u.timeit(f'curv-{i}'):
                        s.grad_curv = curv_direction(g)  # curvature (eigenvalue) in direction g
                        ndir = g @ pinvH  # newton direction
                        s.newton_curv = curv_direction(ndir)
                        setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                        s.step_openai = 1 / s.grad_curv if s.grad_curv else 1234567
                        s.step_div_inf = 2 / s.H_l2         # divergent step size for batch_size=infinity
                        s.step_div_1 = torch.tensor(2) / torch.trace(H)   # divergent step for batch_size=1

                        s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                        s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)   # replace with "quadratic_form"
                        s.regret_gradient = loss_direction(g, s.step_openai)

                    with u.timeit(f'rho-{i}'):
                        s.rho, s.lyap_erank, lyap_evals = u.truncated_lyapunov_rho(H, sigma)
                        s.step_div_1_adjusted = s.step_div_1/s.rho

                    with u.timeit(f"batch-{i}"):
                        s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                        s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n  # Gradient diversity / n
                        s.noise_variance_pinv = torch.trace(pinvH @ sigma)
                        s.H_erank = torch.trace(H) / s.H_l2
                        s.batch_jain_simple = 1 + s.H_erank
                        s.batch_jain_full = 1 + s.rho * s.H_erank

                    param_name = f"{layer.name}={param_names[param]}"
                    u.log_scalars(u.nest_stats(f"{param_name}", s))

                    H_evals = u.symeig_pos_evals(H)
                    sigma_evals = u.symeig_pos_evals(sigma)
                    u.log_spectrum(f'{param_name}/hess', H_evals)
                    u.log_spectrum(f'{param_name}/sigma', sigma_evals)
                    u.log_spectrum(f'{param_name}/lyap', lyap_evals)

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                optimizer.step()
                if args.weight_decay:
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.data.mul_(1-args.weight_decay)

                gl.increment_global_step(data.shape[0])

    gl.event_writer.close()
Example #10
0
	def main1(cnt):
		timeit(v.save, "ESV", times=cnt)
		timeit(v.read, "ESV", times=cnt)
Example #11
0
import sys
from util import timeit
sys.setrecursionlimit(10000)

if __name__ == '__main__':
    from algorithms.is_prime_number import *

    test_cases = [(30, False), (1, False), (4, False), (9, False), (16, False), (25, False), (36, False), (49, False),
                  (64, False), (81, False), (100, False), (121, False), (144, False), (169, False), (196, False),
                  (225, False), (256, False), (289, False), (324, False), (361, False), (400, False), (441, False),
                  (484, False), (529, False), (576, False), (625, False), (676, False), (729, False), (784, False),
                  (841, False), (907, True)]
    l = [a for a, _ in test_cases]*1000
    timeit(lambda: [is_prime_standard(el) for el in l])
    timeit(lambda: [is_prime(el) for el in l])
    print("Tree stuff")
    from algorithms.linear_recursion import test
    test()
Example #12
0
 def main1(cnt):
     timeit(v.save, "ESV", times=cnt)
     timeit(v.read, "ESV", times=cnt)
Example #13
0
def test_factored_stats_golden_values():
    """Test stats from values generated by non-factored version"""
    u.seed_random(1)
    u.install_pdb_handler()
    torch.set_default_dtype(torch.float32)

    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    args = parser.parse_args()

    logdir = u.create_local_logdir('/temp/runs/factored_test')
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print('logging to ', logdir)

    loss_type = 'LeastSquares'

    args.data_width = 2
    args.dataset_size = 5
    args.stats_batch_size = 5
    d1 = args.data_width**2
    args.stats_batch_size = args.dataset_size
    args.stats_steps = 1

    n = args.stats_batch_size
    o = 10
    d = [d1, o]

    model = u.SimpleFullyConnected2(d, bias=False, nonlin=0)
    model = model.to(gl.device)
    print(model)

    dataset = u.TinyMNIST(data_width=args.data_width,
                          dataset_size=args.dataset_size,
                          loss_type=loss_type)
    stats_loader = torch.utils.data.DataLoader(
        dataset, batch_size=args.stats_batch_size, shuffle=False)
    stats_iter = u.infinite_iter(stats_loader)
    stats_data, stats_targets = next(stats_iter)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    else:  # loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        data, targets = stats_data, stats_targets

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)

        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)
            autograd_lib.compute_hess(model, method='kron', attr_name='hess2')

        autograd_lib.compute_stats_factored(model)

        params = list(model.parameters())
        assert len(params) == 1
        new_values = params[0].stats
        golden_values = torch.load('test/factored.pt')

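        # Compare each newly computed statistic against its stored golden value, with looser tolerances (or skips) for the approximate ones.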
        for valname in new_values:
            print("Checking ", valname)
            if valname == 'sigma_l2':
                u.check_close(new_values[valname],
                              golden_values[valname],
                              atol=1e-2)  # sigma is approximate
            elif valname == 'sigma_erank':
                u.check_close(new_values[valname],
                              golden_values[valname],
                              atol=0.11)  # 1.0 vs 1.1
            elif valname in ['rho', 'step_div_1_adjusted', 'batch_jain_full']:
                continue  # lyapunov stats weren't computed correctly in golden set
            elif valname in ['batch_openai']:
                continue  # batch sizes depend on sigma which is approximate
            elif valname in ['noise_variance_pinv']:
                pass  # went from 0.22 to 0.014 after kron factoring (0.01 with full centering, 0.3 with no centering)
            elif valname in ['sparsity']:
                pass  # had a bug in old calc (using integer arithmetic)
            else:
                u.check_close(new_values[valname],
                              golden_values[valname],
                              rtol=1e-4,
                              atol=1e-6,
                              label=valname)

    gl.event_writer.close()
Example #14
0
 def read_lock(self):
   with u.timeit("read_lock"):
     with self.lock:
       yield
Example #15
0
        sim.taskmanager.Task("Hunt", sim.taskmanager.TaskType.GATHER),
        sim.taskmanager.Task("Cook", sim.taskmanager.TaskType.ACTIVITY),
        sim.taskmanager.Task("Clean", sim.taskmanager.TaskType.ACTIVITY),
        sim.taskmanager.Task("Gather Water", sim.taskmanager.TaskType.GATHER),
    ]
    task_manager.randomly_assign(tasks)

    # terrain
    terrain = sim.world.Terrain([
        sim.world.Resource("Hunt", 2, entity.Point(-55, -33, 0)),
        sim.world.Resource("Cook", 4, entity.Point(15, 25, 0)),
        sim.world.Resource("Clean", 2, entity.Point(0, 0, 0)),
        sim.world.Resource("Gather Water", 100, entity.Point(5, 5, 0)),
    ])

    # renderer
    renderer = Renderer()
    event_processor = EventProcessor()

    # run sim
    world = sim.world.World(terrain, entity_manager, task_manager)
    while True:
        sim.world.run(world, renderer, event_processor)
        time.sleep(0.16)


if __name__ == "__main__":
    os.system("cls")
    time_ran = util.timeit(run)
    print(f"Runtime: [bold green]{time_ran} [red]seconds")
Example #16
0
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except Exception:
            pass  # older TF without these rewriter options; keep rewrite_options=None

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

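    # Wrap the raw gradients, route them through kfac.correct, and hand the corrected gradients to Adam via apply_gradients.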
    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
Example #17
0
def main():
    u.reset_timeit()

    iters = 11
    n = 10000

    print(f"Benchmarking n={n}")

    ############################################################
    # Numpy
    ############################################################
    A = scipy.randn(n, n)  # random matrix
    A = A @ A.T  # positive definite matrix
    A = scipy.linalg.cholesky(A)  # upper triangular matrix
    b = scipy.randn(n)

    u.reset_timeit()
    for i in range(iters):
        with u.timeit('numpy'):
            scipy.linalg.solve_triangular(A, b)

    ############################################################
    # PyTorch GPU
    ############################################################
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A).cuda()
    b = torch.randn(n, 1).cuda()

    # prewarm
    torch.trtrs(b, A)
    for i in range(iters):
        torch.cuda.synchronize()
        with u.timeit('Pytorch GPU'):
            result = torch.trtrs(b, A)
            torch.cuda.synchronize()
        del result

    ############################################################
    # PyTorch CPU
    ############################################################
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A)
    b = torch.randn(n, 1)

    # prewarm
    (result, A_clone) = torch.trtrs(b, A)
    assert result.device.type == 'cpu'

    for i in range(iters):
        torch.cuda.synchronize()
        with u.timeit('Pytorch CPU'):
            result = torch.trtrs(b, A)
            torch.cuda.synchronize()
        del result

    ############################################################
    # PyTorch GPU
    ############################################################
    A = torch.randn(n, n)
    A = A @ A.t() + torch.diag(torch.ones(n))
    A = torch.potrf(A).cuda()
    b = torch.randn(n, 1).cuda()

    # prewarm
    (result, A_clone) = torch.trtrs(b, A)
    assert result.device.type == 'cuda'
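    # printing result[0, 0] inside the timed block forces a device-to-host copy, which acts as the
    # synchronization point in place of the commented-out torch.cuda.synchronize()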
    for i in range(iters):
        torch.cuda.synchronize()
        with u.timeit('Pytorch GPU'):
            (result, dummy) = torch.trtrs(b, A)
            print(result[0, 0])
            #      torch.cuda.synchronize()
        del result

    ############################################################
    # Tensorflow GPU
    ############################################################
    A = tf.random_normal((n, n)).gpu()
    b = tf.random_normal((n, 1)).gpu()
    A = A @ tf.transpose(A) + tf.diag(tf.ones(
        (n, )))  # bug, diag is needed, or Cholesky fails
    A = tf.cholesky(A)
    # bug, Should be able to do constant conversion, but fails with
    # Internal: failed to query device pointer for context: CUDA_ERROR_INVALID_VALUE
    #  A = tf.constant(A).gpu()
    #  b = tf.constant(b).gpu()

    # prewarm
    result = tf.contrib.eager.Variable(tf.zeros((n, 1)))
    result.assign(tf.linalg.triangular_solve(A, b))
    assert 'gpu' in result.device.lower()
    for i in range(iters):
        b += 1  # prevent caching
        with u.timeit('TF GPU'):
            result.assign(tf.linalg.triangular_solve(A, b))
            print(result[0, 0])

    ############################################################
    # Tensorflow CPU
    ############################################################
    A = tf.random_normal((n, n)).cpu()
    b = tf.random_normal((n, 1)).cpu()
    A = A @ tf.transpose(A) + tf.diag(tf.ones(
        (n, )))  # bug, diag is needed, or Cholesky fails
    A = tf.cholesky(A)
    A = A.cpu()
    b = b.cpu()

    # prewarm
    with tf.device('/cpu:0'):
        result = tf.contrib.eager.Variable(tf.zeros((n, 1)))
    result.assign(tf.linalg.triangular_solve(A, b))
    assert 'cpu' in result.device.lower()
    for i in range(iters):
        b += 1  # prevent caching
        with u.timeit('TF CPU'):
            result.assign(tf.linalg.triangular_solve(A, b))

    u.summarize_timeit()
Example #18
0
        #
        # add offset to value of (m, m) to get value at (y, x):
        # if square(m) is even, the max value for (m, m) lies on y-axis:
        # v += y - x  # move towards the max axis value
        # if square(m) is odd, the max value for (m, m) lies on x-axis
        # v += x - y
        v = m**2 - m + 1 + (y-x if m % 2 == 0 else x-y)
        r.append(v)
    return r


if __name__ == '__main__':
    for r in range(1, 7):
        tests = (f'{r} {c}' for c in range(1, 7))
        print(*number_spiral(*tests), sep='\t')

    print()
    benchmark = timeit(lambda *tests: number_spiral(*tests))
    benchmark('2 3', '1 1', '4 2')
''' terminal
1       2       9       10      25      26
4       3       8       11      24      27
5       6       7       12      23      28
16      15      14      13      22      29
17      18      19      20      21      30
36      35      34      33      32      31

run <lambda>('2 3', '1 1', '4 2')
got [8, 1, 15] in 0.0000378000 secs.
'''
Example #19
0
def main():

    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')

    parser.add_argument('--wandb',
                        type=int,
                        default=0,
                        help='log to weights and biases')
    parser.add_argument('--autograd_check',
                        type=int,
                        default=0,
                        help='autograd correctness checks')
    parser.add_argument('--logdir',
                        type=str,
                        default='/tmp/runs/curv_train_tiny/run')

    parser.add_argument('--nonlin',
                        type=int,
                        default=1,
                        help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--bias',
                        type=int,
                        default=1,
                        help="whether to add bias between layers")

    parser.add_argument('--layer',
                        type=int,
                        default=-1,
                        help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument(
        '--hess_samples',
        type=int,
        default=1,
        help='number of samples when sub-sampling outputs, 0 for exact hessian'
    )
    parser.add_argument('--hess_kfac',
                        type=int,
                        default=0,
                        help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho',
                        type=int,
                        default=0,
                        help='use expensive method to compute rho')
    parser.add_argument('--skip_stats',
                        type=int,
                        default=1,
                        help='skip all stats collection')

    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps',
                        type=int,
                        default=1000,
                        help="this many train steps between stat collection")
    parser.add_argument('--stats_steps',
                        type=int,
                        default=1000000,
                        help="total number of curvature stats collections")

    parser.add_argument('--full_batch',
                        type=int,
                        default=0,
                        help='do stats on the whole dataset')
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=2e-5)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--dropout', type=int, default=0)
    parser.add_argument('--swa', type=int, default=0)
    parser.add_argument('--lmb', type=float, default=1e-3)

    parser.add_argument('--train_batch_size', type=int, default=64)
    parser.add_argument('--stats_batch_size', type=int, default=10000)
    parser.add_argument('--uniform',
                        type=int,
                        default=0,
                        help='use uniform architecture (all layers same size)')
    parser.add_argument('--run_name', type=str, default='noname')

    gl.args = parser.parse_args()
    args = gl.args
    u.seed_random(1)

    gl.project_name = 'train_ciresan'
    u.setup_logdir_and_event_writer(args.run_name)
    print(f"Logging to {gl.logdir}")

    d1 = 28 * 28
    if args.uniform:
        d = [784, 784, 784, 784, 784, 784, 10]
    else:
        d = [784, 2500, 2000, 1500, 1000, 500, 10]
    o = 10
    n = args.stats_batch_size
    model = u.SimpleFullyConnected2(d,
                                    nonlin=args.nonlin,
                                    bias=args.bias,
                                    dropout=args.dropout)
    model = model.to(gl.device)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)
    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          original_targets=True,
                          dataset_size=args.dataset_size)
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    assert not args.full_batch, "fixme: validation still uses stats_iter"
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.stats_batch_size,
            shuffle=False,
            drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)
    else:
        stats_iter = None

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               train=False,
                               original_targets=True,
                               dataset_size=args.dataset_size)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.stats_batch_size,
                                              shuffle=False,
                                              drop_last=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    autograd_lib.add_hooks(model)
    autograd_lib.disable_hooks()

    gl.token_count = 0
    last_outer = 0
    for step in range(args.stats_steps):
        epoch = gl.token_count // 60000
        print(gl.token_count)
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        # compute validation loss
        if args.swa:
            model.eval()
            with u.timeit('swa'):
                base_opt = torch.optim.SGD(model.parameters(),
                                           lr=args.lr,
                                           momentum=args.momentum)
                opt = torchcontrib.optim.SWA(base_opt,
                                             swa_start=0,
                                             swa_freq=1,
                                             swa_lr=args.lr)
                for _ in range(100):
                    optimizer.zero_grad()
                    data, targets = next(train_iter)
                    model.zero_grad()
                    output = model(data)
                    loss = loss_fn(output, targets)
                    loss.backward()
                    opt.step()
                opt.swap_swa_sgd()

        with u.timeit("validate"):
            val_accuracy, val_loss = validate(model, test_loader,
                                              f'test (epoch {epoch})')
            train_accuracy, train_loss = validate(model, stats_loader,
                                                  f'train (epoch {epoch})')

        # save log
        metrics = {
            'epoch': epoch,
            'val_accuracy': val_accuracy,
            'val_loss': val_loss,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'lr': optimizer.param_groups[0]['lr'],
            'momentum': optimizer.param_groups[0].get('momentum', 0)
        }
        u.log_scalars(metrics)

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        if not args.skip_stats:
            autograd_lib.enable_hooks()
            autograd_lib.clear_backprops(model)
            autograd_lib.clear_hess_backprops(model)
            with u.timeit("backprop_g"):
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward(retain_graph=True)
            with u.timeit("backprop_H"):
                autograd_lib.backprop_hess(output, hess_type='CrossEntropy')
            autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

            with u.timeit("compute_grad1"):
                autograd_lib.compute_grad1(model)
            with u.timeit("compute_hess"):
                autograd_lib.compute_hess(model,
                                          method='kron',
                                          attr_name='hess2')
            autograd_lib.compute_stats_factored(model)

        for (i, layer) in enumerate(model.layers):
            param_names = {layer.weight: "weight", layer.bias: "bias"}
            for param in [layer.weight, layer.bias]:

                if param is None:
                    continue

                if not hasattr(param, 'stats'):
                    continue
                s = param.stats
                param_name = param_names[param]
                u.log_scalars(u.nest_stats(f"{param_name}", s))

        # gradient steps
        model.train()
        last_inner = 0
        for i in range(args.train_steps):
            if last_inner:
                u.log_scalars(
                    {"time/inner": 1000 * (time.perf_counter() - last_inner)})
            last_inner = time.perf_counter()

            optimizer.zero_grad()
            data, targets = next(train_iter)
            model.zero_grad()
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward()

            optimizer.step()
            if args.weight_decay:
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.data.mul_(1 - args.weight_decay)

            gl.token_count += data.shape[0]

    gl.event_writer.close()
Example #20
0
from subprocess import check_output
import util
import sys
import math
from util import N, timeit
from functools import partial
import multiprocessing
import pi_serial
import pi_multiprocessing
#import pi_mpi
import pi_openCL


def do_mpi(cpus):
	util.mpi_verbose = False #make sure pi_mpi.py prints only its result to console
	mpi_path = sys.path[0]
	#get result and strip trailing "\n"
	return check_output(["mpiexec -n " + str(int(cpus)) + " python ./pi_mpi.py"], shell=True)[:-2]

if __name__=='__main__':
	print("N: " + "{:,}".format(N))
	print("Pi: " + str(math.pi))
	pi_serial.run()
	pi_multiprocessing.run()
	pi_openCL.run()
	#do mpi:
	for cpus in range(multiprocessing.cpu_count()):
		timeit(partial(do_mpi, cpus+1), "MPI with " + str(cpus+1) + " CPUs")
Example #21
0
def main():
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')

    parser.add_argument('--wandb', type=int, default=0, help='log to weights and biases')
    parser.add_argument('--autograd_check', type=int, default=0, help='autograd correctness checks')
    parser.add_argument('--logdir', type=str, default='/tmp/runs/curv_train_tiny/run')

    parser.add_argument('--nonlin', type=int, default=1, help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--bias', type=int, default=1, help="whether to add bias between layers")

    parser.add_argument('--layer', type=int, default=-1, help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument('--hess_samples', type=int, default=1,
                        help='number of samples when sub-sampling outputs, 0 for exact hessian')
    parser.add_argument('--hess_kfac', type=int, default=0, help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho', type=int, default=0, help='use expensive method to compute rho')
    parser.add_argument('--skip_stats', type=int, default=0, help='skip all stats collection')

    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps', type=int, default=100, help="this many train steps between stat collection")
    parser.add_argument('--stats_steps', type=int, default=1000000, help="total number of curvature stats collections")

    parser.add_argument('--full_batch', type=int, default=0, help='do stats on the whole dataset')
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--dropout', type=int, default=0)
    parser.add_argument('--swa', type=int, default=0)
    parser.add_argument('--lmb', type=float, default=1e-3)

    parser.add_argument('--train_batch_size', type=int, default=64)
    parser.add_argument('--stats_batch_size', type=int, default=10000)
    parser.add_argument('--stats_num_batches', type=int, default=1)
    parser.add_argument('--run_name', type=str, default='noname')
    parser.add_argument('--launch_blocking', type=int, default=0)
    parser.add_argument('--sampled', type=int, default=0)
    parser.add_argument('--curv', type=str, default='kfac',
                        help='decomposition to use for curvature estimates: zero_order, kfac, isserlis or full')
    parser.add_argument('--log_spectra', type=int, default=0)

    u.seed_random(1)
    gl.args = parser.parse_args()
    args = gl.args
    u.seed_random(1)

    gl.project_name = 'train_ciresan'
    u.setup_logdir_and_event_writer(args.run_name)
    print(f"Logging to {gl.logdir}")

    d1 = 28 * 28
    d = [784, 2500, 2000, 1500, 1000, 500, 10]

    # number of samples per datapoint. Used to normalize kfac
    model = u.SimpleFullyConnected2(d, nonlin=args.nonlin, bias=args.bias, dropout=args.dropout)
    model = model.to(gl.device)
    autograd_lib.register(model)

    assert args.dataset_size >= args.stats_batch_size
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, original_targets=True,
                          dataset_size=args.dataset_size)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    assert not args.full_batch, "fixme: validation still uses stats_iter"
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=True,
                                                   drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)
    else:
        stats_iter = None

    test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False,
                               original_targets=True,
                               dataset_size=args.dataset_size)
    test_eval_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.stats_batch_size, shuffle=False,
                                                   drop_last=False)
    train_eval_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False,
                                                    drop_last=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    autograd_lib.add_hooks(model)
    autograd_lib.disable_hooks()

    gl.token_count = 0
    last_outer = 0

    for step in range(args.stats_steps):
        epoch = gl.token_count // 60000
        lr = optimizer.param_groups[0]['lr']
        print('token_count', gl.token_count)
        if last_outer:
            u.log_scalars({"time/outer": 1000 * (time.perf_counter() - last_outer)})
            print(f'time: {time.perf_counter() - last_outer:.2f}')
        last_outer = time.perf_counter()

        with u.timeit("validate"):
            val_accuracy, val_loss = validate(model, test_eval_loader, f'test (epoch {epoch})')
            train_accuracy, train_loss = validate(model, train_eval_loader, f'train (epoch {epoch})')

        # save log
        metrics = {'epoch': epoch, 'val_accuracy': val_accuracy, 'val_loss': val_loss,
                   'train_loss': train_loss, 'train_accuracy': train_accuracy,
                   'lr': optimizer.param_groups[0]['lr'],
                   'momentum': optimizer.param_groups[0].get('momentum', 0)}
        u.log_scalars(metrics)

        def mom_update(buffer, val):
            buffer *= 0.9
            buffer += val * 0.1

        if not args.skip_stats:
            # number of samples passed through
            n = args.stats_batch_size * args.stats_num_batches

            # per-layer quantities accumulated below
            forward_stats = defaultdict(lambda: AttrDefault(float))

            hessians = defaultdict(lambda: AttrDefault(float))
            jacobians = defaultdict(lambda: AttrDefault(float))
            fishers = defaultdict(lambda: AttrDefault(float))  # empirical fisher/gradient
            quad_fishers = defaultdict(lambda: AttrDefault(float))  # gradient statistics that depend on fisher (4th order moments)
            train_regrets = defaultdict(list)
            test_regrets1 = defaultdict(list)
            test_regrets2 = defaultdict(list)
            train_regrets_opt = defaultdict(list)
            test_regrets_opt = defaultdict(list)
            cosines = defaultdict(list)
            dot_products = defaultdict(list)
            hessians_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            jacobians_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            fishers_histograms = defaultdict(lambda: AttrDefault(u.MyList))
            quad_fishers_histograms = defaultdict(lambda: AttrDefault(u.MyList))

            current = None
            current_histograms = None

            for i in range(args.stats_num_batches):
                activations = {}
                backprops = {}

                def save_activations(layer, A, _):
                    activations[layer] = A
                    forward_stats[layer].AA += torch.einsum("ni,nj->ij", A, A)

                print('forward')
                with u.timeit("stats_forward"):
                    with autograd_lib.module_hook(save_activations):
                        data, targets = next(stats_iter)
                        output = model(data)
                        loss = loss_fn(output, targets) * len(output)

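                # compute_stats accumulates per-layer second moments (BB, diag, BA, ...) into whichever
                # statistics dict 'current' points at (hessians, jacobians, fishers or quad_fishers)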
                def compute_stats(layer, _, B):
                    A = activations[layer]
                    if current == fishers:
                        backprops[layer] = B

                    # about 27ms per layer
                    with u.timeit('compute_stats'):
                        current[layer].BB += torch.einsum("ni,nj->ij", B, B)  # TODO(y): index consistency
                        current[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)
                        current[layer].BA += torch.einsum("ni,nj->ij", B, A)
                        current[layer].a += torch.einsum("ni->i", A)
                        current[layer].b += torch.einsum("nk->k", B)
                        current[layer].norm2 += ((A * A).sum(dim=1) * (B * B).sum(dim=1)).sum()

                        # compute curvature in the direction of all gradients
                        if current is fishers:
                            assert args.stats_num_batches == 1, "not tested on more than one stats step, currently reusing aggregated moments"
                            hess = hessians[layer]
                            jac = jacobians[layer]
                            Bh, Ah = B @ hess.BB / n, A @ forward_stats[layer].AA / n
                            Bj, Aj = B @ jac.BB / n, A @ forward_stats[layer].AA / n
                            norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))

                            current[layer].min_norm2 = min(norms)
                            current[layer].median_norm2 = torch.median(norms)
                            current[layer].max_norm2 = max(norms)

                            norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                            norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))

                            current[layer].norm += norms.sum()
                            current_histograms[layer].norms.extend(torch.sqrt(norms))
                            current[layer].curv_hess += (skip_nans(norms2_hess / norms)).sum()
                            current_histograms[layer].curv_hess.extend(skip_nans(norms2_hess / norms))
                            current[layer].curv_hess_max += (skip_nans(norms2_hess / norms)).max()
                            current[layer].curv_hess_median += (skip_nans(norms2_hess / norms)).median()

                            current_histograms[layer].curv_jac.extend(skip_nans(norms2_jac / norms))
                            current[layer].curv_jac += (skip_nans(norms2_jac / norms)).sum()
                            current[layer].curv_jac_max += (skip_nans(norms2_jac / norms)).max()
                            current[layer].curv_jac_median += (skip_nans(norms2_jac / norms)).median()

                            current[layer].a_sparsity += torch.sum(A <= 0).float() / A.numel()
                            current[layer].b_sparsity += torch.sum(B <= 0).float() / B.numel()

                            current[layer].mean_activation += torch.mean(A)
                            current[layer].mean_activation2 += torch.mean(A*A)
                            current[layer].mean_backprop = torch.mean(B)
                            current[layer].mean_backprop2 = torch.mean(B*B)

                            current[layer].norms_hess += torch.sqrt(norms2_hess).sum()
                            current_histograms[layer].norms_hess.extend(torch.sqrt(norms2_hess))
                            current[layer].norms_jac += norms2_jac.sum()
                            current_histograms[layer].norms_jac.extend(torch.sqrt(norms2_jac))

                            normalized_moments = copy.copy(hessians[layer])
                            normalized_moments.AA = forward_stats[layer].AA
                            normalized_moments = u.divide_attributes(normalized_moments, n)

                            train_regrets_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=0, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets1_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=1, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets2_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=2, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=2,
                                                                           m=normalized_moments, approx=args.curv)
                            train_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=0,
                                                                            m=normalized_moments, approx=args.curv)
                            cosines_ = autograd_lib.offset_cosines(A, B)
                            train_regrets[layer].extend(train_regrets_)
                            test_regrets1[layer].extend(test_regrets1_)
                            test_regrets2[layer].extend(test_regrets2_)
                            train_regrets_opt[layer].extend(train_regrets_opt_)
                            test_regrets_opt[layer].extend(test_regrets_opt_)
                            cosines[layer].extend(cosines_)
                            dot_products[layer].extend(autograd_lib.offset_dotprod(A, B))

                        # statistics of the form g.Sigma.g
                        elif current == quad_fishers:
                            hess = hessians[layer]
                            sigma = fishers[layer]
                            jac = jacobians[layer]
                            Bs, As = B @ sigma.BB / n, A @ forward_stats[layer].AA / n
                            Bh, Ah = B @ hess.BB / n, A @ forward_stats[layer].AA / n
                            Bj, Aj = B @ jac.BB / n, A @ forward_stats[layer].AA / n

                            norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
                            norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                            norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
                            norms_sigma = ((As * A).sum(dim=1) * (Bs * B).sum(dim=1))

                            current[layer].norm += norms.sum()  # TODO(y) remove, redundant with norm2 above
                            current[layer].curv_sigma += (skip_nans(norms_sigma / norms)).sum()
                            current[layer].curv_sigma_max = skip_nans(norms_sigma / norms).max()
                            current[layer].curv_sigma_median = skip_nans(norms_sigma / norms).median()
                            current[layer].curv_hess += skip_nans(norms2_hess / norms).sum()
                            current[layer].curv_hess_max += skip_nans(norms2_hess / norms).max()
                            current[layer].lyap_hess_mean += skip_nans(norms_sigma / norms2_hess).mean()
                            current[layer].lyap_hess_max = max(skip_nans(norms_sigma/norms2_hess))
                            current[layer].lyap_jac_mean += skip_nans(norms_sigma / norms2_jac).mean()
                            current[layer].lyap_jac_max = max(skip_nans(norms_sigma/norms2_jac))

                print('backward')
                with u.timeit("backprop_H"):
                    with autograd_lib.module_hook(compute_stats):
                        current = hessians
                        current_histograms = hessians_histograms
                        autograd_lib.backward_hessian(output, loss='CrossEntropy', sampled=args.sampled,
                                                      retain_graph=True)  # 600 ms
                        current = jacobians
                        current_histograms = jacobians_histograms
                        autograd_lib.backward_jacobian(output, sampled=args.sampled, retain_graph=True)  # 600 ms
                        current = fishers
                        current_histograms = fishers_histograms
                        model.zero_grad()
                        loss.backward(retain_graph=True)  # 60 ms
                        current = quad_fishers
                        current_histograms = quad_fishers_histograms
                        model.zero_grad()
                        loss.backward()  # 60 ms

            print('summarize')
            for (i, layer) in enumerate(model.layers):
                stats_dict = {'hessian': hessians, 'jacobian': jacobians, 'fisher': fishers}

                # evaluate stats from
                # https://app.wandb.ai/yaroslavvb/train_ciresan/runs/425pu650?workspace=user-yaroslavvb
                for stats_name in stats_dict:
                    s = AttrDict()
                    stats = stats_dict[stats_name][layer]

                    for key in forward_stats[layer]:
                        # print(f'copying {key} in {stats_name}, {layer}')
                        try:
                            assert stats[key] == float()
                        except AssertionError:
                            print(f"Trying to overwrite {key} in {stats_name}, {layer}")
                        stats[key] = forward_stats[layer][key]

                    diag: torch.Tensor = stats.diag / n

                    # jacobian:
                    # curv in direction of gradient goes down to roughly 0.3-1
                    # maximum curvature goes up to 1000-2000
                    #
                    # Hessian:
                    # max curv goes down to 1, in direction of gradient 0.0001

                    s.diag_l2 = torch.max(diag)  # 40 - 3000 smaller than kfac l2 for jac
                    s.diag_fro = torch.norm(
                        diag)  # jacobian grows to 0.5-1.5, rest falls, layer-5 has phase transition, layer-4 also
                    s.diag_trace = diag.sum()  # jacobian grows 0-1000 (first), 0-150 (last). Almost same as kfac_trace (771 vs 810 kfac). Jacobian has up/down phase transition
                    s.diag_average = diag.mean()

                    # normalize for mean loss
                    BB = stats.BB / n
                    AA = stats.AA / n
                    # A_evals, _ = torch.symeig(AA)   # averaging 120ms per hit, 90 hits
                    # B_evals, _ = torch.symeig(BB)

                    # s.kfac_l2 = torch.max(A_evals) * torch.max(B_evals)    # 60x larger than diag_l2. layer0/hess has down/up phase transition. layer5/jacobian has up/down phase transition
                    s.kfac_trace = torch.trace(AA) * torch.trace(BB)  # 0/hess down/up tr, 5/jac sharp phase transition
                    s.kfac_fro = torch.norm(stats.AA) * torch.norm(
                        stats.BB)  # 0/hess has down/up tr, 5/jac up/down transition
                    # s.kfac_erank = s.kfac_trace / s.kfac_l2   # first layer has 25, rest 15, all layers go down except last, last noisy
                    # s.kfac_erank_fro = s.kfac_trace / s.kfac_fro / max(stats.BA.shape)

                    s.diversity = (stats.norm2 / n) / u.norm_squared(
                        stats.BA / n)  # gradient diversity. Goes up 3x. Bottom layer has most diversity. Jacobian diversity much less noisy than everything else

                    # discrepancy of KFAC based on exact values of diagonal approximation
                    # average difference normalized by average diagonal magnitude
                    diag_kfac = torch.einsum('ll,ii->li', BB, AA)
                    s.kfac_error = (torch.abs(diag_kfac - diag)).mean() / torch.mean(diag.abs())
                    u.log_scalars(u.nest_stats(f'layer-{i}/{stats_name}', s))

                # openai batch size stat
                s = AttrDict()
                hess = hessians[layer]
                jac = jacobians[layer]
                fish = fishers[layer]
                quad_fish = quad_fishers[layer]

                # the following check passes, but is expensive
                # if args.stats_num_batches == 1:
                #    u.check_close(fisher[layer].BA, layer.weight.grad)

                def trsum(A, B):
                    return (A * B).sum()  # computes tr(AB')

                grad = fishers[layer].BA / n
                s.grad_fro = torch.norm(grad)

                # get norms
                s.lyap_hess_max = quad_fish.lyap_hess_max
                s.lyap_hess_ave = quad_fish.lyap_hess_sum / n
                s.lyap_jac_max = quad_fish.lyap_jac_max
                s.lyap_jac_ave = quad_fish.lyap_jac_sum / n
                s.hess_trace = hess.diag.sum() / n
                s.jac_trace = jac.diag.sum() / n

                # Version 1 of Jain stochastic rates, use Hessian for curvature
                b = args.train_batch_size

                s.hess_curv = trsum((hess.BB / n) @ grad @ (hess.AA / n), grad) / trsum(grad, grad)
                s.jac_curv = trsum((jac.BB / n) @ grad @ (jac.AA / n), grad) / trsum(grad, grad)

                # compute gradient noise statistics
                # fish.BB has /n factor twice, hence don't need extra /n on fish.AA
                # after sampling, hess_noise,jac_noise became 100x smaller, but normalized is unaffected
                s.hess_noise = (trsum(hess.AA / n, fish.AA / n) * trsum(hess.BB / n, fish.BB / n))
                s.jac_noise = (trsum(jac.AA / n, fish.AA / n) * trsum(jac.BB / n, fish.BB / n))
                s.hess_noise_centered = s.hess_noise - trsum(hess.BB / n @ grad, grad @ hess.AA / n)
                s.jac_noise_centered = s.jac_noise - trsum(jac.BB / n @ grad, grad @ jac.AA / n)
                s.openai_gradient_noise = (fish.norms_hess / n) / trsum(hess.BB / n @ grad, grad @ hess.AA / n)

                s.mean_norm = torch.sqrt(fish.norm2) / n
                s.min_norm = torch.sqrt(fish.min_norm2)
                s.median_norm = torch.sqrt(fish.median_norm2)
                s.max_norm = torch.sqrt(fish.max_norm2)
                s.enorms = u.norm_squared(grad)
                s.a_sparsity = fish.a_sparsity
                s.b_sparsity = fish.b_sparsity
                s.mean_activation = fish.mean_activation
                s.msr_activation = torch.sqrt(fish.mean_activation2)
                s.mean_backprop = fish.mean_backprop
                s.msr_backprop = torch.sqrt(fish.mean_backprop2)

                s.norms_centered = fish.norm2 / n - u.norm_squared(grad)
                s.norms_hess = fish.norms_hess / n
                s.norms_jac = fish.norms_jac / n

                s.hess_curv_grad = fish.curv_hess / n  # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.hess_curv_grad_max = fish.curv_hess_max   # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.hess_curv_grad_median = fish.curv_hess_median   # phase transition, hits minimum loss in layer 1, then starts going up. Other layers take longer to reach minimum. Decreases with depth.
                s.sigma_curv_grad = quad_fish.curv_sigma / n
                s.sigma_curv_grad_max = quad_fish.curv_sigma_max
                s.sigma_curv_grad_median = quad_fish.curv_sigma_median
                s.band_bottou = 0.5 * lr * s.sigma_curv_grad / s.hess_curv_grad
                s.band_bottou_stoch = 0.5 * lr * quad_fish.curv_ratio / n
                s.band_yaida = 0.25 * lr * s.mean_norm**2
                s.band_yaida_centered = 0.25 * lr * s.norms_centered

                s.jac_curv_grad = fish.curv_jac / n  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.
                s.jac_curv_grad_max = fish.curv_jac_max  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.
                s.jac_curv_grad_median = fish.curv_jac_median  # this one has much lower variance than jac_curv. Reaches peak at 10k steps, also kfac error reaches peak there. Decreases with depth except for last layer.

                # OpenAI gradient noise statistics
                s.hess_noise_normalized = s.hess_noise_centered / (fish.norms_hess / n)
                s.jac_noise_normalized = s.jac_noise / (fish.norms_jac / n)

                train_regrets_, test_regrets1_, test_regrets2_, train_regrets_opt_, test_regrets_opt_, cosines_, dot_products_ = (torch.stack(r[layer]) for r in (train_regrets, test_regrets1, test_regrets2, train_regrets_opt, test_regrets_opt, cosines, dot_products))
                s.train_regret = train_regrets_.median()  # use median because outliers make it hard to see the trend
                s.test_regret1 = test_regrets1_.median()
                s.test_regret2 = test_regrets2_.median()
                s.test_regret_opt = test_regrets_opt_.median()
                s.train_regret_opt = train_regrets_opt_.median()
                s.mean_dot_product = torch.mean(dot_products_)
                s.median_dot_product = torch.median(dot_products_)

                s.median_cosine = cosines_.median()
                s.mean_cosine = cosines_.mean()

                # get learning rates
                L1 = s.hess_curv_grad / n
                L2 = s.jac_curv_grad / n
                diversity = (fish.norm2 / n) / u.norm_squared(grad)
                robust_diversity = (fish.norm2 / n) / fish.median_norm2
                dotprod_diversity = fish.median_norm2 / s.median_dot_product
                s.lr1 = 2 / (L1 * diversity)
                s.lr2 = 2 / (L2 * diversity)
                s.lr3 = 2 / (L2 * robust_diversity)
                s.lr4 = 2 / (L2 * dotprod_diversity)

                hess_A = u.symeig_pos_evals(hess.AA / n)
                hess_B = u.symeig_pos_evals(hess.BB / n)
                fish_A = u.symeig_pos_evals(fish.AA / n)
                fish_B = u.symeig_pos_evals(fish.BB / n)
                jac_A = u.symeig_pos_evals(jac.AA / n)
                jac_B = u.symeig_pos_evals(jac.BB / n)
                u.log_scalars({f'layer-{i}/hessA_erank': erank(hess_A)})
                u.log_scalars({f'layer-{i}/hessB_erank': erank(hess_B)})
                u.log_scalars({f'layer-{i}/fishA_erank': erank(fish_A)})
                u.log_scalars({f'layer-{i}/fishB_erank': erank(fish_B)})
                u.log_scalars({f'layer-{i}/jacA_erank': erank(jac_A)})
                u.log_scalars({f'layer-{i}/jacB_erank': erank(jac_B)})
                gl.event_writer.add_histogram(f'layer-{i}/hist_hess_eig', u.outer(hess_A, hess_B).flatten(), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_fish_eig', u.outer(fish_A, fish_B).flatten(), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_jac_eig', u.outer(jac_A, jac_B).flatten(), gl.get_global_step())

                s.hess_l2 = max(hess_A) * max(hess_B)
                s.jac_l2 = max(jac_A) * max(jac_B)
                s.fish_l2 = max(fish_A) * max(fish_B)
                s.hess_trace = hess.diag.sum() / n

                s.jain1_sto = 1/(s.hess_trace + 2 * s.hess_l2)
                s.jain1_det = 1/s.hess_l2

                s.jain1_lr = (1 / b) * (1/s.jain1_sto) + (b - 1) / b * (1/s.jain1_det)
                s.jain1_lr = 2 / s.jain1_lr

                s.regret_ratio = (
                            train_regrets_opt_ / test_regrets_opt_).median()  # ratio between train and test regret, large means overfitting
                u.log_scalars(u.nest_stats(f'layer-{i}', s))

                # compute stats that would let you bound rho
                if i == 0:  # only compute this once, for output layer
                    hhh = hessians[model.layers[-1]].BB / n
                    fff = fishers[model.layers[-1]].BB / n
                    d = fff.shape[0]
                    L = u.lyapunov_spectral(hhh, 2 * fff, cond=1e-8)
                    L_evals = u.symeig_pos_evals(L)
                    Lcheap = fff @ u.pinv(hhh, cond=1e-8)
                    Lcheap_evals = u.eig_real(Lcheap)

                    u.log_scalars({f'mismatch/rho': d/erank(L_evals)})
                    u.log_scalars({f'mismatch/rho_cheap': d/erank(Lcheap_evals)})
                    u.log_scalars({f'mismatch/diagonalizability': erank(L_evals)/erank(Lcheap_evals)})  # 1 means diagonalizable
                    u.log_spectrum(f'mismatch/sigma', u.symeig_pos_evals(fff), loglog=False)
                    u.log_spectrum(f'mismatch/hess', u.symeig_pos_evals(hhh), loglog=False)
                    u.log_spectrum(f'mismatch/lyapunov', L_evals, loglog=True)
                    u.log_spectrum(f'mismatch/lyapunov_cheap', Lcheap_evals, loglog=True)

                gl.event_writer.add_histogram(f'layer-{i}/hist_grad_norms', u.to_numpy(fishers_histograms[layer].norms.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_grad_norms_hess', u.to_numpy(fishers_histograms[layer].norms_hess.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_curv_jac', u.to_numpy(fishers_histograms[layer].curv_jac.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_curv_hess', u.to_numpy(fishers_histograms[layer].curv_hess.value()), gl.get_global_step())
                gl.event_writer.add_histogram(f'layer-{i}/hist_cosines', u.to_numpy(cosines[layer]), gl.get_global_step())

                if args.log_spectra:
                    with u.timeit('spectrum'):
                        # 2/alpha
                        # s.jain1_lr = (1 / b) * s.jain1_sto + (b - 1) / b * s.jain1_det
                        # s.jain1_lr = 1 / s.jain1_lr

                        # hess.diag_trace, jac.diag_trace

                        # Version 2 of Jain stochastic rates, use Jacobian squared for curvature
                        s.jain2_sto = s.lyap_jac_max * s.jac_trace / s.lyap_jac_ave
                        s.jain2_det = s.jac_l2
                        s.jain2_lr = (1 / b) * s.jain2_sto + (b - 1) / b * s.jain2_det
                        s.jain2_lr = 1 / s.jain2_lr

                        u.log_spectrum(f'layer-{i}/hess_A', hess_A)
                        u.log_spectrum(f'layer-{i}/hess_B', hess_B)
                        u.log_spectrum(f'layer-{i}/hess_AB', u.outer(hess_A, hess_B).flatten())
                        u.log_spectrum(f'layer-{i}/jac_A', jac_A)
                        u.log_spectrum(f'layer-{i}/jac_B', jac_B)
                        u.log_spectrum(f'layer-{i}/fish_A', fish_A)
                        u.log_spectrum(f'layer-{i}/fish_B', fish_B)

                        u.log_scalars({f'layer-{i}/trace_ratio': fish_B.sum()/hess_B.sum()})

                        L = torch.eig(u.lyapunov_spectral(hess.BB, 2*fish.BB, cond=1e-8))[0]
                        L = L[:, 0]  # extract real part
                        L = L.sort()[0]
                        L = torch.flip(L, [0])

                        L_cheap = torch.eig(fish.BB @ u.pinv(hess.BB, cond=1e-8))[0]
                        L_cheap = L_cheap[:, 0]  # extract real part
                        L_cheap = L_cheap.sort()[0]
                        L_cheap = torch.flip(L_cheap, [0])

                        d = len(hess_B)
                        u.log_spectrum(f'layer-{i}/Lyap', L)
                        u.log_spectrum(f'layer-{i}/Lyap_cheap', L_cheap)

                        u.log_scalars({f'layer-{i}/dims': d})
                        u.log_scalars({f'layer-{i}/L_erank': erank(L)})
                        u.log_scalars({f'layer-{i}/L_cheap_erank': erank(L_cheap)})

                        u.log_scalars({f'layer-{i}/rho': d/erank(L)})
                        u.log_scalars({f'layer-{i}/rho_cheap': d/erank(L_cheap)})

        model.train()
        with u.timeit('train'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                optimizer.step()
                if args.weight_decay:
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.data.mul_(1 - args.weight_decay)

                gl.token_count += data.shape[0]

    gl.event_writer.close()
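
The norm2 and norms accumulators in compute_stats above use the fact that, for a linear layer, the per-example gradient is the outer product of backprop row B[i] and activation row A[i], so its squared Frobenius norm factors as ||A[i]||^2 * ||B[i]||^2. A minimal standalone sketch of that identity (illustrative tensors, not part of the original script):

import torch

n, d_in, d_out = 8, 5, 3
A = torch.randn(n, d_in)   # activations, one row per example
B = torch.randn(n, d_out)  # backprops, one row per example

# fast path used above: ||A[i]||^2 * ||B[i]||^2 per example
norms2_fast = (A * A).sum(dim=1) * (B * B).sum(dim=1)

# slow path: materialize each per-example gradient outer(B[i], A[i]) and take its squared norm
norms2_slow = torch.stack([(torch.einsum('i,j->ij', B[i], A[i]) ** 2).sum()
                           for i in range(n)])

assert torch.allclose(norms2_fast, norms2_slow, atol=1e-5)
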
Example #22
0
                def compute_stats(layer, _, B):
                    A = activations[layer]
                    if current == fishers:
                        backprops[layer] = B

                    # about 27ms per layer
                    with u.timeit('compute_stats'):
                        current[layer].BB += torch.einsum("ni,nj->ij", B, B)  # TODO(y): index consistency
                        current[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)
                        current[layer].BA += torch.einsum("ni,nj->ij", B, A)
                        current[layer].a += torch.einsum("ni->i", A)
                        current[layer].b += torch.einsum("nk->k", B)
                        current[layer].norm2 += ((A * A).sum(dim=1) * (B * B).sum(dim=1)).sum()

                        # compute curvatures in direction of all gradients
                        if current is fishers:
                            assert args.stats_num_batches == 1, "not tested on more than one stats step, currently reusing aggregated moments"
                            hess = hessians[layer]
                            jac = jacobians[layer]
                            Bh, Ah = B @ hess.BB / n, A @ forward_stats[layer].AA / n
                            Bj, Aj = B @ jac.BB / n, A @ forward_stats[layer].AA / n
                            norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))

                            current[layer].min_norm2 = min(norms)
                            current[layer].median_norm2 = torch.median(norms)
                            current[layer].max_norm2 = max(norms)

                            norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                            norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))

                            current[layer].norm += norms.sum()
                            current_histograms[layer].norms.extend(torch.sqrt(norms))
                            current[layer].curv_hess += (skip_nans(norms2_hess / norms)).sum()
                            current_histograms[layer].curv_hess.extend(skip_nans(norms2_hess / norms))
                            current[layer].curv_hess_max += (skip_nans(norms2_hess / norms)).max()
                            current[layer].curv_hess_median += (skip_nans(norms2_hess / norms)).median()

                            current_histograms[layer].curv_jac.extend(skip_nans(norms2_jac / norms))
                            current[layer].curv_jac += (skip_nans(norms2_jac / norms)).sum()
                            current[layer].curv_jac_max += (skip_nans(norms2_jac / norms)).max()
                            current[layer].curv_jac_median += (skip_nans(norms2_jac / norms)).median()

                            current[layer].a_sparsity += torch.sum(A <= 0).float() / A.numel()
                            current[layer].b_sparsity += torch.sum(B <= 0).float() / B.numel()

                            current[layer].mean_activation += torch.mean(A)
                            current[layer].mean_activation2 += torch.mean(A*A)
                            current[layer].mean_backprop = torch.mean(B)
                            current[layer].mean_backprop2 = torch.mean(B*B)

                            current[layer].norms_hess += torch.sqrt(norms2_hess).sum()
                            current_histograms[layer].norms_hess.extend(torch.sqrt(norms2_hess))
                            current[layer].norms_jac += norms2_jac.sum()
                            current_histograms[layer].norms_jac.extend(torch.sqrt(norms2_jac))

                            normalized_moments = copy.copy(hessians[layer])
                            normalized_moments.AA = forward_stats[layer].AA
                            normalized_moments = u.divide_attributes(normalized_moments, n)

                            train_regrets_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=0, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets1_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=1, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets2_ = autograd_lib.offset_losses(A, B, alpha=lr, offset=2, m=normalized_moments,
                                                                        approx=args.curv)
                            test_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=2,
                                                                           m=normalized_moments, approx=args.curv)
                            train_regrets_opt_ = autograd_lib.offset_losses(A, B, alpha=None, offset=0,
                                                                            m=normalized_moments, approx=args.curv)
                            cosines_ = autograd_lib.offset_cosines(A, B)
                            train_regrets[layer].extend(train_regrets_)
                            test_regrets1[layer].extend(test_regrets1_)
                            test_regrets2[layer].extend(test_regrets2_)
                            train_regrets_opt[layer].extend(train_regrets_opt_)
                            test_regrets_opt[layer].extend(test_regrets_opt_)
                            cosines[layer].extend(cosines_)
                            dot_products[layer].extend(autograd_lib.offset_dotprod(A, B))

                        # statistics of the form g.Sigma.g
                        elif current == quad_fishers:
                            hess = hessians[layer]
                            sigma = fishers[layer]
                            jac = jacobians[layer]
                            Bs, As = B @ sigma.BB / n, A @ forward_stats[layer].AA / n
                            Bh, Ah = B @ hess.BB / n, A @ forward_stats[layer].AA / n
                            Bj, Aj = B @ jac.BB / n, A @ forward_stats[layer].AA / n

                            norms = ((A * A).sum(dim=1) * (B * B).sum(dim=1))
                            norms2_hess = ((Ah * A).sum(dim=1) * (Bh * B).sum(dim=1))
                            norms2_jac = ((Aj * A).sum(dim=1) * (Bj * B).sum(dim=1))
                            norms_sigma = ((As * A).sum(dim=1) * (Bs * B).sum(dim=1))

                            current[layer].norm += norms.sum()  # TODO(y) remove, redundant with norm2 above
                            current[layer].curv_sigma += (skip_nans(norms_sigma / norms)).sum()
                            current[layer].curv_sigma_max = skip_nans(norms_sigma / norms).max()
                            current[layer].curv_sigma_median = skip_nans(norms_sigma / norms).median()
                            current[layer].curv_hess += skip_nans(norms2_hess / norms).sum()
                            current[layer].curv_hess_max += skip_nans(norms2_hess / norms).max()
                            current[layer].lyap_hess_mean += skip_nans(norms_sigma / norms2_hess).mean()
                            current[layer].lyap_hess_max = max(skip_nans(norms_sigma/norms2_hess))
                            current[layer].lyap_jac_mean += skip_nans(norms_sigma / norms2_jac).mean()
                            current[layer].lyap_jac_max = max(skip_nans(norms_sigma/norms2_jac))
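
skip_nans is applied above to ratios such as norms2_hess / norms before reducing them, but its definition is not part of the snippet. A plausible one-liner, assuming it simply drops NaN entries so the subsequent .sum()/.max()/.median() ignore degenerate ratios:

import torch

def skip_nans(x: torch.Tensor) -> torch.Tensor:
    """Assumed behavior: return only the non-NaN entries of x."""
    return x[~torch.isnan(x)]

ratios = torch.tensor([0.5, float('nan'), 2.0])
print(skip_nans(ratios))  # tensor([0.5000, 2.0000])
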
Example #23
0
def main():
    attempt_count = 0
    while os.path.exists(f"{args.logdir}{attempt_count:02d}"):
        attempt_count += 1
    logdir = f"{args.logdir}{attempt_count:02d}"

    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {run_name}")
    u.seed_random(1)

    d1 = args.data_width**2
    d2 = 10
    d3 = args.targets_width**2
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    model = u.SimpleFullyConnected(d, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['d1'] = d1
            wandb.config['d2'] = d2
            wandb.config['d3'] = d3
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)

    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=args.dataset_size)

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.stats_batch_size,
        shuffle=False,
        drop_last=True)
    stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               dataset_size=args.dataset_size,
                               train=False)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.stats_batch_size,
                                              shuffle=True,
                                              drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    skip_forward_hooks = False
    skip_backward_hooks = False

    def capture_activations(module: nn.Module, input: List[torch.Tensor],
                            output: torch.Tensor):
        if skip_forward_hooks:
            return
        assert not hasattr(
            module, 'activations'
        ), "Seeing results of previous autograd, call util.zero_grad to clear"
        assert len(input) == 1, "this was tested for single input layers only"
        setattr(module, "activations", input[0].detach())
        setattr(module, "output", output.detach())

    def capture_backprops(module: nn.Module, _input, output):
        if skip_backward_hooks:
            return
        assert len(output) == 1, "this works for single variable layers only"
        if gl.backward_idx == 0:
            assert not hasattr(
                module, 'backprops'
            ), "Seeing results of previous autograd, call util.zero_grad to clear"
            setattr(module, 'backprops', [])
        assert gl.backward_idx == len(module.backprops)
        module.backprops.append(output[0])

    def save_grad(param: nn.Parameter) -> Callable[[torch.Tensor], None]:
        """Hook to save gradient into 'param.saved_grad', so it can be accessed after model.zero_grad(). Only stores gradient
        if the value has not been set, call util.zero_grad to clear it."""
        def save_grad_fn(grad):
            if not hasattr(param, 'saved_grad'):
                setattr(param, 'saved_grad', grad)

        return save_grad_fn

    for layer in model.layers:
        layer.register_forward_hook(capture_activations)
        layer.register_backward_hook(capture_backprops)
        layer.weight.register_hook(save_grad(layer.weight))

    def loss_fn(data, targets):
        err = data - targets.view(-1, data.shape[1])
        assert len(data) == len(targets)
        return torch.sum(err * err) / 2 / len(data)

    gl.token_count = 0
    last_outer = 0
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()
        # compute validation loss
        skip_forward_hooks = True
        skip_backward_hooks = True
        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        data, targets = next(stats_iter)
        skip_forward_hooks = False
        skip_backward_hooks = False

        # get gradient values
        with u.timeit("backprop_g"):
            gl.backward_idx = 0
            u.zero_grad(model)
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)

        # get Hessian values
        skip_forward_hooks = True
        id_mat = torch.eye(o).to(gl.device)

        u.log_scalar(loss=loss.item())

        with u.timeit("backprop_H"):
            # optionally use randomized low-rank approximation of Hessian
            hess_rank = args.hess_samples if args.hess_samples else o

            for out_idx in range(hess_rank):
                model.zero_grad()
                # backprop to get section of batch output jacobian for output at position out_idx
                output = model(
                    data
                )  # opt: using autograd.grad means I don't have to zero_grad
                if args.hess_samples:
                    bval = torch.LongTensor(n, o).to(gl.device).random_(
                        0, 2) * 2 - 1
                    bval = bval.float()
                else:
                    ei = id_mat[out_idx]
                    bval = torch.stack([ei] * n)
                gl.backward_idx = out_idx + 1
                output.backward(bval)
            skip_backward_hooks = True  #

        for (i, layer) in enumerate(model.layers):
            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]),
                              layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            #############################
            # Hessian stats
            #############################
            A_t = layer.activations
            Bh_t = [
                layer.backprops[out_idx + 1] for out_idx in range(hess_rank)
            ]
            Amat_t = torch.cat([A_t] * hess_rank, dim=0)
            Bmat_t = torch.cat(Bh_t, dim=0)

            assert Amat_t.shape == (n * hess_rank, d[i])
            assert Bmat_t.shape == (n * hess_rank, d[i + 1])

            lambda_regularizer = args.lmb * torch.eye(d[i] * d[i + 1]).to(
                gl.device)
            with u.timeit(f"khatri_H-{i}"):
                Jb = u.khatri_rao_t(
                    Bmat_t, Amat_t)  # batch Jacobian, in row-vec format

            with u.timeit(f"H-{i}"):
                H = Jb.t() @ Jb / n

            with u.timeit(f"invH-{i}"):
                invH = torch.cholesky_inverse(H + lambda_regularizer)

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.jacobian_fro = Jb.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1],
                                                d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) -
                                          0.5 * eps**2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() /
                                          (dd.flatten().norm()**2))

            with u.timeit("pinvH"):
                pinvH = u.pinv(H)

            with u.timeit(f'curv-{i}'):
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre',
                        pinvH)  # save Newton preconditioner
                s.step_openai = 1 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / u.sym_l2_norm(H)
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm(
                )  # frobenius norm of Newton update
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_svd(H, sigma)
                if u.has_nan(
                        p_sigma) and args.compute_rho:  # use expensive method
                    H0 = H.cpu().detach().numpy()
                    sigma0 = sigma.cpu().detach().numpy()
                    p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0)
                    p_sigma = torch.tensor(p_sigma).to(gl.device)

                if u.has_nan(p_sigma):
                    s.psigma_erank = H.shape[0]
                    s.rho = 1
                else:
                    s.psigma_erank = u.sym_erank(p_sigma)
                    s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                print('openai batch', s.batch_openai)
                s.diversity = torch.norm(G, "fro")**2 / torch.norm(g)**2

                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        last_inner = 0
        for i in range(args.train_steps):
            if last_inner:
                u.log_scalars(
                    {"time/inner": 1000 * (time.perf_counter() - last_inner)})
            last_inner = time.perf_counter()

            optimizer.zero_grad()
            data, targets = next(train_iter)
            model.zero_grad()
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward()

            u.log_scalar(train_loss=loss.item())

            if args.method != 'newton':
                optimizer.step()
            else:
                for (layer_idx, layer) in enumerate(model.layers):
                    param: torch.nn.Parameter = layer.weight
                    param_data: torch.Tensor = param.data
                    param_data.copy_(param_data - 0.1 * param.grad)
                    if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                        continue
                    u.nan_check(layer.weight.pre)
                    u.nan_check(param.grad.flatten())
                    u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre)
                    param_new_flat = u.v2r(param_data.flatten()) - u.v2r(
                        param.grad.flatten()) @ layer.weight.pre
                    u.nan_check(param_new_flat)
                    param_data.copy_(param_new_flat.reshape(param_data.shape))

            gl.token_count += data.shape[0]

    gl.event_writer.close()
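
The gradient and Hessian blocks above build per-example gradients with u.khatri_rao_t(B_t, A_t), whose implementation is not shown. Assuming it is the transposed Khatri-Rao product (row-wise Kronecker product), a short sketch together with the same consistency check performed under args.autograd_check:

import torch

def khatri_rao_t(B: torch.Tensor, A: torch.Tensor) -> torch.Tensor:
    # row i is kron(B[i], A[i]); output shape (n, d_out * d_in)
    return torch.einsum('nk,ni->nki', B, A).reshape(B.shape[0], -1)

n, d_in, d_out = 4, 3, 2
A_t = torch.randn(n, d_in)   # activations
B_t = torch.randn(n, d_out)  # per-example backprops (already scaled by n above)
G = khatri_rao_t(B_t, A_t)   # batch loss Jacobian, one per-example gradient per row
g = G.sum(dim=0) / n         # average gradient

# average of per-example gradients equals B_t.t() @ A_t / n, reshaped to the weight shape
assert torch.allclose(g.reshape(d_out, d_in), B_t.t() @ A_t / n, atol=1e-5)
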
Example #24
0
def main(opts, cluster_spec, cfg):
    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='starting @ %s' % util.get_hostname(expensive=True))
    cluster = tf.train.ClusterSpec(cluster_spec)
    server = tf.train.Server(cluster,
                             job_name=opts.job_name,
                             task_index=opts.task_index)

    if opts.job_name == "ps":
        util.log(author='%s:%d' % (opts.job_name, opts.task_index),
                 msg='joining server')
        server.join()
        raise Exception("Expecting server.join() to block forever")

    assert (opts.job_name == "worker")
    is_chief = (opts.task_index == 0)

    outqueue = Queue()
    train_post_thread = TrainPostThread(cfg, outqueue)
    train_post_thread.start()

    with tf.device("/job:ps/task:0"):
        params = NeuroSATParams(cfg=cfg)

    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % opts.task_index,
                cluster=cluster)):
        filenames = [
            os.path.join(util.gd_tfr_dir(gd_id=cfg['gd_id']), x)
            for x in os.listdir(util.gd_tfr_dir(gd_id=cfg['gd_id']))
        ]
        dataset = tf.data.TFRecordDataset(
            filenames=filenames,
            compression_type="GZIP",
            num_parallel_reads=cfg['n_parallel_reads'])
        # TODO(dselsam): don't hardcode the number of shards
        # idea: extend cluster_spec to map (job, task) -> (n_shards, shard_idx)
        dataset = dataset.shard(num_shards=4, index=opts.task_index % 4)
        dataset = dataset.map(example_to_tftd,
                              num_parallel_calls=cfg['n_parallel_calls'])
        dataset = dataset.filter(
            lambda tftd: 2 * tftd.n_vars + tftd.n_clauses < cfg['max_n_nodes'])
        dataset = dataset.repeat()
        dataset = dataset.prefetch(cfg['n_prefetch'])

        tftd = dataset.make_one_shot_iterator().get_next()

        args = NeuroSATArgs(n_vars=tftd.n_vars,
                            n_clauses=tftd.n_clauses,
                            CL_idxs=tftd.CL_idxs)
        guesses = apply_neurosat(cfg=cfg, params=params, args=args)

        pi_v_targets = tf.cast(tftd.core_var_mask, tf.float32)
        pi_v_targets = pi_v_targets / tf.reduce_sum(pi_v_targets)

        pi_c_targets = tf.cast(tftd.core_clause_mask, tf.float32)
        pi_c_targets = pi_c_targets / tf.reduce_sum(pi_c_targets)

        cv_loss = cfg['cv_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_var_logits, labels=pi_v_targets)
        cc_loss = cfg['cc_loss_scale'] * tfutil.kldiv(
            logits=guesses.pi_core_clause_logits, labels=pi_c_targets)
        l2_loss = cfg['l2_loss_scale'] * tfutil.build_l2_loss()
        loss = cv_loss + cc_loss + l2_loss

        stats = Stats(dp_id=tftd.dp_id,
                      cv_loss=cv_loss,
                      cc_loss=cc_loss,
                      l2_loss=l2_loss)

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tfutil.build_learning_rate(cfg, global_step)

        apply_grads = tf.cond(
            tftd.is_train, lambda: tfutil.build_apply_gradients(
                cfg, loss, learning_rate, global_step), lambda: True)

    util.log(author='%s:%d' % (opts.job_name, opts.task_index),
             msg='creating session (train_id=%d)...' % cfg['train_id'])
    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            checkpoint_dir=util.checkpoint_dir(
                train_id=cfg['train_id'])) as mon_sess:
        util.log(author='%s:%d' % (opts.job_name, opts.task_index),
                 msg='starting session loop')
        step = 0
        while True:
            try:
                (_, stats_v), n_secs = util.timeit(mon_sess.run,
                                                   [apply_grads, stats])
                outqueue.put(
                    tuple(map(util.de_numpify, stats_v)) +
                    (cfg['train_id'], n_secs))
                step += 1
            except tf.errors.ResourceExhaustedError as e:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="RESOURCE_EXHAUSTED\n%s\n%s" % (str(e), tb))
                util.db_insert(table='tune_ooms', train_id=cfg['train_id'])
            except tf.errors.OpError as e:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="OP_ERROR\n%s\n%s" % (str(e), tb))
            except Exception as e:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="EXCEPTION\n%s\n%s" % (str(e), tb))
            except:
                tb = traceback.format_exc()
                util.log(kind='error',
                         author='train',
                         msg="UNKNOWN\n%s" % (tb))
Example #25
0
def main():
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    device = torch.device("cuda" if use_cuda else "cpu")
    print("using device ", device)
    torch.manual_seed(args.seed)

    u.set_runs_directory('runs3')
    logger = u.TensorboardLogger(args.run)
    batch_size = 64
    shuffle = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '/tmp/data',
        train=True,
        download=True,
        transform=transforms.Compose([transforms.ToTensor()])),
                                               batch_size=batch_size,
                                               shuffle=shuffle,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '/tmp/data',
        train=False,
        transform=transforms.Compose([transforms.ToTensor()])),
                                              batch_size=1000,
                                              shuffle=shuffle,
                                              **kwargs)
    """input image size for the original LeNet5 is 32x32, here is 28x28"""

    #  W1 = 0.1 * torch.randn(1 * 5 * 5 + 1, 6)

    net = LeNet5().to(device)

    def train_loss(data, target):
        y = net(data)
        y = F.log_softmax(y, dim=1)
        loss = F.nll_loss(y, target)
        for w in net.W:
            loss += 0.0002 * torch.sum(w * w)

        return loss

    def test_loss():
        num_errs = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                y = net(data)
                _, pred = torch.max(y, dim=1)
                num_errs += torch.sum(pred != target)

        return num_errs.item() / len(test_loader.dataset)

    Qs = [[torch.eye(w.shape[0]), torch.eye(w.shape[1])] for w in net.W]
    for i in range(len(Qs)):
        for j in range(len(Qs[i])):
            Qs[i][j] = Qs[i][j].to(device)

    step_size = 0.1  # tried 0.15, diverges
    grad_norm_clip_thr = 1e10
    TrainLoss, TestLoss = [], []
    example_count = 0
    step_time_ms = 0

    for epoch in range(10):
        for batch_idx, (data, target) in enumerate(train_loader):
            step_start = time.perf_counter()
            data, target = data.to(device), target.to(device)

            loss = train_loss(data, target)

            with u.timeit('grad'):
                grads = autograd.grad(loss, net.W, create_graph=True)
            TrainLoss.append(loss.item())
            logger.set_step(example_count)
            logger('loss/train', TrainLoss[-1])
            if batch_idx % 10 == 0:
                print(
                    f'Epoch: {epoch}; batch: {batch_idx}; train loss: {TrainLoss[-1]:.2f}, step time: {step_time_ms:.0f}'
                )

            with u.timeit('Hv'):
                #        noise.normal_()
                # torch.manual_seed(args.seed)
                v = [torch.randn(w.shape).to(device) for w in net.W]
                # v = grads
                Hv = autograd.grad(grads, net.W, v)

            if args.verbose:
                print("v", v[0].mean())
                print("data", data.mean())
                print("Hv", Hv[0].mean())

            n = len(net.W)
            with torch.no_grad():
                with u.timeit('P_update'):
                    for i in range(num_updates):
                        psteps = []
                        for j in range(n):
                            q = Qs[j]
                            dw = v[j]
                            dg = Hv[j]
                            Qs[j][0], Qs[j][
                                1], pstep = psgd.update_precond_kron_with_step(
                                    q[0], q[1], dw, dg)
                            psteps.append(pstep)

                            #          print(np.array(psteps).mean())
                    logger('p_residual', np.array(psteps).mean())

                with u.timeit('g_update'):
                    pre_grads = [
                        psgd.precond_grad_kron(q[0], q[1], g)
                        for (q, g) in zip(Qs, grads)
                    ]
                    grad_norm = torch.sqrt(
                        sum([torch.sum(g * g) for g in pre_grads]))

                with u.timeit('gradstep'):
                    step_adjust = min(
                        grad_norm_clip_thr / (grad_norm + 1.2e-38), 1.0)
                    for i in range(len(net.W)):
                        net.W[i] -= step_adjust * step_size * pre_grads[i]

                total_step = step_adjust * step_size
                logger('step/adjust', step_adjust)
                logger('step/size', step_size)
                logger('step/total', total_step)
                logger('grad_norm', grad_norm)

                if args.verbose:
                    print(data.mean())
                    import pdb
                    pdb.set_trace()
                if args.early_stop:
                    sys.exit()

            example_count += batch_size
            step_time_ms = 1000 * (time.perf_counter() - step_start)
            logger('time/step', step_time_ms)

            if args.test and batch_idx >= 100:
                break
        if args.test and batch_idx >= 100:
            break

        test_loss0 = test_loss()
        TestLoss.append(test_loss0)
        logger('loss/test', test_loss0)
        step_size = (0.1**0.1) * step_size
        print('Epoch: {}; best test loss: {}'.format(epoch, min(TestLoss)))

    if args.test:
        step_times = logger.d['time/step']
        assert step_times[-1] < 30, step_times  # should be around 20ms
        losses = logger.d['loss/train']
        assert losses[0] > 2  # around 2.3887393474578857
        assert losses[-1] < 0.5, losses
        print("Test passed")
    return pi

#========== do a benchmark run ==============
def run():
    for cpus in range (1, multiprocessing.cpu_count()+1):
        timeit(partial(calculate_pi, cpus, N), "multiprocessing with " + str(cpus) + " CPUs")

#========== Main ==========
if __name__=='__main__':
    #Define n and number of CPUs
    pi = 0
    cpus = multiprocessing.cpu_count()
    print("CPU count: " + str(cpus))
    print("n = " +  str(N))
    print("--------------------------")
    print("--------------------------")

    #Estimate Pi
    for i in range(1, cpus+1):
        print("Using " + str(i) + " CPU(s)")
        start = timer()
        #calc = calculate_pi(i+1, N)
        pi += timeit(partial(calculate_pi, i, N), "multiprocessing")
        end = timer()
        print("Time: " + str(end - start) + "s")
        print("--------------------------")

    print("--------------------------")
    print("Pi: " +  str(pi/cpus))
    print("Error: " +  str(abs(pi/cpus - math.pi)))
Example #27
0
    for _ in tqdm(range(num_games // 2)):
        game = Reversi()
        mcts = MCTS(game)
        # mcts.run(20)
        for step in range(70):
            if step % 2 == 0:
                move = alphabeta_move(game, depth)
            else:
                mcts.run(iterations)
                move = mcts.get_move()
            # mcts.make_move(move)
            game.move(move)
            mcts = MCTS(game)
            if game.terminal():
                winner = game.winner()
                if winner == 0:
                    mcts_wins_as_second += 1
                elif winner == 0.5:
                    draws += 1
                break
    print('winrate: {}+{}/{}, draws: {}'.format(mcts_wins_as_first,
                                                mcts_wins_as_second, num_games,
                                                draws))


if __name__ == '__main__':
    timeit('START')
    play_against_random(40, 200)
    # play_against_alphabeta(20, 400, 3)
    timeit('SHOW')
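
The timeit used here takes only a label, timeit('START') ... timeit('SHOW'); its definition is not shown. One plausible reading is a stopwatch that prints the time elapsed since the previous labeled call, sketched below under that assumption:

import time

_last_call = None

def timeit(label: str) -> None:
    """Print time elapsed since the previous timeit() call (assumed behavior)."""
    global _last_call
    now = time.perf_counter()
    if _last_call is not None:
        print(f"{label}: {now - _last_call:.3f}s since previous call")
    _last_call = now
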
Example #29
0
 def stats_runner_run():
   while True:
     with u.timeit("kfac_update"):
       self.model.advance_batch()
       self.update_stats()
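
Most u.timeit uses in these examples are context managers, as in the kfac_update block above. The helper itself is not included; a minimal sketch, assuming it simply measures and reports wall-clock time for the tagged block (the real one presumably logs to TensorBoard or wandb rather than printing):

import time
from contextlib import contextmanager

@contextmanager
def timeit(tag: str):
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed_ms = 1000 * (time.perf_counter() - start)
        print(f"{tag}: {elapsed_ms:.1f} ms")

# usage:
with timeit("kfac_update"):
    sum(range(100000))
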
Example #30
0
def main():

    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {run_name}")

    d1 = args.data_width ** 2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width ** 2
    d2 = 2
    d3 = args.targets_width ** 2

    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size)+1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    #optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, dataset_size=dsize, original_targets=True)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False, dataset_size=dsize, original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()   # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i]>50 or d[i+1]>50:
                print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape), layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]), layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma)/s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1]*d[i]).to(gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                # cholesky_inverse expects a Cholesky factor, so factor the damped Hessian first
                # (torch.linalg.cholesky on newer PyTorch)
                invH = torch.cholesky_inverse(torch.cholesky(H + lambda_regularizer))

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1], d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = u.pinv(H)

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)   # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_svd(H, sigma)
                if u.has_nan(p_sigma) and args.compute_rho:  # use expensive method
                    print('using expensive method')
                    # import pdb; pdb.set_trace()  # debugger leftover; would stall the run
                    H0, sigma0 = u.to_numpys(H, sigma)
                    p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0)
                    p_sigma = torch.tensor(p_sigma).to(gl.device)

                if u.has_nan(p_sigma):
                    # import pdb; pdb.set_trace()
                    s.psigma_erank = H.shape[0]
                    s.rho = 1
                else:
                    s.psigma_erank = u.sym_erank(p_sigma)
                    s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                #            u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1-args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()
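
The stats loop above builds the per-example gradient matrix G with u.khatri_rao_t(B_t, A_t): judging from the shape asserts (G.shape == (n, d[i] * d[i + 1])) and the check that g.reshape(d[i + 1], d[i]) matches the saved gradient, row k of G is the flattened outer product of backprop row k with activation row k. The utility itself is not shown; a plausible einsum-based sketch of that contract is:

import torch


def khatri_rao_t(B, A):
    # Assumed semantics (sketch, not the project's actual utility):
    # B: (n, d_out), A: (n, d_in)  ->  (n, d_out * d_in), where reshaping
    # row k to (d_out, d_in) recovers the outer product of B[k] and A[k].
    n = B.shape[0]
    return torch.einsum('ni,nj->nij', B, A).reshape(n, -1)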
示例#31
0
 def write_lock(self):
   with u.timeit("write_lock"):
     with self.lock:
       yield  
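
The write_lock method above is a generator, so in its class it is presumably decorated with contextlib.contextmanager. A self-contained sketch of the pattern (timing dropped, class name assumed) is:

import threading
from contextlib import contextmanager


class StatsRunner:
    # Minimal sketch of the lock-as-context-manager pattern; not the original class.
    def __init__(self):
        self.lock = threading.Lock()

    @contextmanager
    def write_lock(self):
        # Callers serialize updates with: `with runner.write_lock(): ...`
        with self.lock:
            yield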
示例#32
0
def main():
  np.random.seed(args.seed)
  tf.set_random_seed(args.seed)

  logger = u.TensorboardLogger(args.run)
  
  with u.timeit("init/session"):
    gpu_options = tf.GPUOptions(allow_growth=False)
    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    u.register_default_session(sess)   # since default session is Thread-local

  with u.timeit("init/model_init"):
    model = model_creator(args.batch_size, name="main")
    model.initialize_global_vars(verbose=True)
    model.initialize_local_vars()
  
  with u.timeit("init/kfac_init"):
    kfac = Kfac(model_creator, args.kfac_batch_size) 
    kfac.model.initialize_global_vars(verbose=False)
    kfac.model.initialize_local_vars()
    kfac.Lambda.set(args.Lambda)
    kfac.reset()    # resets optimization variables (not model variables)

  if args.mode != 'run':
    opt = tf.train.AdamOptimizer(0.001)
  else:
    opt = tf.train.AdamOptimizer(args.lr)
  grads_and_vars = opt.compute_gradients(model.loss,
                                         var_list=model.trainable_vars)
      
  grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
  grad_new = kfac.correct(grad)
  with u.capture_vars() as adam_vars:
    train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
  with u.timeit("init/adam"):
    sessrun([v.initializer for v in adam_vars])
  
  losses = []
  u.record_time()

  start_time = time.time()
  vloss0 = 0

  # todo, unify the two data outputs
  outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda)
  writer = u.BufferedWriter(outfn, 60)   # get rid?

  start_time = time.time()
  if args.extra_kfac_batch_advance:
    kfac.model.advance_batch()  # advance kfac batch

  if args.kfac_async:
    kfac.start_stats_runners()
    
  for step in range(args.num_steps):
    
    if args.validate_every_n and step%args.validate_every_n == 0:
      loss0, vloss0 = sessrun([model.loss, model.vloss])
    else:
      loss0, = sessrun([model.loss])
    losses.append(loss0)  # TODO: remove this

    logger('loss/loss', loss0, 'loss/vloss', vloss0)
    
    elapsed = time.time()-start_time
    print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0,
                                                     vloss0))
    writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0))

    if args.method=='kfac' and not args.kfac_async:
      kfac.model.advance_batch()
      kfac.update_stats()

    with u.timeit("train"):
      model.advance_batch()
      grad.update()
      with kfac.read_lock():
        grad_new.update()
      train_op.run()
      u.record_time()

    logger.next_step()

  # TODO: use u.global_runs_dir
  # TODO: get rid of u.timeit?
  
  with open('timelines/graphdef.txt', 'w') as f:
    f.write(str(u.get_default_graph().as_graph_def()))

  u.summarize_time()
  
  if args.mode == 'record':
    u.dump_with_prompt(losses, release_test_fn)

  elif args.mode == 'test':
    targets = np.loadtxt('data/'+release_test_fn, delimiter=",")
    u.check_equal(losses, targets, rtol=1e-2)
    u.summarize_difference(losses, targets)
示例#33
0
def run():
	timeit(pi_serial, "serial")
示例#34
0
def run_nn(model, loaders, model_name, prt_model = False, case = None):
    print('==='+ model_name +'===<start>')
    
    total = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# of para: {}'.format(total))
    tr, val, te = loaders

    e = executer(model, model_name, arg, prt_model)
    e.train(tr, val, case)
    
    e.validate(te)
    print('Accuracy: {}'.format(e.hist['val_acc']))

    with open("case" + str(case) + '/' + model_name + ".txt", "a") as f:
        print('Case:', str(case), file=f)
        print('Time:', str(timeit()), file=f)
        print('Accuracy: {}'.format(e.hist['val_acc']), file=f)
        print('F1-Score: {}\n'.format(e.f1), file=f)

##Experiments
#------
'''
Case 0: Original setting on datasets(rt)
'''
#x = rt.x
#lbl = rt.y

#x_encoded = ohe().fit_transform_pretrained(x, w2v.getDict())
#loaders = handler().split_tvt((x_encoded, lbl))

#run_nn(models.Transformer(2, 2, d, 1, 1, 2, 256), loaders, 'Transformer.pt', model, 0)
示例#35
0
 def main2(cnt):
     timeit(cPickle.dump, v, open("ESV/pickle", "w"), times=cnt)
     timeit(cPickle.load, open("ESV/pickle"), times=cnt)
示例#36
0
	def main2(cnt):
		timeit(cPickle.dump, v, open("ESV/pickle", "w"), times=cnt)
		timeit(cPickle.load, open("ESV/pickle"), times=cnt)
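
The two cPickle snippets above pass the callable, its positional arguments, and a times keyword straight to timeit (note they are Python 2 code; on Python 3 the pickle files would need binary mode). A hedged sketch of a repeat-call timer with that calling convention, not the helper these projects actually use, is:

import time


def timeit(fn, *args, times=1, **kwargs):
    # Sketch only: call fn(*args, **kwargs) `times` times and report the total elapsed time.
    start = time.perf_counter()
    result = None
    for _ in range(times):
        result = fn(*args, **kwargs)
    name = getattr(fn, "__name__", repr(fn))
    print("%s x%d: %.3f s" % (name, times, time.perf_counter() - start))
    return result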
示例#37
0
def test_main():

    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')

    parser.add_argument('--wandb',
                        type=int,
                        default=1,
                        help='log to weights and biases')
    parser.add_argument('--autograd_check',
                        type=int,
                        default=0,
                        help='autograd correctness checks')
    parser.add_argument('--logdir',
                        type=str,
                        default='/temp/runs/curv_train_tiny/run')

    parser.add_argument('--train_batch_size', type=int, default=100)
    parser.add_argument('--stats_batch_size', type=int, default=60000)
    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps',
                        type=int,
                        default=100,
                        help="this many train steps between stat collection")
    parser.add_argument('--stats_steps',
                        type=int,
                        default=1000000,
                        help="total number of curvature stats collections")
    parser.add_argument('--nonlin',
                        type=int,
                        default=1,
                        help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--method',
                        type=str,
                        choices=['gradient', 'newton'],
                        default='gradient',
                        help="descent method, newton or gradient")
    parser.add_argument('--layer',
                        type=int,
                        default=-1,
                        help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument('--lmb', type=float, default=1e-3)
    parser.add_argument(
        '--hess_samples',
        type=int,
        default=1,
        help='number of samples when sub-sampling outputs, 0 for exact hessian'
    )
    parser.add_argument('--hess_kfac',
                        type=int,
                        default=0,
                        help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho',
                        type=int,
                        default=1,
                        help='use expensive method to compute rho')
    parser.add_argument('--skip_stats',
                        type=int,
                        default=0,
                        help='skip all stats collection')
    parser.add_argument('--full_batch',
                        type=int,
                        default=0,
                        help='do stats on the whole dataset')
    parser.add_argument('--weight_decay', type=float, default=1e-4)

    #args = parser.parse_args()
    args = AttrDict()
    args.lmb = 1e-3
    args.compute_rho = 1
    args.weight_decay = 1e-4
    args.method = 'gradient'
    args.logdir = '/tmp'
    args.data_width = 2
    args.targets_width = 2
    args.train_batch_size = 10
    args.full_batch = False
    args.skip_stats = False
    args.autograd_check = False

    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    #gl.event_writer = SummaryWriter(logdir)
    gl.event_writer = u.NoOp()
    # print(f"Logging to {run_name}")

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width**2
    d2 = 2
    d3 = args.targets_width**2

    d1 = args.data_width**2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size) + 1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    # optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=dsize,
                          original_targets=True)

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.stats_batch_size,
            shuffle=False,
            drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               train=False,
                               dataset_size=dsize,
                               original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.train_batch_size,
                                              shuffle=False,
                                              drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            # print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i] > 50 or d[i + 1] > 50:
                print(
                    f'layer {i} is too big ({d[i], d[i + 1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape),
                          layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]),
                              layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel(
            )  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1] * d[i]).to(
                gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                # cholesky_inverse expects a Cholesky factor, so factor the damped Hessian first
                # (torch.linalg.cholesky on newer PyTorch)
                invH = torch.cholesky_inverse(torch.cholesky(H + lambda_regularizer))

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1],
                                                d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) -
                                          0.5 * eps**2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() /
                                          (dd.flatten().norm()**2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = H.pinverse()

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre',
                        pinvH)  # save Newton preconditioner
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm(
                )  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(
                    g @ pinvH @ g.t() / 2)  # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_spectral(H, sigma)

                discrepancy = torch.max(abs(p_sigma - p_sigma.t()) / p_sigma)  # relative asymmetry of the Lyapunov solution (diagnostic only, unused below)

                s.psigma_erank = u.sym_erank(p_sigma)
                s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro")**2 / torch.norm(g)**2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                #            u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1 - args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(
                            u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(
                            param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(
                            param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()

    assert val_losses[0] > 2.4  # 2.4828238487243652
    assert val_losses[-1] < 2.25  # 2.20609712600708
示例#38
0
def run():
	"""
	timed runs
	"""
	timeit(partial(calculate_pi_opencl, N), "calculate_pi_opencl")
	timeit(partial(calculate_pi_opencl_simple, N), "calculate_pi_opencl_simple")