def scale_momentum(model, feats_dir, steps, **kwargs):
    """Compare empirical and theoretical scale-symmetry dynamics for SGD
    with momentum.

    Returns a dict with "empirical" and "theoretical" entries, each mapping
    conv-layer name -> {step: value}.
    """
    # Hyperparameters arrive via kwargs; cast to extended precision to limit
    # round-off in the exp/trig terms of the theoretical solution.
    # NOTE(review): np.float128 is platform dependent (alias of longdouble,
    # unavailable on e.g. Windows) — confirm target platforms.
    lr = np.array(kwargs.get("lr"), dtype=np.float128)
    wd = np.array(kwargs.get("wd"), dtype=np.float128)
    momentum = np.array(kwargs.get("momentum"), dtype=np.float128)
    dampening = np.array(kwargs.get("dampening"), dtype=np.float128)

    # Damped-oscillator coefficients of the continuous-time model.
    denom = lr * (1 - dampening) * (1 + momentum)
    gamma = (1 - momentum) / denom
    omega = np.sqrt(4 * wd / denom)

    layers = [layer for layer in utils.get_layers(model) if "conv" in layer]

    # Parameters at the first recorded step fix the initial condition.
    W_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="weight",
        group="params",
    )
    b_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="bias",
        group="params",
    )

    load_kwargs = {"model": model, "feats_dir": feats_dir}
    theory_kwargs = {
        "lr": lr,
        "wd": wd,
        "momentum": momentum,
        "dampening": dampening,
        "gamma": gamma,
        "omega": omega,
        "W_0": W_0,
        "b_0": b_0,
        "step_0": steps[0],
    }

    theoretical = {layer: {} for layer in layers}
    empirical = {layer: {} for layer in layers}
    for i, step in enumerate(tqdm(steps)):
        theory_kwargs["i"] = i
        # Theory reads integral buffers; empirical reads raw parameters.
        load_kwargs["group"] = "buffers"
        compute_theoretical_momentum(
            step, layers, load_kwargs, theoretical, **theory_kwargs,
        )
        load_kwargs["group"] = "params"
        compute_empirical(step, layers, load_kwargs, empirical)
    return {"empirical": empirical, "theoretical": theoretical}
def gradient(model, feats_dir, steps, **kwargs):
    """Collect stored gradient-norm buffers for every conv layer.

    Returns {"empirical": {layer: {step: value}}}. Step 0 is skipped since
    no gradient buffer exists before the first update.
    """
    layers = [layer for layer in utils.get_layers(model) if "conv" in layer]
    empirical = {layer: {} for layer in layers}
    for step in tqdm(steps):
        if step == 0:
            continue  # nothing recorded before the first optimizer step
        weight_buffers = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="weight.grad_norm_buffer",
            group="buffers",
        )
        bias_buffers = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="bias.grad_norm_buffer",
            group="buffers",
        )
        key = f"step_{step}"
        for layer in layers:
            w_buf = weight_buffers[layer][key]
            b_buf = bias_buffers[layer][key]
            empirical[layer][step] = utils.in_synapses(w_buf, b_buf)
    return {"empirical": empirical}
def compute_empirical(step, layers, load_kwargs, empirical):
    """Record the per-layer empirical value at `step`: utils.in_synapses of
    the squared weights and squared biases. Mutates `empirical` in place.
    """
    weights = utils.load_features(steps=[str(step)], suffix="weight", **load_kwargs)
    biases = utils.load_features(steps=[str(step)], suffix="bias", **load_kwargs)
    key = f"step_{step}"
    for layer in layers:
        w_sq = weights[layer][key] ** 2
        b_sq = biases[layer][key] ** 2
        empirical[layer][step] = utils.in_synapses(w_sq, b_sq)
def compute_empirical(step, layers, load_kwargs, empirical):
    """Record the per-layer empirical value at `step`: utils.out_synapses of
    the weight matrix with the bias column appended. Mutates `empirical`.
    """
    weights = utils.load_features(steps=[str(step)], suffix="weight", **load_kwargs)
    biases = utils.load_features(steps=[str(step)], suffix="bias", **load_kwargs)
    key = f"step_{step}"
    for layer in layers:
        # Treat the bias as one extra column of incoming weights.
        augmented = np.column_stack((weights[layer][key], biases[layer][key]))
        empirical[layer][step] = utils.out_synapses(augmented)
def translation(model, feats_dir, steps, **kwargs):
    """Compare empirical and theoretical translation-symmetry dynamics for
    the classifier layers.

    Returns a dict with "empirical" and "theoretical" entries, each mapping
    classifier-layer name -> {step: value}.
    """
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")
    layers = [layer for layer in utils.get_layers(model) if "classifier" in layer]

    # Parameters at the first recorded step fix the initial condition.
    W_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="weight",
        group="params",
    )
    b_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="bias",
        group="params",
    )

    load_kwargs = {"model": model, "feats_dir": feats_dir}
    theory_kwargs = {
        "lr": lr,
        "wd": wd,
        "W_0": W_0,
        "b_0": b_0,
        "step_0": steps[0],
    }

    theoretical = {layer: {} for layer in layers}
    empirical = {layer: {} for layer in layers}
    for i, step in enumerate(tqdm(steps)):
        theory_kwargs["i"] = i
        # Theory reads integral buffers; empirical reads raw parameters.
        load_kwargs["group"] = "buffers"
        compute_theoretical(step, layers, load_kwargs, theoretical, **theory_kwargs)
        load_kwargs["group"] = "params"
        compute_empirical(step, layers, load_kwargs, empirical)
    return {"empirical": empirical, "theoretical": theoretical}
def compute_theoretical(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, W_0, b_0,
):
    """Theoretical translation-symmetry prediction at `step` for every pair
    of consecutive layers: out_synapses of the later layer's predicted
    squared weights minus in_synapses of the earlier layer's predicted
    squared weights/biases. Mutates `theoretical` in place.
    """
    t = lr * step
    decay = np.exp(-2 * wd * t)  # weight-decay attenuation e^{-2*wd*t}
    if i > 0:
        # Integral buffers only exist after the first recorded step.
        weight_buffers = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer", **load_kwargs,
        )
        bias_buffers = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer", **load_kwargs,
        )

    def predicted(layer):
        # Decayed initial condition plus (after step 0) the decayed
        # gradient-integral contribution scaled by lr^2.
        w = decay * W_0[layer][f"step_{step_0}"] ** 2
        b = decay * b_0[layer][f"step_{step_0}"] ** 2
        if i > 0:
            w = w + (lr ** 2) * decay * weight_buffers[layer][f"step_{step}"]
            b = b + (lr ** 2) * decay * bias_buffers[layer][f"step_{step}"]
        return w, b

    W_in, b_in = predicted(layers[0])
    for layer in layers[1:]:
        W_out, b_out = predicted(layer)
        theoretical[layer][step] = utils.out_synapses(W_out) - utils.in_synapses(
            W_in, b_in
        )
        # Slide the window: this layer becomes the "incoming" side next.
        W_in, b_in = W_out, b_out
def extract_weights_and_grads(step, layers, load_kwargs, weights_and_grads, **kwargs):
    """Append each layer's flattened parameters and stored gradients at
    `step` to the running accumulator.

    Parameters
    ----------
    step : training step whose features are loaded.
    layers : iterable of layer names to extract.
    load_kwargs : extra keyword arguments forwarded to utils.load_features
        (e.g. model, feats_dir).
    weights_and_grads : dict layer -> {"weight": [...], "grad": [...]};
        mutated in place — one new flattened array per list per call.
    **kwargs : accepted only for signature compatibility with sibling
        extractors; no keys are read. (The original fetched "lr"/"wd" into
        locals that were never used — dead code, removed.)
    """
    weights = utils.load_features(
        steps=[str(step)], suffix="weight", group="params", **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)], suffix="bias", group="params", **load_kwargs,
    )
    weight_buffers = utils.load_features(
        steps=[str(step)], suffix="weight.grad_buffer", group="buffers", **load_kwargs,
    )
    bias_buffers = utils.load_features(
        steps=[str(step)], suffix="bias.grad_buffer", group="buffers", **load_kwargs,
    )
    key = f"step_{step}"
    for layer in layers:
        Wl_t = weights[layer][key]
        bl_t = biases[layer][key]
        # Weights and bias are concatenated into one flat parameter vector.
        weights_and_grads[layer]["weight"].append(
            np.concatenate((Wl_t.flatten(), bl_t.flatten()))
        )
        g_Wl_t = weight_buffers[layer][key]
        g_bl_t = bias_buffers[layer][key]
        weights_and_grads[layer]["grad"].append(
            np.concatenate((g_Wl_t.flatten(), g_bl_t.flatten()))
        )
def compute_empirical(step, layers, load_kwargs, empirical):
    """Empirical out-minus-in statistic at `step` for each pair of
    consecutive layers: out_synapses of the later layer's squared weights
    minus in_synapses of the earlier layer's squared weights/biases.
    Mutates `empirical` in place.
    """
    weights = utils.load_features(steps=[str(step)], suffix="weight", **load_kwargs)
    biases = utils.load_features(steps=[str(step)], suffix="bias", **load_kwargs)
    key = f"step_{step}"
    W_in = weights[layers[0]][key] ** 2
    b_in = biases[layers[0]][key] ** 2
    for layer in layers[1:]:
        W_out = weights[layer][key] ** 2
        b_out = biases[layer][key] ** 2
        empirical[layer][step] = utils.out_synapses(W_out) - utils.in_synapses(
            W_in, b_in
        )
        # Slide the window: this layer becomes the "incoming" side next.
        W_in, b_in = W_out, b_out
def compute_pos_vel(step, layers, load_kwargs, position, velocity, **kwargs):
    """Record each layer's position (in_synapses of squared parameters) and
    its predicted velocity at `step`. Mutates `position`/`velocity`.

    kwargs must provide "lr" (learning rate) and "wd" (weight decay).
    """
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")
    weights = utils.load_features(
        steps=[str(step)], suffix="weight", group="params", **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)], suffix="bias", group="params", **load_kwargs,
    )
    weight_buffers = utils.load_features(
        steps=[str(step)], suffix="weight.grad_norm_buffer", group="buffers", **load_kwargs,
    )
    bias_buffers = utils.load_features(
        steps=[str(step)], suffix="bias.grad_norm_buffer", group="buffers", **load_kwargs,
    )
    key = f"step_{step}"
    for layer in layers:
        pos = utils.in_synapses(weights[layer][key] ** 2, biases[layer][key] ** 2)
        position[layer][step] = pos
        grad_term = utils.in_synapses(weight_buffers[layer][key], bias_buffers[layer][key])
        # -2*lambda*|theta|^2 + eta*(|g|^2 - lambda^2*|theta|^2)
        # rearranged as eta*|g|^2 - (2*lambda + eta*lambda^2)*|theta|^2
        velocity[layer][step] = lr * grad_term - (2 * wd + lr * wd ** 2) * pos
def network(model, feats_dir, steps, **kwargs):
    """Sample a reproducible random subset of each layer's parameters
    (weights and biases flattened together) at every step.

    kwargs: "subset" (max sample size per layer; None keeps everything) and
    "seed" (RNG seed, default 0).
    Returns {"empirical": {layer: {step: 1-D array}}}.
    """
    subset = kwargs.get("subset", None)
    seed = kwargs.get("seed", 0)
    layers = list(utils.get_layers(model))
    empirical = {layer: {} for layer in layers}
    for step in steps:
        weights = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="weight",
            group="params",
        )
        biases = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="bias",
            group="params",
        )
        # Re-seed at every step so the identical index sequence is drawn,
        # i.e. the same parameters are tracked across all steps.
        np.random.seed(seed)
        key = f"step_{step}"
        for layer in layers:
            flat = np.concatenate(
                (weights[layer][key].reshape(-1), biases[layer][key].reshape(-1))
            )
            if subset is None:
                chosen = np.arange(len(flat))
            else:
                chosen = np.random.choice(
                    len(flat), size=min(subset, len(flat)), replace=False
                )
            empirical[layer][step] = flat[chosen]
    return {"empirical": empirical}
def compute_theoretical(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, W_0, b_0,
):
    """Theoretical scale-symmetry prediction (SGD without momentum) at
    `step`: the initial squared norms decayed by e^{-2*wd*t}, plus — after
    the first recorded step — the decayed gradient integral scaled by lr^2.
    Mutates `theoretical` in place.
    """
    t = lr * step
    decay = np.exp(-2 * wd * t)
    if i > 0:
        # Integral buffers only exist after the first recorded step.
        weight_buffers = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer", **load_kwargs,
        )
        bias_buffers = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer", **load_kwargs,
        )
    key_0 = f"step_{step_0}"
    key_t = f"step_{step}"
    for layer in layers:
        prediction = decay * utils.in_synapses(
            W_0[layer][key_0] ** 2, b_0[layer][key_0] ** 2
        )
        if i > 0:
            prediction += (lr ** 2) * decay * utils.in_synapses(
                weight_buffers[layer][key_t], bias_buffers[layer][key_t]
            )
        theoretical[layer][step] = prediction
def compute_theoretical_momentum(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, momentum, dampening, omega, gamma, W_0, b_0,
):
    """Theoretical scale-symmetry prediction at `step` for SGD with momentum.

    Models the squared parameter norms as a damped harmonic oscillator with
    damping `gamma` and natural frequency `omega`, handling all three regimes
    (under-, critically, and over-damped). The homogeneous solution scales the
    initial squared norms; after the first recorded step, two gradient
    integral buffers add the particular (driven) solution. Mutates
    `theoretical` in place.
    """
    # Continuous time uses the effective step size lr * (1 - dampening).
    t = lr * (1 - dampening) * step
    if i > 0:
        # Integral buffers only exist after the first recorded step.
        weight_buffers_1 = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer_1", **load_kwargs,
        )
        bias_buffers_1 = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer_1", **load_kwargs,
        )
        weight_buffers_2 = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer_2", **load_kwargs,
        )
        bias_buffers_2 = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer_2", **load_kwargs,
        )
    for layer in layers:
        Wl_0 = W_0[layer][f"step_{step_0}"]
        bl_0 = b_0[layer][f"step_{step_0}"]
        # Homogeneous-solution scale factor by damping regime.
        if gamma < omega:
            # Under-damped: decaying oscillation at sqrt(omega^2 - gamma^2).
            cos = np.cos(np.sqrt(omega ** 2 - gamma ** 2) * t)
            sin = np.sin(np.sqrt(omega ** 2 - gamma ** 2) * t)
            scale = np.exp(-gamma * t) * (
                cos + gamma / np.sqrt(omega ** 2 - gamma ** 2) * sin
            )
        elif gamma == omega:
            # Critically damped.
            scale = np.exp(-gamma * t) * (1 + gamma * t)
        else:
            # Over-damped: mix of the two real decay rates alpha_+/alpha_-.
            alpha_p = -gamma + np.sqrt(gamma ** 2 - omega ** 2)
            alpha_m = -gamma - np.sqrt(gamma ** 2 - omega ** 2)
            numer = alpha_p * np.exp(alpha_m * t) - alpha_m * np.exp(alpha_p * t)
            denom = alpha_p - alpha_m
            scale = numer / denom
        theoretical[layer][step] = scale * utils.in_synapses(
            Wl_0 ** 2, bl_0 ** 2, dtype=np.float128
        )
        if i > 0:
            g_Wl_t_1 = weight_buffers_1[layer][f"step_{step}"]
            g_bl_t_1 = bias_buffers_1[layer][f"step_{step}"]
            g_Wl_t_2 = weight_buffers_2[layer][f"step_{step}"]
            g_bl_t_2 = bias_buffers_2[layer][f"step_{step}"]
            # Green's-function weights for the two integral buffers, again
            # split by damping regime.
            if gamma < omega:
                sqrt = np.sqrt(omega ** 2 - gamma ** 2)
                scale_1 = np.exp(-gamma * t) * np.sin(sqrt * t) / sqrt
                scale_2 = -np.exp(-gamma * t) * np.cos(sqrt * t) / sqrt
            elif gamma == omega:
                scale_1 = np.exp(-gamma * t) * t
                scale_2 = -np.exp(-gamma * t)
            else:
                sqrt = np.sqrt(gamma ** 2 - omega ** 2)
                alpha_p = -gamma + sqrt
                alpha_m = -gamma - sqrt
                scale_1 = np.exp(alpha_p * t) / (alpha_p - alpha_m)
                scale_2 = -np.exp(alpha_m * t) / (alpha_p - alpha_m)
            # NOTE(review): this is 2 * lr * (1 - dampening). The momentum-free
            # analog (compute_theoretical) uses lr ** 2, so a squared effective
            # step (lr * (1 - dampening)) ** 2 may have been intended here —
            # verify "* 2" vs "** 2" against the derivation / original source.
            scale = (lr * (1 - dampening)) * 2
            # Guard against overflow/NaN in the accumulated buffers before
            # adding each driven-solution term.
            if np.all(np.isfinite(g_Wl_t_1)) and np.all(np.isfinite(g_bl_t_1)):
                theoretical[layer][step] += (
                    scale * scale_1 * utils.in_synapses(g_Wl_t_1, g_bl_t_1, dtype=np.float128)
                )
            if np.all(np.isfinite(g_Wl_t_2)) and np.all(np.isfinite(g_bl_t_2)):
                theoretical[layer][step] += (
                    scale * scale_2 * utils.in_synapses(g_Wl_t_2, g_bl_t_2, dtype=np.float128)
                )