def testL2(self):
    """Check that the sketch's L2 estimate matches the true vector norm.

    A random 5-element vector is accumulated into a sketch with 10000
    columns and 20 rows; ``l2estimate()`` must then be within 1e-4 of
    ``vec.norm()``.
    """
    dim, numCols, numRows = 5, 10000, 20
    tolerance = 0.0001

    vec = torch.randn(dim).to(self.device)
    sketch = CSVec(dim, numCols, numRows, **self.csvecArgs)
    sketch += vec

    error = (sketch.l2estimate() - vec.norm()).abs()
    self.assertTrue(error < tolerance)
def testMedian(self):
    """Check CSVec.median over three sketches of shifted ranges.

    Sketch ``i`` accumulates ``arange(d) + i`` for i in {0, 1, 2}; the
    test expects unsketching the median to give ``arange(d) + 1``.
    """
    dim, numCols, numRows = 5, 10000, 20

    sketches = []
    for _ in range(3):
        sketches.append(CSVec(dim, numCols, numRows, **self.csvecArgs))

    for offset, sketch in enumerate(sketches):
        shifted = torch.arange(dim).float().to(self.device) + offset
        sketch.accumulateVec(shifted)

    recovered = CSVec.median(sketches).unSketch(k=dim)
    expected = torch.arange(dim).float().to(self.device) + 1
    self.assertTrue(torch.allclose(recovered, expected))
def testZeroSketch(self):
    """Check that zero() resets a sketch table that held data."""
    dim, numCols, numRows = 100, 20, 5

    sketch = CSVec(dim, numCols, numRows, **self.csvecArgs)
    sketch += torch.rand(dim).to(self.device)

    emptyTable = torch.zeros((numRows, numCols)).to(self.device)

    # after accumulating a random vector the table must not be all zeros
    self.assertFalse(torch.allclose(sketch.table, emptyTable))

    # ... and zero() must bring it back to all zeros
    sketch.zero()
    self.assertTrue(torch.allclose(sketch.table, emptyTable))
def testRandomness(self):
    """Check that two sketches built with the same arguments agree.

    Two ``CSVec`` instances constructed with identical parameters must
    have equal ``signs`` and ``buckets``; when ``self.numBlocks > 1``
    their ``blockOffsets`` and ``blockSigns`` must match as well.
    """
    # make sure two sketches get the same hashes and signs
    d = 100
    c = 20
    r = 5
    a = CSVec(d, c, r, **self.csvecArgs)
    b = CSVec(d, c, r, **self.csvecArgs)

    attrs = ["signs", "buckets"]
    if self.numBlocks > 1:
        # these attributes are only compared for blocked sketches
        attrs += ["blockOffsets", "blockSigns"]

    # each attribute is checked exactly once, and reported separately
    for attr in attrs:
        with self.subTest(attr=attr):
            self.assertTrue(torch.allclose(getattr(a, attr),
                                           getattr(b, attr)))
def testSketchSum(self):
    """Check that summing sketches of unit vectors recovers all ones.

    Each of the ``d`` one-hot vectors is sketched separately and the
    sketches are added into one; unsketching the sum with ``k=d`` is
    expected to give a vector of ones.
    """
    dim, numCols, numRows = 5, 10000, 20

    total = CSVec(dim, numCols, numRows, **self.csvecArgs)
    for hot in range(dim):
        oneHot = torch.zeros(dim).to(self.device)
        oneHot[hot] = 1
        single = CSVec(dim, numCols, numRows, **self.csvecArgs)
        single += oneHot
        total += single

    recovered = total.unSketch(k=dim)
    expected = torch.ones(dim).to(self.device)
    self.assertTrue(torch.allclose(recovered, expected))
def testInit(self):
    """Check that a freshly constructed sketch has an all-zero table."""
    dim, numCols, numRows = 100, 20, 5
    sketch = CSVec(dim, numCols, numRows, **self.csvecArgs)

    # the table is compared against an (r, c) block of zeros
    expected = torch.zeros(numRows, numCols).to(self.device)
    self.assertTrue(torch.allclose(sketch.table, expected))
def testSketchVec(self):
    """Sketch a one-hot vector and check the resulting table.

    The input is all zeros except a single 1, so every row of the
    table should hold exactly one nonzero entry whose magnitude is 1.
    """
    dim, numCols, numRows = 100, 1, 5
    sketch = CSVec(d=dim, c=numCols, r=numRows, **self.csvecArgs)

    oneHot = torch.zeros(dim).to(self.device)
    oneHot[0] = 1
    sketch.accumulateVec(oneHot)

    # every row must contain exactly one nonzero entry
    for rowIdx in range(numRows):
        with self.subTest(row=rowIdx):
            nonzeroCount = sketch.table[rowIdx, :].nonzero().numel()
            self.assertEqual(nonzeroCount, 1)

    # the absolute values of each row must add up to 1 (entry is +-1)
    rowMass = sketch.table.abs().sum(dim=1).view(-1)
    expected = torch.ones(numRows).to(self.device)
    self.assertTrue(torch.allclose(rowMass, expected))
def testSameBuckets(self):
    """Check that TopHCS with h = 0 sketches exactly like a plain CSVec.

    The same random vector is accumulated into a ``CSVec`` and into a
    ``TopHCS`` built with ``h = 0``; the test expects the two sketch
    tables to be equal.
    """
    d = 100
    c = 20
    r = 5
    h = 0

    # keep the input on the same device as the sketches, like the
    # other tests in this suite do
    vec = torch.randn(d).to(self.device)

    a = CSVec(d, c, r, **self.csvecArgs)
    a += vec

    b = TopHCS(h=h, d=d, c=c, r=r, **self.csvecArgs)
    # TopHCS.store(...) was renamed to accumulateVec(...)
    b.accumulateVec(vec)

    self.assertTrue(torch.allclose(a.table, b.csvec.table))
def testUnsketch(self):
    """Check heavy-hitter recovery through both unSketch modes.

    A gigantic sketch (10000 columns for 5 coordinates) is used so
    there is no chance of collision; both the top-k and the epsilon
    variants are expected to return the original vector.
    """
    dim, numCols, numRows = 5, 10000, 20
    sketch = CSVec(dim, numCols, numRows, **self.csvecArgs)
    vec = torch.rand(dim).to(self.device)
    sketch += vec

    with self.subTest(method="topk"):
        topkResult = sketch.unSketch(k=dim)
        self.assertTrue(torch.allclose(topkResult, vec))

    with self.subTest(method="epsilon"):
        # threshold sits at 90% of the smallest magnitude in vec,
        # expressed relative to the vector's norm
        threshold = vec.abs().min() * 0.9
        epsResult = sketch.unSketch(epsilon=threshold / vec.norm())
        self.assertTrue(torch.allclose(epsResult, vec))
class TopHCS(object):
    """Represents one worker.

    Holds a dense ``topH`` tensor of length ``d`` together with a
    ``CSVec`` sketch; ``accumulateVec`` splits an incoming vector
    between the two.
    """

    def __init__(self, d, c, r, h, numBlocks, device='cpu'):
        """Create an empty worker.

        Args:
            d: length of the vectors this worker accepts
            c: number of columns passed to the CSVec
            r: number of rows passed to the CSVec
            h: passed as the second argument of ``topk`` in
                ``accumulateVec``
            numBlocks: forwarded to the CSVec constructor
            device: device for ``topH`` and the CSVec
        """
        self.h, self.d = h, d
        self.device = device
        self.topH = torch.zeros(d, dtype=torch.float, device=self.device)
        self.csvec = CSVec(d=d, c=c, r=r,
                           numBlocks=numBlocks, device=self.device)

    def zero(self):
        """Clear csvec and topH tensor."""
        self.csvec.zero()
        self.topH = torch.zeros(self.d, dtype=torch.float,
                                device=self.device)

    # formerly store(...)
    def accumulateVec(self, vec):
        """Compress a vector.

        Saves ``topk(vec, self.h)`` in ``self.topH`` and sketches the
        remainder ``vec - self.topH`` into ``self.csvec``.

        ``self.topH`` is overwritten, not accumulated. The original
        "topH must be zero before storing" assertion was dropped for
        the commefficient optimizer, so callers wanting a fresh state
        should call ``zero()`` first.
        """
        # move the input first so the subtraction below never mixes
        # devices
        vec = vec.to(self.device)
        self.topH = topk(vec, self.h).to(self.device)
        self.csvec.accumulateVec(vec - self.topH)

    def accumulateTable(self, table):
        """Add a raw sketch table into this worker's CSVec.

        Raises:
            ValueError: if ``table`` does not have the same size as
                ``self.csvec.table``
        """
        if table.size() != self.csvec.table.size():
            msg = "Passed in table has size {}, expecting {}"
            raise ValueError(msg.format(table.size(),
                                        self.csvec.table.size()))
        self.csvec.accumulateTable(table)

    @classmethod
    def topKSum(cls, workers, k, unSketchNum=0):
        """Combine several workers and return ``topk`` of their sum.

        The workers' sketch tables are summed into a zeroed copy of the
        first worker's CSVec, their ``topH`` tensors are summed
        densely, and ``topk(topHSum + unSketchedSum, k)`` is returned.

        Args:
            workers: non-empty list of TopHCS instances
            k: passed as the second argument of the final ``topk``
            unSketchNum: ``k`` used when unsketching the summed
                sketch; 0 means use ``len(topHSum)``

        Raises:
            ValueError: if ``workers`` is empty, or if the unsketched
                sum and the topH sum have different sizes
        """
        assert isinstance(workers, list), "workers must be a list"
        if not workers:
            raise ValueError("workers must contain at least one worker")

        # deep-copy so the first worker's own sketch is left untouched
        sketchSum = copy.deepcopy(workers[0].csvec)
        sketchSum.zero()
        topHSum = torch.zeros_like(workers[0].topH)
        for w in workers:
            sketchSum.accumulateTable(w.csvec.table)
            topHSum += w.topH

        if unSketchNum == 0:
            unSketchNum = len(topHSum)
        unSketchedSum = sketchSum.unSketch(k=unSketchNum)

        if topHSum.size() != unSketchedSum.size():
            msg = "topHSum has size {}, unSketchedSum size {}"
            raise ValueError(msg.format(topHSum.size(),
                                        unSketchedSum.size()))
        return topk(topHSum + unSketchedSum, k)
def __init__(self, opt, c, r, numWorkers, numBlocks=1, method="sketch"):
    """SketchedSum constructor

    Args:
        opt: an instance of torch.optim.SGD whose momentum and
             weight decay we want to emulate
        c: number of columns in the sketch
        r: numbers of rows in the sketch
        numWorkers: how many workers to divide the gradient
                    computation among
        numBlocks: memory optimization for the sketch (higher means
                   less memory used, but randomness becomes correlated)
        method: which communication protocol to use. Options:
            sketch: send a sketch of the v vectors
            trueTopk: send the whole v vector and use the topk of
                      the sum of vs over workers as the weight update
            localTopk: send and then sum the local topk of each
                       worker's v vector
            randomK: send a random set of k coordinates
            signum: signSGD with majority vote
            Pkk: send local top-Pk, of which topk is used as the
                 weight update

    Raises:
        ValueError: if ``method`` is not one of the options above
    """
    self.opt = opt
    self.c = c
    self.r = r
    self.numWorkers = numWorkers

    # exactly one communication method is selected, by name
    methods = ["sketch", "trueTopk", "localTopk",
               "randomK", "signum", "Pkk"]
    if method not in methods:
        msg = "Invalid method {}. Valid options are {}"
        raise ValueError(msg.format(method, ",".join(methods)))
    self.method = method

    # used for debugging
    self._doSlowSketching = True

    # self.modelDevice is not tested... not sure what happens if
    # the model is on the CPU
    if opt.param_groups[0]["params"][0].is_cuda:
        self.modelDevice = "cuda"
    else:
        self.modelDevice = "cpu"
    self.device = self.modelDevice
    print("device", self.device)

    D = 0
    sketchMask = []
    for group in opt.param_groups:
        for p in group["params"]:
            if p.requires_grad:
                # numel() is a plain int even for 0-dim parameters,
                # where np.prod(shape) would yield the float 1.0 and
                # break torch.ones / torch.zeros below
                size = p.numel()
                if p.do_sketching:
                    sketchMask.append(torch.ones(size))
                else:
                    sketchMask.append(torch.zeros(size))
                D += size
    self.D = D

    # a mask indicating which gradient elements we should sketch
    # and which we should send without compression (e.g. bias terms,
    # maybe early layers, etc.)
    self.sketchMask = torch.cat(sketchMask).bool().to(self.device)

    print("D: {}".format(D))
    print("sketchMask.sum(): {}".format(self.sketchMask.sum()))

    self.us = [torch.zeros(D, device=self.device)
               for _ in range(numWorkers)]
    self.vs = [torch.zeros(D, device=self.device)
               for _ in range(numWorkers)]

    # don't need sketches for true/local/random topk
    if self.method == "sketch":
        print("making sketches")
        # dimensionality of the sketch (d) is the number of gradient
        # elements that we're going to sketch, i.e. sketchMask.sum()
        sketchDim = self.sketchMask.sum().item()
        self.workerSketches = [CSVec(d=sketchDim, c=c, r=r,
                                     device=self.device,
                                     numBlocks=numBlocks)
                               for _ in range(numWorkers)]
    else:
        print("not making sketches")
def args2sketch(args):
    """Build a CSVec from the sketch settings stored on ``args``.

    Reads ``grad_size``, ``num_cols``, ``num_rows``, ``device`` and
    ``num_blocks`` from ``args`` and forwards them as ``d``, ``c``,
    ``r``, ``device`` and ``numBlocks`` respectively.
    """
    sketchKwargs = dict(
        d=args.grad_size,
        c=args.num_cols,
        r=args.num_rows,
        device=args.device,
        numBlocks=args.num_blocks,
    )
    return CSVec(**sketchKwargs)
# Sweep settings: k values to recover, h values for TopHCS, sketch widths.
kVals = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
hVals = [0, 1.0]
cVals = [50, 100, 1000, 10000]
# NOTE: the column sweep above is immediately replaced by this single width
cVals = [20]

for p, cols in enumerate(cVals):
    # one slot per k for the plain sketch, one per (h, k) for TopHCS
    csvec_accuracy = np.zeros(len(kVals))
    topHCS_accuracy = np.zeros((len(hVals), len(kVals)))

    for k_i, k in enumerate(kVals):
        d = len(summed)
        c = cols
        r = 15
        numBlocks = 1
        #ipdb.set_trace()

        # expected result: the summed vector restricted to the last k
        # entries of expectedIndices, zero elsewhere
        expected = torch.zeros(len(summed), device=device)
        lastKIdx = expectedIndices[-k:].to(device)
        expected[lastKIdx] = summed[lastKIdx]

        assert (summed.size() == vecs[0].size())

        # one sketch per worker vector, built and filled in order
        workerSketches = []
        for workerIdx in range(4):
            workerSketch = CSVec(d=d, c=c, r=r,
                                 numBlocks=numBlocks, device=device)
            workerSketch += vecs[workerIdx]
            print("")
            workerSketches.append(workerSketch)
        w_0, w_1, w_2, w_3 = workerSketches

        workers_summed = w_0 + w_1 + w_2 + w_3
# init TopHCS stuff indexAcc_2 = np.zeros((len(hVals), len(kVals))) L1_2 = np.zeros((len(hVals), len(kVals))) L2_2 = np.zeros((len(hVals), len(kVals))) for k_i, k in enumerate(kVals): k = int(k) #stupid expected = torch.zeros(d, device=device) expectedIndices = torch.sort(summed**2)[1][-k:] expected[expectedIndices.to(device)] = summed[expectedIndices.to( device)] # CSVecs workers = [] for vec in vecs: w = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device) w += vec workers.append(w) # Summing CSVecs into 1st worker to save memory for w in workers[1:]: workers[0] += w recovered_1 = workers[0].unSketch(k) indexAcc_1[k_i] = (expected[expectedIndices] * recovered_1[expectedIndices]).nonzero().numel() / k diff = recovered_1[recovered_1 != 0] - expected[recovered_1 != 0] L1_1[k_i] = torch.median(torch.abs(diff)) L2_1[k_i] = torch.median(diff**2) for h_i, hVal in enumerate(hVals):
def __init__(self, opt, c, r, numWorkers, numBlocks=1,
             doTrueTopk=False, doLocalTopk=False, doRandomK=False):
    """SketchedSum constructor

    Args:
        opt: an instance of torch.optim.SGD whose momentum and
             weight decay we want to emulate
        c: number of columns in the sketch
        r: numbers of rows in the sketch
        numWorkers: how many workers to divide the gradient
                    computation among
        numBlocks: memory optimization for the sketch (higher means
                   less memory used, but randomness becomes correlated)
        doTrueTopk: instead of sketching, compute the true topk
                    of the sum of the workers' gradients
        doLocalTopk: instead of sketching, send and then sum the local
                     topk of each worker's v vector
        doRandomK: instead of sketching, send a random set of
                   k coordinates
    """
    self.opt = opt
    self.c = c
    self.r = r
    self.numWorkers = numWorkers

    # at most one of true topk, local topk, and random k allowed
    # (what can I say -- I don't believe in implicit casting?)
    numSelected = sum(1 if flag else 0
                      for flag in (doTrueTopk, doLocalTopk, doRandomK))
    assert numSelected <= 1
    self.doTrueTopk = doTrueTopk
    self.doLocalTopk = doLocalTopk
    self.doRandomK = doRandomK
    self.doSketching = not (doTrueTopk or doLocalTopk or doRandomK)

    # used for debugging
    self._doSlowSketching = False

    # self.modelDevice is not tested... not sure what happens if
    # the model is on the CPU
    if opt.param_groups[0]["params"][0].is_cuda:
        self.modelDevice = "cuda"
    else:
        self.modelDevice = "cpu"
    self.device = self.modelDevice
    print("device", self.device)

    D = 0
    sketchMask = []
    for group in opt.param_groups:
        for p in group["params"]:
            if p.requires_grad:
                # numel() is a plain int even for 0-dim parameters,
                # where np.prod(shape) would yield the float 1.0 and
                # break torch.ones / torch.zeros below
                size = p.numel()
                if p.do_sketching:
                    sketchMask.append(torch.ones(size))
                else:
                    sketchMask.append(torch.zeros(size))
                D += size
    self.D = D

    # a mask indicating which gradient elements we should sketch
    # and which we should send without compression (e.g. bias terms,
    # maybe early layers, etc.)
    self.sketchMask = torch.cat(sketchMask).byte().to(self.device)

    print("D: {}".format(D))
    print("sketchMask.sum(): {}".format(self.sketchMask.sum()))

    self.us = [torch.zeros(D, device=self.device)
               for _ in range(numWorkers)]
    self.vs = [torch.zeros(D, device=self.device)
               for _ in range(numWorkers)]

    # don't need sketches for true/local/random topk
    if self.doSketching:
        print("making sketches")
        # dimensionality of the sketch (d) is the number of gradient
        # elements that we're going to sketch, i.e. sketchMask.sum()
        sketchDim = self.sketchMask.sum().item()
        self.workerSketches = [CSVec(d=sketchDim, c=c, r=r,
                                     device=self.device,
                                     numBlocks=numBlocks)
                               for _ in range(numWorkers)]
    else:
        print("not making sketches")
sampler.sample_iid(comm)

# Instantiate Model
model = flnet_init()

# Set compressibility writer: one log directory per run configuration
# and start time
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
c_log_dir = "/".join([
    "logs",
    base_model,
    sample,
    fbk_status,
    "fetchsgd_k" + str(k) + "_r" + str(r) + "_c_" + str(c)
    + "_" + str(rank) + "_alpha" + str(alpha),
    current_time,
    "sparse_pt",
])
c_summary_writer = tf.summary.create_file_writer(c_log_dir)

# total number of trainable weights in the model
d = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])

# Send sketch class to clients: ranks 1..N each receive the buckets
# and signs of the same sketch
S = CSVec(d, c, r)
for n in range(1, N + 1):
    comm.send([S.buckets, S.signs], dest=n, tag=11)

# Residual sketch: reuses S's buckets and signs, starts from a zero table
S_e = CSVec(d, c, r, doInitialize=False)
S_e.buckets = S.buckets
S_e.signs = S.signs
S_e.table = torch.zeros((r, c))

for epoch in range(num_epoch):
    if sample == "non_iid2":
        sampler.sample_noniid(2, comm)

    # dummy dataset to get number of steps
    dset = tf.data.Dataset.from_tensor_slices(
        (sampler.x_train_local, sampler.y_train_local))
def __init__(self, d, c, r, h, numBlocks, device='cpu'):
    """Create an empty worker with a zero topH tensor and a CSVec.

    Args:
        d: length of the dense ``topH`` tensor and the CSVec's ``d``
        c: number of columns passed to the CSVec
        r: number of rows passed to the CSVec
        h: stored on the instance as ``self.h``
        numBlocks: forwarded to the CSVec constructor
        device: device for ``topH`` and the CSVec
    """
    self.h = h
    self.d = d
    self.device = device

    # dense float tensor of length d, initially all zeros
    self.topH = torch.zeros(d, dtype=torch.float, device=self.device)

    sketchKwargs = dict(d=d, c=c, r=r,
                        numBlocks=numBlocks, device=self.device)
    self.csvec = CSVec(**sketchKwargs)
hVals = [d] cVals = [180000] for c, cols in enumerate(cVals): csvecAcc = np.zeros(len(kVals)) topHCSAcc = np.zeros((len(hVals), len(kVals))) csvecL2 = np.zeros(len(kVals)) topHCSL2 = np.zeros((len(hVals), len(kVals))) for k_i, k in enumerate(kVals): d, c, r, numBlocks = len(summed), cols, 15, 30 #ipdb.set_trace() expected = torch.zeros(len(summed), device=device) expected[expectedIndices[-k:].to(device)] = summed[ expectedIndices[-k:].to(device)] assert (summed.size() == vecs[0].size()) w_0 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device) w_0 += vecs[0] print("worker added") result = w_0.unSketch(k) csvecAcc[k_i] = (expected[expectedIndices] * result[expectedIndices]).nonzero().numel() / k print("k = {}".format(k)) csvecL2[k_i] = (torch.sum((result - expected)**2))**0.5 for h_i, h in enumerate(hVals): result = torch.zeros(len(summed), device=device) w_0 = TopHCS(d=d, c=c, r=r, h=h, numBlocks=numBlocks,
def forward_grad(model, batch, compute_loss, args, compute_grad=True):
    """Run the forward (and optionally backward) pass over a batch.

    The batch is processed in microbatches of at most
    ``args.microbatch_size`` items (the whole batch when that value is
    not positive). Loss and metrics are averaged over the batch,
    weighted by how many data points each microbatch held.

    Args:
        model: passed to ``compute_loss`` and ``get_grad``; its
            ``parameters()`` are clipped when clipping is enabled
        batch: sequence of objects that are sliced along their first
            axis; ``batch[0].size()[0]`` is taken as the batch size
        compute_loss: callable ``(model, microbatch, args)`` returning
            ``(loss, *metrics)``, each exposing ``.item()``
        args: namespace of settings; this function reads
            ``microbatch_size`` and, when gradients are computed,
            ``max_grad_norm``, ``mode``, ``do_dp`` and the
            mode-specific fields used below
        compute_grad: when False only the loss/metrics are computed

    Returns:
        ``[average_loss, *average_metrics]`` when ``compute_grad`` is
        False, otherwise ``(g, results)`` where ``g`` is the sketch
        table in "sketch" mode and the gradient in every other mode.

    Raises:
        ValueError: if gradients are requested and ``args.mode`` is
            not one of the recognised modes
    """
    batch_size = batch[0].size()[0]

    # divide up batch (for gradient accumulation when memory constrained)
    if args.microbatch_size > 0:
        microbatch_size = min(batch_size, args.microbatch_size)
    else:
        microbatch_size = batch_size

    # accumulators for the loss & metric values
    accum_loss = 0
    accum_metrics = None

    num_iters = math.ceil(batch_size / microbatch_size)
    for shard_idx in range(num_iters):
        # extract current microbatch
        start = shard_idx * microbatch_size
        end = start + microbatch_size
        microbatch = [t[start:end] for t in batch]

        # forward pass
        loss, *metrics = compute_loss(model, microbatch, args)

        # if first time through, we find out how many metrics there are
        if accum_metrics is None:
            accum_metrics = [0 for _ in metrics]

        # accumulate loss & metrics, weighted by how many data points
        # were actually used (the last microbatch may be smaller)
        num_points = microbatch[0].size()[0]
        accum_loss += loss.item() * num_points
        for metric_idx, m in enumerate(metrics):
            accum_metrics[metric_idx] += m.item() * num_points

        # backward pass
        if compute_grad:
            loss.backward()

    # gradient clipping; sketch mode clips the sketch further down
    # instead. The threshold is scaled by the number of microbatches.
    if (compute_grad and args.max_grad_norm is not None
            and args.mode != "sketch"):
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       args.max_grad_norm * num_iters)

    # "average" here is over the data in the batch
    average_loss = accum_loss / batch_size
    average_metrics = [m / batch_size for m in accum_metrics]
    results = [average_loss] + average_metrics

    if not compute_grad:
        return results

    grad = get_grad(model, args)
    if args.do_dp:
        grad = clip_grad(args.l2_norm_clip, grad)
        if args.dp_mode == "worker":
            noise = torch.normal(mean=0, std=args.noise_multiplier,
                                 size=grad.size()).to(args.device)
            noise *= np.sqrt(args.num_workers)
            grad += noise

    # compress the gradient if needed
    if args.mode == "sketch":
        sketch = CSVec(d=args.grad_size, c=args.num_cols,
                       r=args.num_rows, device=args.device,
                       numBlocks=args.num_blocks)
        sketch.accumulateVec(grad)
        # gradient clipping
        if args.max_grad_norm is not None:
            sketch = clip_grad(args.max_grad_norm, sketch)
        g = sketch.table
    elif args.mode in ("true_topk", "local_topk", "fedavg",
                       "uncompressed"):
        # true_topk / uncompressed: the gradient is sent as is.
        # local_topk: ideally we'd return the compressed version of
        # the gradient, i.e. _topk(grad, k=args.k). However, for
        # sketching we do momentum in the sketch, whereas for topk we
        # do momentum before taking topk, so we have to return an
        # inconsistent quantity here.
        # fedavg: logic for doing fedavg happens in process_batch.
        g = grad
    else:
        # previously this fell through to an UnboundLocalError on g
        raise ValueError("Unknown mode {}".format(args.mode))

    return g, results
def init_sketch(self):
    """Create a CSVec sized from the model configuration.

    The sketch dimension is ``self.sketch_dim``; the column, row and
    block counts are read from ``ConfigurationManager().model_config``
    (``col_num``, ``row_num`` and ``block_num``).
    """
    model_config = ConfigurationManager().model_config
    sketch_kwargs = dict(
        d=self.sketch_dim,
        c=model_config.col_num,
        r=model_config.row_num,
        numBlocks=model_config.block_num,
    )
    return CSVec(**sketch_kwargs)