Example #1
        def testL2(self):
            d = 5
            c = 10000
            r = 20

            vec = torch.randn(d).to(self.device)
            a = CSVec(d, c, r, **self.csvecArgs)
            a += vec

            tol = 0.0001
            self.assertTrue((a.l2estimate() - vec.norm()).abs() < tol)
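
For reference, the quantity checked above can be computed from the table alone: each row's sum of squared counters is an estimate of ||vec||^2, so the median over rows gives a robust norm estimate. A minimal sketch of that idea (an illustration, not necessarily CSVec's exact l2estimate implementation):

import torch

def l2_estimate(table: torch.Tensor) -> torch.Tensor:
    # table has shape (r, c); each row's sum of squared counters estimates ||vec||^2
    row_estimates = torch.sum(table ** 2, dim=1)
    # take the median over rows for robustness, then the square root
    return torch.median(row_estimates).sqrt()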
Example #2
        def testMedian(self):
            d = 5
            c = 10000
            r = 20

            csvecs = [CSVec(d, c, r, **self.csvecArgs) for _ in range(3)]
            for i, csvec in enumerate(csvecs):
                vec = torch.arange(d).float().to(self.device) + i
                csvec.accumulateVec(vec)
            median = CSVec.median(csvecs)
            recovered = median.unSketch(k=d)
            trueMedian = torch.arange(d).float().to(self.device) + 1
            self.assertTrue(torch.allclose(recovered, trueMedian))
Example #3
        def testZeroSketch(self):
            d = 100
            c = 20
            r = 5
            a = CSVec(d, c, r, **self.csvecArgs)
            vec = torch.rand(d).to(self.device)
            a += vec

            zeros = torch.zeros((r, c)).to(self.device)
            self.assertFalse(torch.allclose(a.table, zeros))

            a.zero()
            self.assertTrue(torch.allclose(a.table, zeros))
Example #4
        def testRandomness(self):
            # make sure two sketches get the same hashes and signs
            d = 100
            c = 20
            r = 5
            a = CSVec(d, c, r, **self.csvecArgs)
            b = CSVec(d, c, r, **self.csvecArgs)
            self.assertTrue(torch.allclose(a.signs, b.signs))
            self.assertTrue(torch.allclose(a.buckets, b.buckets))

            if self.numBlocks > 1:
                self.assertTrue(torch.allclose(a.blockOffsets, b.blockOffsets))
                self.assertTrue(torch.allclose(a.blockSigns, b.blockSigns))
Example #5
        def testSketchSum(self):
            d = 5
            c = 10000
            r = 20

            summed = CSVec(d, c, r, **self.csvecArgs)
            for i in range(d):
                vec = torch.zeros(d).to(self.device)
                vec[i] = 1
                sketch = CSVec(d, c, r, **self.csvecArgs)
                sketch += vec
                summed += sketch

            recovered = summed.unSketch(k=d)
            trueSum = torch.ones(d).to(self.device)
            self.assertTrue(torch.allclose(recovered, trueSum))
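
The property exercised here is linearity: every update to the table is additive, so the sketch of a sum equals the sum of the sketches. A hedged companion test in the same style (testLinearity is a hypothetical name; it relies only on the + and += operators already used above):

        def testLinearity(self):
            d = 5
            c = 10000
            r = 20

            x = torch.randn(d).to(self.device)
            y = torch.randn(d).to(self.device)

            sx = CSVec(d, c, r, **self.csvecArgs)
            sx += x
            sy = CSVec(d, c, r, **self.csvecArgs)
            sy += y
            sxy = CSVec(d, c, r, **self.csvecArgs)
            sxy += x + y

            # tables add coordinate-wise, so the two sketches should agree
            self.assertTrue(torch.allclose((sx + sy).table, sxy.table))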
Example #6
 def testInit(self):
     # make sure the table starts out zeroed
     d = 100
     c = 20
     r = 5
     a = CSVec(d, c, r, **self.csvecArgs)
     zeros = torch.zeros(r, c).to(self.device)
     self.assertTrue(torch.allclose(a.table, zeros))
Example #7
        def testSketchVec(self):
            # sketch a vector with all zeros except a single 1
            # then the table should be zeros everywhere except a single
            # 1 in each row
            d = 100
            c = 1
            r = 5
            a = CSVec(d=d, c=c, r=r, **self.csvecArgs)
            vec = torch.zeros(d).to(self.device)
            vec[0] = 1
            a.accumulateVec(vec)
            # make sure the sketch only has one nonzero entry per row
            for i in range(r):
                with self.subTest(row=i):
                    self.assertEqual(a.table[i, :].nonzero().numel(), 1)

            # make sure each row sums to +-1
            summed = a.table.abs().sum(dim=1).view(-1)
            ones = torch.ones(r).to(self.device)
            self.assertTrue(torch.allclose(summed, ones))
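
The structure this test checks falls out of the count-sketch update rule: in every row, each coordinate contributes sign * value to exactly one bucket. A toy illustration of that rule (the hash and sign tensors below are stand-ins, not CSVec's internal layout):

import torch

d, c, r = 100, 1, 5
torch.manual_seed(0)
buckets = torch.randint(0, c, (r, d))        # assumed per-row bucket hashes
signs = torch.randint(0, 2, (r, d)) * 2 - 1  # assumed per-row +-1 signs

vec = torch.zeros(d)
vec[0] = 1
table = torch.zeros(r, c)
for row in range(r):
    # coordinate i adds signs[row, i] * vec[i] into bucket buckets[row, i]
    table[row].index_add_(0, buckets[row], signs[row] * vec)

# sketching a one-hot vector therefore leaves a single +-1 entry in each row
assert all(table[row].nonzero().numel() == 1 for row in range(r))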
Example #8
 def testSameBuckets(self):
     d = 100
     c = 20
     r = 5
     h = 0
     a = CSVec(d, c, r, **self.csvecArgs)
     vec = torch.randn(d).to(self.device)
     a += vec
     b = TopHCS(h=h, d=d, c=c, r=r, **self.csvecArgs)
     b.store(vec)
     self.assertTrue(torch.allclose(a.table, b.csvec.table))
Example #9
        def testUnsketch(self):
            # make sure heavy hitter recovery works correctly

            # use a gigantic sketch so there's no chance of collision
            d = 5
            c = 10000
            r = 20
            a = CSVec(d, c, r, **self.csvecArgs)
            vec = torch.rand(d).to(self.device)

            a += vec

            with self.subTest(method="topk"):
                recovered = a.unSketch(k=d)
                self.assertTrue(torch.allclose(recovered, vec))

            with self.subTest(method="epsilon"):
                thr = vec.abs().min() * 0.9
                recovered = a.unSketch(epsilon=thr / vec.norm())
                self.assertTrue(torch.allclose(recovered, vec))
Example #10
class TopHCS(object):
	""" Represents one worker"""
	def __init__(self, d, c, r, h, numBlocks, device='cpu'): 
		self.h, self.d = h, d 
		self.device = device
		self.topH = torch.zeros(d, dtype=torch.float, device=self.device)
		self.csvec = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=self.device)
	
	def zero(self):
		""" Clear csvec and topH tensor """
		self.csvec.zero()
		self.topH = torch.zeros(self.d, dtype=torch.float, device=self.device)
        
	# formerly store(...)
	def accumulateVec(self, vec):
		"""Compress a vector.

		Save the top-h elements in self.topH and sketch the remaining
		d-h elements. csvec and topH should be zero before storing.
		"""
		# assert(self.topH.nonzero().numel() == 0)
		# changed this for the comm-efficient optimizer
		self.topH = topk(vec, self.h).to(self.device)
		self.csvec.accumulateVec((vec - self.topH).to(self.device))
        
	def accumulateTable(self, table):
		if table.size() != self.csvec.table.size():
			msg = "Passed in table has size {}, expecting {}"
			raise ValueError(msg.format(table.size(), self.csvec.table.size()))
		else:
			self.csvec.accumulateTable(table)

	@classmethod
	def topKSum(cls, workers, k, unSketchNum=0):
		assert isinstance(workers, list), "workers must be a list"
		sketchSum = copy.deepcopy(workers[0].csvec) 
		sketchSum.zero()
		topHSum = torch.zeros_like(workers[0].topH)
		for w in workers:
			sketchSum.accumulateTable(w.csvec.table)
			topHSum += w.topH
		d = len(topHSum)
		unSketchNum = d if (unSketchNum == 0) else unSketchNum
		unSketchedSum = sketchSum.unSketch(k=unSketchNum) 
		if topHSum.size() != unSketchedSum.size():
			msg = "topHSum has size {}, unSketchedSum size {}"
			raise ValueError(msg.format(topHSum.size(), unSketchedSum.size()))
		ret = topk(topHSum + unSketchedSum, k)
		return ret
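
A hedged usage sketch of the class above: several workers each compress one vector, and topKSum combines them. It assumes the topk helper referenced in the class keeps only the h largest-magnitude entries of a vector and zeroes the rest:

import torch

d, c, r, h, k = 1000, 500, 5, 10, 10
workers = [TopHCS(d=d, c=c, r=r, h=h, numBlocks=1) for _ in range(4)]
for w in workers:
    w.accumulateVec(torch.randn(d))

# sums the per-worker topH vectors and sketch tables, then takes the
# top-k of (topHSum + unsketched sum) as the combined estimate
approx = TopHCS.topKSum(workers, k=k)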
Example #11
    def __init__(self, opt, c, r, numWorkers, numBlocks=1, method="sketch"):
        """SketchedSum constructor

        Args:
            opt: an instance of torch.optim.SGD whose momentum and weight
                 decay we want to emulate
            c: number of columns in the sketch
            r: numbers of rows in the sketch
            numWorkers: how many workers to divide the gradient
                        computation among
            numBlocks: memory optimization for the sketch (higher means
                       less memory used, but randomness becomes correlated)
            method: which communication protocol to use. Options:
                sketch: send a sketch of the v vectors
                trueTopk: send the whole v vector and use the topk of
                          the sum of vs over workers as the weight update
                localTopk: send and then sum the local topk of each
                           worker's v vector
                randomK: send a random set of k coordinates
                signum: signSGD with majority vote
                Pkk: send local top-Pk, of which topk is used as the
                     weight update
        """
        self.opt = opt
        self.c = c
        self.r = r
        self.numWorkers = numWorkers
        # validate the requested communication protocol
        methods = [
            "sketch", "trueTopk", "localTopk", "randomK", "signum", "Pkk"
        ]
        if method not in methods:
            msg = "Invalid method {}. Valid options are {}"
            raise ValueError(msg.format(method, ",".join(methods)))
        self.method = method

        # used for debugging
        self._doSlowSketching = True

        # self.modelDevice is not tested... not sure what happens if
        # the model is on the CPU
        if opt.param_groups[0]["params"][0].is_cuda:
            self.modelDevice = "cuda"
        else:
            self.modelDevice = "cpu"
        self.device = self.modelDevice
        print("device", self.device)

        D = 0
        sketchMask = []
        for group in opt.param_groups:
            for p in group["params"]:
                if p.requires_grad:
                    size = np.prod(p.data.shape)
                    if p.do_sketching:
                        sketchMask.append(torch.ones(size))
                    else:
                        sketchMask.append(torch.zeros(size))
                    D += size
        self.D = D
        # a mask indicating which gradient elements we should sketch
        # and which we should send without compression (e.g. bias terms,
        # maybe early layers, etc.)
        self.sketchMask = torch.cat(sketchMask).bool().to(self.device)

        print("D: {}".format(D))
        print("sketchMask.sum(): {}".format(self.sketchMask.sum()))

        self.us = [
            torch.zeros(D, device=self.device) for _ in range(numWorkers)
        ]
        self.vs = [
            torch.zeros(D, device=self.device) for _ in range(numWorkers)
        ]

        # don't need sketches for true/local/random topk
        if self.method == "sketch":
            print("making sketches")
            # dimensionality of the sketch (d) is the number of gradient
            # elements that we're going to sketch, i.e. sketchMask.sum()
            self.workerSketches = [
                CSVec(d=self.sketchMask.sum().item(),
                      c=c,
                      r=r,
                      device=self.device,
                      numBlocks=numBlocks) for _ in range(numWorkers)
            ]
        else:
            print("not making sketches")
Example #12
def args2sketch(args):
    return CSVec(d=args.grad_size,
                 c=args.num_cols,
                 r=args.num_rows,
                 device=args.device,
                 numBlocks=args.num_blocks)
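
A hedged usage example, with args standing in for the argparse namespace the function expects (the field values here are arbitrary):

import torch
from types import SimpleNamespace

args = SimpleNamespace(grad_size=100000, num_cols=50000, num_rows=5,
                       device="cpu", num_blocks=1)
sketch = args2sketch(args)
sketch.accumulateVec(torch.randn(args.grad_size, device=args.device))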
Example #13
    kVals = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    hVals = [0, 1.0]
    cVals = [50, 100, 1000, 10000]
    cVals = [20]
    for p, cols in enumerate(cVals):
        csvec_accuracy = np.zeros(len(kVals))
        topHCS_accuracy = np.zeros((len(hVals), len(kVals)))
        for k_i, k in enumerate(kVals):
            d, c, r, numBlocks = len(summed), cols, 15, 1
            #ipdb.set_trace()
            expected = torch.zeros(len(summed), device=device)
            expected[expectedIndices[-k:].to(device)] = summed[
                expectedIndices[-k:].to(device)]

            assert (summed.size() == vecs[0].size())
            w_0 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w_0 += vecs[0]
            print("")
            w_1 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w_1 += vecs[1]

            print("")
            w_2 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w_2 += vecs[2]

            print("")
            w_3 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w_3 += vecs[3]

            print("")
            workers_summed = w_0 + w_1 + w_2 + w_3
Example #14
    # init TopHCS stuff
    indexAcc_2 = np.zeros((len(hVals), len(kVals)))
    L1_2 = np.zeros((len(hVals), len(kVals)))
    L2_2 = np.zeros((len(hVals), len(kVals)))

    for k_i, k in enumerate(kVals):
        k = int(k)  # ensure k is a plain Python int
        expected = torch.zeros(d, device=device)
        expectedIndices = torch.sort(summed**2)[1][-k:]
        expected[expectedIndices.to(device)] = summed[expectedIndices.to(
            device)]

        # CSVecs
        workers = []
        for vec in vecs:
            w = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w += vec
            workers.append(w)

        # Summing CSVecs into 1st worker to save memory
        for w in workers[1:]:
            workers[0] += w
        recovered_1 = workers[0].unSketch(k)
        indexAcc_1[k_i] = (expected[expectedIndices] *
                           recovered_1[expectedIndices]).nonzero().numel() / k

        diff = recovered_1[recovered_1 != 0] - expected[recovered_1 != 0]
        L1_1[k_i] = torch.median(torch.abs(diff))
        L2_1[k_i] = torch.median(diff**2)

        for h_i, hVal in enumerate(hVals):
Example #15
    def __init__(self,
                 opt,
                 c,
                 r,
                 numWorkers,
                 numBlocks=1,
                 doTrueTopk=False,
                 doLocalTopk=False,
                 doRandomK=False):
        """SketchedSum constructor

        Args:
            opt: an instance of torch.optim.SGD whose momentum and weight
                 decay we want to emulate
            c: number of columns in the sketch
            r: numbers of rows in the sketch
            numWorkers: how many workers to divide the gradient
                        computation among
            numBlocks: memory optimization for the sketch (higher means
                       less memory used, but randomness becomes correlated)
            doTrueTopk: instead of sketching, compute the true topk
                        of the sum of the workers' gradients
            doLocalTopk: instead of sketching, send and then sum the local
                         topk of each worker's v vector
            doRandomK: instead of sketching, send a random set of
                       k coordinates
        """
        self.opt = opt
        self.c = c
        self.r = r
        self.numWorkers = numWorkers
        # at most one of true topk, local topk, and random k allowed
        # (what can I say -- I don't believe in implicit casting?)
        assert (((1 if doTrueTopk else 0) + (1 if doLocalTopk else 0) +
                 (1 if doRandomK else 0)) <= 1)
        self.doTrueTopk = doTrueTopk
        self.doLocalTopk = doLocalTopk
        self.doRandomK = doRandomK
        self.doSketching = not (doTrueTopk or doLocalTopk or doRandomK)

        # used for debugging
        self._doSlowSketching = False

        # self.modelDevice is not tested... not sure what happens if
        # the model is on the CPU
        if opt.param_groups[0]["params"][0].is_cuda:
            self.modelDevice = "cuda"
        else:
            self.modelDevice = "cpu"
        self.device = self.modelDevice
        print("device", self.device)

        D = 0
        sketchMask = []
        for group in opt.param_groups:
            for p in group["params"]:
                if p.requires_grad:
                    size = np.prod(p.data.shape)
                    if p.do_sketching:
                        sketchMask.append(torch.ones(size))
                    else:
                        sketchMask.append(torch.zeros(size))
                    D += size
        self.D = D
        # a mask indicating which gradient elements we should sketch
        # and which we should send without compression (e.g. bias terms,
        # maybe early layers, etc.)
        self.sketchMask = torch.cat(sketchMask).byte().to(self.device)

        print("D: {}".format(D))
        print("sketchMask.sum(): {}".format(self.sketchMask.sum()))

        self.us = [
            torch.zeros(D, device=self.device) for _ in range(numWorkers)
        ]
        self.vs = [
            torch.zeros(D, device=self.device) for _ in range(numWorkers)
        ]

        # don't need sketches for true/local/random topk
        if self.doSketching:
            print("making sketches")
            # dimensionality of the sketch (d) is the number of gradient
            # elements that we're going to sketch, i.e. sketchMask.sum()
            self.workerSketches = [
                CSVec(d=self.sketchMask.sum().item(),
                      c=c,
                      r=r,
                      device=self.device,
                      numBlocks=numBlocks) for _ in range(numWorkers)
            ]
        else:
            print("not making sketches")
Example #16
        sampler.sample_iid(comm)

    # Instantiate Model
    model = flnet_init()

    # Set compressibility writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    c_log_dir = 'logs/' + base_model + '/' + sample + '/' + fbk_status + '/fetchsgd_k' + str(
        k) + "_r" + str(r) + "_c_" + str(c) + '_' + str(rank) + '_alpha' + str(
            alpha) + '/' + current_time + '/sparse_pt'
    c_summary_writer = tf.summary.create_file_writer(c_log_dir)

    d = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])

    # Send sketch class to clients
    S = CSVec(d, c, r)
    for n in range(1, N + 1):
        comm.send([S.buckets, S.signs], dest=n, tag=11)

    # Residual sketch
    S_e = CSVec(d, c, r, doInitialize=False)
    S_e.buckets, S_e.signs = S.buckets, S.signs
    S_e.table = torch.zeros((r, c))

    for epoch in range(num_epoch):
        if sample == "non_iid2":
            sampler.sample_noniid(2, comm)

        # dummy dataset to get number of steps
        dset = tf.data.Dataset.from_tensor_slices(
            (sampler.x_train_local, sampler.y_train_local))
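
For context, a hedged sketch of the matching client side: each client receives the broadcast buckets and signs and builds a compatible sketch, so its table can later be summed with the server's residual sketch (the recv call mirrors the send above; everything else is an assumption):

# on client rank n
buckets, signs = comm.recv(source=0, tag=11)
S_client = CSVec(d, c, r, doInitialize=False)
S_client.buckets, S_client.signs = buckets, signs
S_client.table = torch.zeros((r, c))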
Example #17
	def __init__(self, d, c, r, h, numBlocks, device='cpu'): 
		self.h, self.d = h, d 
		self.device = device
		self.topH = torch.zeros(d, dtype=torch.float, device=self.device)
		self.csvec = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=self.device)
Example #18
    hVals = [d]
    cVals = [180000]
    for c, cols in enumerate(cVals):
        csvecAcc = np.zeros(len(kVals))
        topHCSAcc = np.zeros((len(hVals), len(kVals)))
        csvecL2 = np.zeros(len(kVals))
        topHCSL2 = np.zeros((len(hVals), len(kVals)))
        for k_i, k in enumerate(kVals):
            d, c, r, numBlocks = len(summed), cols, 15, 30
            #ipdb.set_trace()
            expected = torch.zeros(len(summed), device=device)
            expected[expectedIndices[-k:].to(device)] = summed[
                expectedIndices[-k:].to(device)]

            assert (summed.size() == vecs[0].size())
            w_0 = CSVec(d=d, c=c, r=r, numBlocks=numBlocks, device=device)
            w_0 += vecs[0]
            print("worker added")

            result = w_0.unSketch(k)
            csvecAcc[k_i] = (expected[expectedIndices] *
                             result[expectedIndices]).nonzero().numel() / k
            print("k = {}".format(k))
            csvecL2[k_i] = (torch.sum((result - expected)**2))**0.5
            for h_i, h in enumerate(hVals):
                result = torch.zeros(len(summed), device=device)
                w_0 = TopHCS(d=d,
                             c=c,
                             r=r,
                             h=h,
                             numBlocks=numBlocks,
Example #19
def forward_grad(model, batch, compute_loss, args, compute_grad=True):
    device = args.device

    # divide up batch (for gradient accumulation when memory constrained)
    #num_shards = args.num_train_batch_shards
    # need the max(1, ...) since the last batch in an epoch might be small
    #microbatch_size = max(1, batch[0].size()[0] // num_shards)
    if args.microbatch_size > 0:
        microbatch_size = min(batch[0].size()[0], args.microbatch_size)
    else:
        microbatch_size = batch[0].size()[0]

    # accumulators for the loss & metric values
    accum_loss = 0
    accum_metrics = None

    num_iters = math.ceil(batch[0].size()[0] / microbatch_size)
    for i in range(num_iters):
        # extract current microbatch
        start = i * microbatch_size
        end = (i+1) * microbatch_size
        microbatch = [t[start:end] for t in batch]

        # forward pass
        loss, *metrics = compute_loss(model, microbatch, args)

        # if first time through, we find out how many metrics there are
        if accum_metrics is None:
            accum_metrics = [0 for _ in metrics]

        # accumulate loss & metrics, weighted by how many data points
        # were actually used
        accum_loss += loss.item() * microbatch[0].size()[0]
        for j, m in enumerate(metrics):
            accum_metrics[j] += m.item() * microbatch[0].size()[0]

        # backward pass
        if compute_grad:
            loss.backward()

    # gradient clipping
    if compute_grad and args.max_grad_norm is not None and args.mode not in ["sketch"]:
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       args.max_grad_norm * num_iters)

    # "average" here is over the data in the batch
    average_loss = accum_loss / batch[0].size()[0]
    average_metrics = [m / batch[0].size()[0] for m in accum_metrics]

    results = [average_loss] + average_metrics

    if not compute_grad:
        return results

    grad = get_grad(model, args)
    if args.do_dp:
        grad = clip_grad(args.l2_norm_clip, grad)
        if args.dp_mode == "worker":
            noise = torch.normal(mean=0, std=args.noise_multiplier, size=grad.size()).to(args.device)
            noise *= np.sqrt(args.num_workers)
            grad += noise

    # compress the gradient if needed
    if args.mode == "sketch":
        sketch = CSVec(d=args.grad_size, c=args.num_cols,
            r=args.num_rows, device=args.device,
            numBlocks=args.num_blocks)
        sketch.accumulateVec(grad)
        # gradient clipping
        if compute_grad and args.max_grad_norm is not None:
            sketch = clip_grad(args.max_grad_norm, sketch)
        g = sketch.table
    elif args.mode == "true_topk":
        g = grad
    elif args.mode == "local_topk":
        # ideally we'd return the compressed version of the gradient,
        # i.e. _topk(grad, k=args.k). However, for sketching we do momentum
        # in the sketch, whereas for topk we do momentum before taking topk
        # so we have to return an inconsistent quantity here
        g = grad
    elif args.mode == "fedavg":
        # logic for doing fedavg happens in process_batch
        g = grad
    elif args.mode == "uncompressed":
        g = grad

    return g, results
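
In the "sketch" branch the function returns the raw table, so the aggregating side has to rebuild a sketch with the same hashes before it can recover gradient coordinates. A hedged sketch of that decompression step (summing tables and then unsketching is an assumption consistent with the tests above, not a quote of the training loop):

def decompress_sketched(tables, args, k):
    # same (grad_size, num_cols, num_rows, num_blocks) -> same hashes and signs
    agg = args2sketch(args)
    for table in tables:
        agg.accumulateTable(table)
    # approximate top-k of the summed sketched gradients
    return agg.unSketch(k=k)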
Example #20
 def init_sketch(self):
     mdl_cfg = ConfigurationManager().model_config
     return CSVec(d=self.sketch_dim, c=mdl_cfg.col_num, r=mdl_cfg.row_num, numBlocks=mdl_cfg.block_num)