Example No. 1
    def _test_all_reduce_helper(
        self,
        group,
        group_id,
        rank,
        op,
        master_value,
        worker_value,
        expected_value,
        cuda=False,
        rank_to_GPU=None,
    ):
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.all_reduce(tensor, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.all_reduce(tensor, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))

        self._barrier()
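
The test helper above relies on a _build_tensor utility that the snippet does not show; a minimal sketch of what it presumably looks like (the exact signature in the original test suite may differ) is:

import torch

def _build_tensor(size, value=None):
    # Build a size x size x size tensor filled with `value` (defaulting to
    # `size` itself), matching how the test compares against
    # _build_tensor(src + 1, expected_value).
    if value is None:
        value = size
    return torch.empty(size, size, size).fill_(value)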
Example No. 2
        def _process_batch():
            dev_grad_batch, dev_events, job_event = queue.get()
            dev_coalesced = []
            # Coalesce the tensors on all devices and start a local reduction
            for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
                with torch.cuda.device(dev_id), torch.cuda.stream(stream):
                    stream.wait_event(event)
                    coalesced = _flatten_dense_tensors(grad_batch)
                    dev_coalesced.append(coalesced)
            # Wait for all copies to complete before starting the NCCL kernel
            for stream in reduction_streams:
                stream.synchronize()
            nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

            # From now on we're only going to work on the first device (from device_ids)
            grad_batch = dev_grad_batch[0]
            coalesced = dev_coalesced[0]
            reduce_stream = reduction_streams[0]
            with torch.cuda.stream(reduce_stream):
                reduce_stream.wait_stream(nccl_streams[0])
                coalesced /= dist.get_world_size()
                dist.all_reduce(coalesced, group=group_id)
                for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
                    grad.copy_(reduced)
            job_event.set()
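
The coalescing above uses the flatten/unflatten helpers from torch._utils; a standalone sketch of that round trip, stripped of the queue/stream machinery, looks like this (toy tensors, single process):

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

# A toy bucket of gradients with different shapes.
grads = [torch.randn(3, 3), torch.randn(5)]

# Flatten into one contiguous buffer, reduce it with a single call,
# then copy the result back into the original tensors.
coalesced = _flatten_dense_tensors(grads)
coalesced /= 2.0  # stand-in for the all_reduce + averaging step
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)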
Example No. 3
    def sync(self, iter_id):
        if iter_id % self.sync_frequency == 0:
            for p in self.model.parameters():
                dist.all_reduce(p.data,
                                op=dist.reduce_op.SUM,
                                group=self.group)
                p.data /= self.world_size

            if iter_id % self.evaluate_frequency == 0:
                # currently, for simplicity, the full model is transmitted, but it could be compressed later
                self.update_frozen_lengths()
                self.last_model_copy = copy.deepcopy(self.model).cuda()

                # update the round id and unfreeze the corresponding parameters
                self.round_id += 1
                self.synchronization_mask = torch.where(
                    self.defrozen_round_ids == self.round_id,
                    torch.tensor(1).cuda().byte(),
                    torch.tensor(0).cuda().byte())
                stable_ratio = 1 - float(sum(
                    self.synchronization_mask.int())) / self.model_size

                # adjust the synchronization frequency when necessary
                # print(self.synchronization_mask)
                print('at iteration:', iter_id, '; round id:', self.round_id,
                      '; stable ratio:', stable_ratio)
                return True
        return False
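
The mask update above activates exactly the parameters whose unfreezing round has arrived; a toy, self-contained illustration of the same torch.where pattern (the values are made up for the example):

import torch

defrozen_round_ids = torch.tensor([1, 3, 2, 3, 1])  # hypothetical per-parameter unfreeze rounds
round_id = 3

# 1 where a parameter becomes active again this round, 0 elsewhere.
synchronization_mask = torch.where(
    defrozen_round_ids == round_id,
    torch.tensor(1).byte(),
    torch.tensor(0).byte())
stable_ratio = 1 - float(synchronization_mask.int().sum()) / defrozen_round_ids.numel()
# synchronization_mask -> tensor([0, 1, 0, 1, 0], dtype=torch.uint8); stable_ratio -> 0.6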
Example No. 4
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that every process
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k, v in loss_dict.items():
            loss_names.append(k)
            all_losses.append(v)
        all_losses = torch.stack(all_losses, dim=0)
        # dist.reduce(all_losses, dst=0)
        dist.all_reduce(all_losses)
        # all_reduce (unlike the commented-out dist.reduce above) accumulates the
        # summed losses on every rank, so every rank divides by world_size
        all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
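
A typical way to call a helper like this is to reduce the per-iteration losses for logging on the main process only; the loss names, optimizer, and logging below are illustrative placeholders, not code from the original repository:

loss_dict = {"loss_cls": loss_cls, "loss_reg": loss_reg}  # placeholder losses
losses = sum(loss for loss in loss_dict.values())
optimizer.zero_grad()
losses.backward()
optimizer.step()

# reduce losses over all processes for logging purposes only
loss_dict_reduced = reduce_loss_dict(loss_dict)
if torch.distributed.get_rank() == 0:
    print({k: v.item() for k, v in loss_dict_reduced.items()})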
Example No. 5
    def after_sync(self, model, iter_id):
        if iter_id % self.sync_frequency == 0:
            for p in model.parameters():
                dist.all_reduce(p.data,
                                op=dist.reduce_op.SUM,
                                group=self.group)
                p.data /= self.world_size
Example No. 6
    def sync(self, iter_id):
        if self.phase > 0:
            for p in self.model.parameters():
                dist.all_reduce(p.data, op=dist.reduce_op.SUM, group=self.group)
                p.data /= self.world_size
        elif iter_id % self.sync_frequency == 0:
            for p in self.model.parameters():
                dist.all_reduce(p.data, op=dist.reduce_op.SUM, group=self.group)
                p.data /= self.world_size
            self.update_synchronization_mask()
            self.last_model_copy = copy.deepcopy(self.model)
Example No. 7
def all_reduce(tensor, group=None, op=SUM_OP):
    if group is None:
        group = get_default_group()
    if _use_c10d[0]:
        return dist_c10d.all_reduce(tensor, op=op['c10d'], group=group)
    else:
        return dist_no_c10d.all_reduce(tensor, op=op['no_c10d'], group=group)
Example No. 8
def all_reduce(tensor, group=None):
    if group is None:
        group = get_default_group()
    if _use_c10d[0]:
        return dist_c10d.all_reduce(tensor, group=group)
    else:
        return dist_no_c10d.all_reduce(tensor, group=group)
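
Both wrappers above assume a process group has already been initialized and that a _use_c10d flag plus the two backend modules exist in the enclosing file; a minimal, hedged way to exercise either wrapper is to average a tensor across workers:

import torch
import torch.distributed as dist

# assumes dist.init_process_group(...) was already called by the launcher
t = torch.full((4,), float(dist.get_rank()))
all_reduce(t)                 # wrapper from Example No. 7 / No. 8
t /= dist.get_world_size()    # turn the elementwise sum into an average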
Example No. 9
        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
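
A closure like allreduce_params is typically invoked once per iteration, after the backward pass and before the optimizer step; the names in this call-pattern sketch (wrapper, criterion, optimizer, and so on) are placeholders, not taken from the original class:

wrapper.needs_reduction = True                      # reset by allreduce_params itself
loss = criterion(wrapper.module(inputs), targets)
loss.backward()
allreduce_params()   # flatten, all_reduce, average, and copy back the gradients
optimizer.step()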
Example No. 10
    def sync(self, model, iter_id):
        if self.phase > 0:
            for p in model.parameters():
                dist.all_reduce(p.grad, op=dist.reduce_op.SUM, group=self.group)
                p.grad /= self.world_size
            return

        grad = [p.grad for p in model.parameters()]
        flattened_grad = torch.tensor([])
        for g in grad:
            frag = g.view(-1)
            flattened_grad = torch.cat((flattened_grad, frag),0)
#        flattened_grad = torch.cat([p.grad.data.view(-1) for p in model.parameters()], 0)

        # filter gradient with mask
        filtered_grad = flattened_grad * self.synchronization_mask.float()
        self.local_ac_grad += filtered_grad
        valid_grad = filtered_grad

        if iter_id % self.sync_frequency == 0:
            print('sync now:', iter_id)
            # squeeze those parameters to be communicated into one tensor
            transmitted_grad = torch.masked_select(self.local_ac_grad, self.synchronization_mask)
            dist.all_reduce(transmitted_grad, op=dist.reduce_op.SUM, group=self.group)
            transmitted_grad /= self.world_size

            # unsqueeze transmitted grad to full length flattened grad
            self.global_ac_grad = torch.zeros(self.flattened_shape)
            self.global_ac_grad[self.synchronization_mask] = transmitted_grad
            print('global_ac_grad[0:3]:', self.global_ac_grad[0:3])
            print('5081:', self.global_ac_grad[5081])

            # update synchronization mask & prepare gradient
            self.update_synchronization_mask()
            valid_grad = filtered_grad + self.global_ac_grad - self.local_ac_grad
            self.local_ac_grad = torch.zeros(self.flattened_shape)

        # unwrap to high-dimension gradient
        for i, p in enumerate(model.parameters()):
            p.grad.data = valid_grad[self.frag_index_list[i][0]:self.frag_index_list[i][1]].view(self.frag_shape_list[i])
#        model.parameters.grad.data
        if sum(self.synchronization_mask) == 0:
            self.phase = 1
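
The compress-and-restore step above (masked_select before communication, boolean-index assignment afterwards) can be shown in isolation; the values below are toy data, not from the original training run:

import torch

flat_grad = torch.tensor([0.5, -1.0, 2.0, 0.0, 3.0])
mask = torch.tensor([1, 0, 1, 0, 1]).bool()

# keep only the "active" entries for communication...
transmitted = torch.masked_select(flat_grad, mask)   # tensor([0.5, 2.0, 3.0])

# ...then scatter them back into a full-length buffer
restored = torch.zeros_like(flat_grad)
restored[mask] = transmitted
# restored -> tensor([0.5, 0.0, 2.0, 0.0, 3.0])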
Example No. 11
    def sync(self, iter_id):
        if self.phase > 0:
            for p in self.model.parameters():
                dist.all_reduce(p.data,
                                op=dist.reduce_op.SUM,
                                group=self.group)
                p.data /= self.world_size
        elif iter_id == self.next_sync_iter_id:
            # currently, for simplicity, the full model is transmitted, but it could be compressed later
            for p in self.model.parameters():
                dist.all_reduce(p.data,
                                op=dist.reduce_op.SUM,
                                group=self.group)
                p.data /= self.world_size

            self.update_frozen_lengths()
            self.last_model_copy = copy.deepcopy(self.model)

            # update the round id and unfreeze the corresponding parameters
            self.round_id += 1
            self.synchronization_mask = torch.where(
                self.defrozen_round_ids == self.round_id,
                torch.tensor(1).byte(),
                torch.tensor(0).byte())

            # adjust the synchronization frequency when necessary
            stable_ratio = 1 - float(sum(
                self.synchronization_mask.int())) / self.model_size
            # print(self.synchronization_mask)
            if stable_ratio > self.comp_comm_ratio:
                self.sync_frequency = (self.sync_frequency + 1) // 2
            else:
                # if few parameters are stable, reduce the synchronization frequency (i.e., lengthen the interval)
                self.sync_frequency += self.change_frequency_step
            self.next_sync_iter_id += self.sync_frequency
            print('at iteration:', iter_id, '; stable ratio:', stable_ratio,
                  '; new synchronization frequency:', self.sync_frequency)
            return True
        return False
Example No. 12
def all_reduce(model, world_size, group):
    for param in model.parameters():
        dist.all_reduce(param.data, op=dist.reduce_op.SUM, group=group)
        param.data /= world_size
    return model
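
A helper like this is usually called every few iterations to average model weights across workers; in the sketch below, loader and train_step are placeholders for the local training code, and the process group is assumed to be initialized by the launcher:

import torch.distributed as dist

world_size = dist.get_world_size()
group = dist.new_group(list(range(world_size)))

for step, batch in enumerate(loader):
    train_step(model, batch)          # local forward/backward/update
    if step % 10 == 0:                # periodically average parameters across workers
        model = all_reduce(model, world_size, group)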
Example No. 13
    def sync(self, model, iter_id):
        if self.phase > 0:
            for p in model.parameters():
                dist.all_reduce(p.grad,
                                op=dist.reduce_op.SUM,
                                group=self.group)
                p.grad /= self.world_size
            return

        grad = [p.grad for p in model.parameters()]
        flattened_grad = torch.tensor([])
        for g in grad:
            frag = g.view(-1)
            flattened_grad = torch.cat((flattened_grad, frag), 0)
#        flattened_grad = torch.cat([p.grad.data.view(-1) for p in model.parameters()], 0)

        # filter gradient with mask
        # filtered_grad = flattened_grad * self.synchronization_mask.float()
        filtered_grad = torch.zeros(self.flattened_shape)
        #        filtered_grad[self.synchronization_mask] = flat
        #        filtered_grad = torch.where(self.synchronization_mask > 0, flattened_grad, filtered_grad)
        filtered_grad = torch.where(self.fixed_synchronization_mask > 0,
                                    flattened_grad, filtered_grad)
        self.local_ac_grad += filtered_grad
        valid_grad = filtered_grad

        if iter_id % self.sync_frequency == 0:
            print('sync now:', iter_id, '; phase:', self.phase)
            # squeeze those parameters to be communicated into one tensor
            #            transmitted_grad = torch.masked_select(self.local_ac_grad, self.synchronization_mask)
            transmitted_grad = torch.masked_select(
                self.local_ac_grad, self.fixed_synchronization_mask)
            dist.all_reduce(transmitted_grad,
                            op=dist.reduce_op.SUM,
                            group=self.group)
            #            numpy.save(str(iter_id)+'-tm_grad', transmitted_grad.detach().numpy())
            transmitted_grad /= self.world_size

            # unsqueeze transmitted grad to full length flattened grad
            self.global_ac_grad = torch.zeros(self.flattened_shape)
            #            self.global_ac_grad[self.synchronization_mask] = transmitted_grad
            self.global_ac_grad[
                self.fixed_synchronization_mask] = transmitted_grad
            print('global_ac_grad[0:3]:', self.global_ac_grad[0:3])
            print('5081:', self.global_ac_grad[5081])

            # update synchronization mask & prepare gradient
            self.update_synchronization_mask()
            valid_grad = filtered_grad + self.global_ac_grad - self.local_ac_grad
            #            numpy.save(str(iter_id)+'-gg', self.global_ac_grad.detach().numpy())
            #            numpy.save(str(iter_id)+'-lg', self.local_ac_grad.detach().numpy())
            #            numpy.save(str(iter_id)+'-fg', filtered_grad.detach().numpy())
            #            numpy.save(str(iter_id)+'-vg', valid_grad.detach().numpy())
            #            numpy.save(str(iter_id)+'-model', numpy.asarray([i.detach().numpy() for i in list(model.parameters())]))
            self.local_ac_grad = torch.zeros(self.flattened_shape)
            #            exit()
            #            numpy.save(str(iter_id)+'-model-y', numpy.asarray([i.detach().numpy() for i in list(model.parameters())]))

            if (iter_id // self.sync_frequency) % 50 == 0:
                self.synchronization_mask = torch.ones(
                    self.flattened_shape).byte()

        # unwrap to high-dimension gradient
        for i, p in enumerate(model.parameters()):
            p.grad.data = valid_grad[
                self.frag_index_list[i][0]:self.frag_index_list[i][1]
            ].view(self.frag_shape_list[i])


#        model.parameters.grad.data

        if iter_id % self.sync_frequency == 0:
            active_ratio = float(
                sum(self.synchronization_mask.int())) / self.flattened_shape[0]
            print('stable ratio:', 1 - active_ratio)
            if self.phase == 0 and active_ratio < self.change_phase_threshold:
                self.phase = 0
                return True
        return False