Example #1
    def _test_reduce_helper(self, group, group_id, rank, op, master_value, worker_value, expected_value):
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                dist.reduce(tensor, src, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                dist.reduce(tensor, src, op, group_id)

        self._barrier()
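These test helpers rely on a _build_tensor utility from the PyTorch distributed test suite that is not shown here. A minimal sketch of such a helper, assuming it builds a cubic tensor filled with a given value, could be:

import torch

def _build_tensor(size, value=None):
    # Cube of shape (size, size, size) filled with `value`, defaulting to `size`.
    if value is None:
        value = size
    return torch.full((size, size, size), float(value))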
Example #2
    def _test_reduce_helper(self, group, group_id, rank, op, master_value,
                            worker_value, expected_value, cuda=False, rank_to_GPU=None):
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.reduce(tensor, src, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.reduce(tensor, src, op, group_id)

        self._barrier()
Example #3
def reduce_loss_dict(loss_dict):
    world_size = get_world_size()

    if world_size < 2:
        return loss_dict

    with torch.no_grad():
        keys = []
        losses = []

        for k in sorted(loss_dict.keys()):
            keys.append(k)
            losses.append(loss_dict[k])

        losses = torch.stack(losses, 0)
        dist.reduce(losses, dst=0)

        if dist.get_rank() == 0:
            losses /= world_size

        reduced_losses = {k: v for k, v in zip(keys, losses)}

    return reduced_losses
Example #4
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #5
def reduce_dict(input_dict, average=True):
    world_size = get_world_size()

    if world_size < 2:
        return input_dict

    with torch.no_grad():
        keys = []
        values = []

        for k in sorted(input_dict.keys()):
            keys.append(k)
            values.append(input_dict[k])

        values = torch.stack(values, 0)
        dist.reduce(values, dst=0)

        if dist.get_rank() == 0 and average:
            values /= world_size

        reduced_dict = {k: v for k, v in zip(keys, values)}

    return reduced_dict
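A usage sketch for a reduce_dict-style helper inside a training loop; the loss names are illustrative assumptions, and torch / torch.distributed are assumed to be imported and initialized:

loss_dict = {"loss_cls": torch.tensor(0.7), "loss_box": torch.tensor(0.3)}
reduced = reduce_dict(loss_dict)  # rank 0 now holds the per-key averages
if dist.get_rank() == 0:
    total = sum(v.item() for v in reduced.values())
    print({k: v.item() for k, v in reduced.items()}, total)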
Example #6
            def reduce(*_: Any) -> None:
                # Skip gradient reduction, do not alter status flags
                if not self.should_accumulate_grads and self._grad_to_be_reduced[
                        index]:
                    assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

                    if not self._bucket_flush_callback_set:
                        Variable._execution_engine.queue_callback(
                            self._flush_buckets)
                        self._bucket_flush_callback_set = True

                    # Make sure that this is not fired twice
                    self._grad_to_be_reduced[index] = False
                    param.grad.mul_(self.world_size_scaling)

                    if self.reduce_fp16:
                        param.grad.data = param.grad.data.half()

                    # Future work includes clearing up the buffer if possible
                    def cleanup() -> None:
                        if dst_rank != self.global_rank:
                            param.grad = None
                        else:
                            assert param.grad is not None
                            param.grad.data = param.grad.data.to(
                                dtype=param.dtype)

                    # Async reduce for this buffer, log the future
                    dst_global_rank = OSS.get_global_rank(
                        self.process_group, dst_rank)

                    self._work_handles.append(
                        Workhandle(
                            handle=dist.reduce(tensor=param.grad.data,
                                               dst=dst_global_rank,
                                               group=self.process_group,
                                               async_op=True),
                            callback=cleanup,
                        ))
                    self._reduced_grads += 1

                    # Opportunistically try to empty the queue
                    self._try_consume_work_handle()

                    # If all the reduce operations have been called,
                    # make sure that all the asynchronous calls have concluded before moving on
                    # and execute the delayed actions (release gradients, unroll the buckets)
                    if self._reduced_grads == self._reduced_grads_max:
                        self._consume_work_handles()
Example #7
def main():
    args = parser.parse_args()
    if not args.cuda:
        args.dist_backend = 'gloo' # nccl doesn't work on CPUs

    dist.init_process_group(backend=args.dist_backend, init_method='env://')
    model = Model() 
    if args.cuda:
        print_status("Using GPU")
        torch.cuda.set_device(args.local_rank)
        model.cuda()
    else:
        print_status("Using CPU")
    
    print_status("initialising DDP model")
    if args.cuda:
        ddp_model = DDP(model, device_ids=[torch.cuda.current_device()])
    else:
        ddp_model = DDP(model)
    
    num_batches = args.batches 
    if not args.weak_scale:
        print_status("Strong scaling")
        num_batches =  num_batches // dist.get_world_size() 
    batch_size = args.batch_size 
    start_time = time.time()
    
    for _ in range(num_batches):
        # create random batch 
        x = torch.randn(batch_size, 1, 100, 100)
        if args.cuda:
            x = x.cuda()
        y = ddp_model(x)
        rand_grad = torch.randn_like(y)
        y.backward(rand_grad)

    end_time = time.time() 
    avg_time_tensor = torch.FloatTensor([end_time - start_time])
    min_time_tensor = torch.FloatTensor([end_time - start_time])
    max_time_tensor = torch.FloatTensor([end_time - start_time])
    
    if args.cuda:
        avg_time_tensor = avg_time_tensor.cuda()
        min_time_tensor = min_time_tensor.cuda()
        max_time_tensor = max_time_tensor.cuda()

    dist.reduce(avg_time_tensor, 0, dist.reduce_op.SUM)
    dist.reduce(min_time_tensor, 0, dist.reduce_op.MIN)
    dist.reduce(max_time_tensor, 0, dist.reduce_op.MAX)

    avg_time_tensor /= dist.get_world_size()

    time_min, time_avg, time_max = min_time_tensor.item(), avg_time_tensor.item(), max_time_tensor.item()

    if dist.get_rank() == 0:
        print_status("Time : Min {} Avg {} Max {}".format(time_min, time_avg, time_max))
Example #8
def reduce_scatter(tensor,
                   tensor_list,
                   op=ReduceOp.SUM,
                   group=dist.group.WORLD,
                   async_op=False):
    rank = dist.get_rank(group)
    if tensor is None:
        tensor = tensor_list[rank]
    if tensor.dim() == 0:
        tensor = tensor.view(-1)
    tensor[:] = tensor_list[rank]
    ops = []
    for i in range(dist.get_world_size(group)):
        if i == rank:
            tmp = dist.reduce(tensor, rank, op, group, async_op=True)
        else:
            tmp = dist.reduce(tensor_list[i], i, op, group, async_op=True)
        ops.append(tmp)

    oplist = AsyncOpList(ops)
    if async_op:
        return oplist
    else:
        oplist.wait()
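A minimal usage sketch for this reduce_scatter wrapper (shapes and values are assumptions). After the call, each rank's output tensor holds the element-wise sum, across all processes, of that rank's slot in tensor_list:

world_size = dist.get_world_size()
rank = dist.get_rank()
tensor_list = [torch.full((4,), float(rank)) for _ in range(world_size)]  # this rank's contributions
out = torch.empty(4)
reduce_scatter(out, tensor_list)
# out == sum(range(world_size)) on every rank, e.g. tensor([1., 1., 1., 1.]) with 2 processes.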
Example #9
    def _test_reduce_helper(self,
                            group,
                            group_id,
                            rank,
                            op,
                            master_value,
                            worker_value,
                            expected_value,
                            cuda=False):
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                if cuda:
                    tensor = tensor.cuda()
                dist.reduce(tensor, src, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1,
                                                       expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                if cuda:
                    tensor = tensor.cuda()
                dist.reduce(tensor, src, op, group_id)

        self._barrier()
Example #10
    def reduce(self, input, dst, op=ReduceOp.SUM, batched=False):
        """Reduces the input data across all parties."""
        assert dist.is_initialized(), "initialize the communicator first"

        if batched:
            assert isinstance(input, list), "batched reduce input must be a list"
            reqs = []
            result = [x.clone().data for x in input]
            for tensor in result:
                reqs.append(
                    dist.reduce(
                        tensor.data, dst, op=op, group=self.main_group, async_op=True
                    )
                )
            for req in reqs:
                req.wait()
        else:
            assert torch.is_tensor(
                input.data
            ), "unbatched input for reduce must be a torch tensor"
            result = input.clone()
            dist.reduce(result.data, dst, op=op, group=self.main_group)

        return result if dst == self.get_rank() else None
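A hedged usage sketch for this communicator-style reduce (the comm instance and input tensor are assumptions); the result is only returned on the destination rank, None elsewhere:

x = torch.ones(3)
summed = comm.reduce(x, dst=0)                           # rank 0: each element equals the world size; others: None
batched = comm.reduce([x, 2 * x], dst=0, batched=True)   # rank 0: list of reduced tensors; others: None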
Example #11
def test(model, criterion, epoch, test_loader, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    loss_acc1 = torch.zeros(2).to(device)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        loss_acc1[0] = test_loss / len(test_loader)
        loss_acc1[1] = 100.0 * correct / total
        #print('rank ', dist.get_rank(), ' test loss ', loss_acc1[0].item(),
        #        ' test acc1 ', loss_acc1[1].item())
        dist.reduce(tensor=loss_acc1, dst=0, op=dist.ReduceOp.SUM)
        loss_acc1.div_(dist.get_world_size() * 1.0)
        return loss_acc1[0].item(), loss_acc1[1].item()
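Only the destination rank is guaranteed to hold the summed result after dist.reduce, so the values this function returns on non-zero ranks are effectively local. A hedged guard at the call site (variable names are assumptions) could be:

loss, acc1 = test(model, criterion, epoch, test_loader, device)
if dist.get_rank() == 0:
    print('epoch {}: val loss {:.4f}, acc@1 {:.2f}%'.format(epoch, loss, acc1))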
Example #12
def run():

    modell = model.CNN()
    #modell = model.AlexNet()

    size = dist.get_world_size()
    rank = dist.get_rank()

    group_list = []
    for i in range(size):
        group_list.append(i)
    group = dist.new_group(group_list)

    while(1):

        for param in modell.parameters():
            #for dst in range(1, size):
                #dist.send(param.data, dst=dst)
            dist.broadcast(param.data, src=0, group=group)

        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.reduce_op.SUM, group=group)
            param.data = tensor_temp / (size-1)
Example #13
def allreduce_instrumented(timer, tensor, group):
    with timer('reduce'):
        rank = dist.get_rank()
        chunks = list(tensor.view(dist.get_world_size(), -1))
        reqs = [
            dist.reduce(chunk,
                        i,
                        op=dist.ReduceOp.SUM,
                        group=group,
                        async_op=True) for i, chunk in enumerate(chunks)
        ]
        [req.wait() for req in reqs]
    with timer('all_gather'):
        chunk = chunks[rank]
        dist.all_gather(chunks, chunk, group=group)
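This helper decomposes an all-reduce into per-chunk reduces (a reduce-scatter in effect) followed by an all-gather, and it assumes tensor.numel() is divisible by the world size. The single built-in collective it mirrors, for comparison, is:

dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)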
Example #14
def valid(args, encoder, fc, validloader, logger):
    with torch.no_grad():
        encoder.eval()
        fc.eval()

        correct = torch.tensor([0.0]).cuda()
        total = torch.tensor([0.0]).cuda()
        for data in validloader:
            img, label = data[:2]
            img, label = img.cuda(), label.cuda()

            feature = encoder(img)
            s = fc(feature)

            # acc
            _, predicted = torch.max(s.data, 1)
            correct += predicted.eq(label.data).sum()
            total += label.size(0)
        dist.reduce(correct, dst=0, op=dist.ReduceOp.SUM)
        dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)
        if args.local_rank == 0:
            logger.info('valid-acc:{:.2%}'.format(correct.cpu().item() /
                                                  total.cpu().item()))
            logger.info('--------------------------')
Example #15
def pretrain_validation(args, index, model):
    if args.validation_data_path_prefix is None:
        return

    config = args.config
    logger = args.logger

    logger.info(
        f"Validation micro batch size: {args.train_micro_batch_size_per_gpu}")

    model.eval()
    dataset = PreTrainingDataset(
        args.tokenizer,
        os.path.join(args.validation_data_path_prefix,
                     config['validation']['path']), args.logger,
        args.max_seq_length, index, PretrainDataType.VALIDATION,
        args.max_predictions_per_seq)
    data_batches = get_dataloader(args, dataset, eval_set=True)
    eval_loss = 0
    nb_eval_steps = 0
    for batch in tqdm(data_batches):
        batch = tuple(t.to(args.device) for t in batch)
        tmp_eval_loss = model.network(batch, log=False)
        dist.reduce(tmp_eval_loss, 0)
        # Reduce to get the loss from all the GPUs
        tmp_eval_loss = tmp_eval_loss / dist.get_world_size()
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    logger.info(f"Validation Loss for epoch {index + 1} is: {eval_loss}")
    if (not args.no_cuda
            and dist.get_rank() == 0) or (args.no_cuda
                                          and args.local_rank == -1):
        args.summary_writer.add_scalar(f'Validation/Loss', eval_loss,
                                       index + 1)
    return
Example #16
def run(args, encoder, fc, criterion, optimizer, scheduler, trainloader, validloader, logger):
    # train
    for i_epoch in range(args.max_epoch):
        encoder.train()
        fc.train()
        trainloader.sampler.set_epoch(i_epoch)  # required so the DistributedSampler reshuffles each epoch

        rank = dist.get_rank()
        correct = torch.tensor(0.0).cuda(rank)
        total = torch.tensor(0.0).cuda(rank)
        start_time = torch.tensor(time.time()).cuda(rank)
        for i_iter, data in enumerate(trainloader):
            img, label = data[:2]
            img, label = img.cuda(rank), label.cuda(rank)

            f = encoder(img)
            s = fc(f)
            loss = criterion(s, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # acc
            _, predicted = torch.max(s.data, 1)
            correct += predicted.eq(label.data).sum()
            total += s.shape[0]
            eta = (time.time() - start_time) / (i_iter + 1) * (len(trainloader) * (
                    args.max_epoch - i_epoch) - i_iter) / 3600

            # print
            dist.reduce(loss, dst=0, op=dist.ReduceOp.SUM)
            dist.reduce(correct, dst=0, op=dist.ReduceOp.SUM)
            dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)
            dist.reduce(eta, dst=0, op=dist.ReduceOp.SUM)
            if rank == 0:
                logger.info('loss:{:.4f} '
                      'acc:{:.2%} '
                      'ETA:{:.2f}h'.format(
                    loss.cpu().item() / args.world_size,
                    correct.cpu().item() / total.cpu().item(),
                    eta.cpu().item() / args.world_size))

        valid(args, encoder, fc, validloader, logger)
Example #17
def compute_train_stats_slave(x, y, Z_new, Lambda, cols, num_classes, group):
    train_values, train_indices = torch.max(predict(x, y, Z_new), 1)
    total_accu = accuracy(
        y.view(y.shape[0]).data.cpu().numpy(),
        train_indices.data.cpu().numpy())
    total_cost = Softmax_Fx(x, y, Z_new, Lambda, cols, num_classes)
    total_cost_tensor = total_cost.data.cpu()
    total_accu_tensor = torch.DoubleTensor([total_accu])
    total_len_tensor = torch.DoubleTensor([x.size()[0]])
    dist.reduce(total_cost_tensor, 0, dist.reduce_op.SUM, group)
    dist.reduce(total_accu_tensor, 0, dist.reduce_op.SUM, group)
    dist.reduce(total_len_tensor, 0, dist.reduce_op.SUM, group)
Example #18
            def bucket_flush(*_: Any) -> None:
                assert self._bucket_list is not None
                handle = None

                for bucket in self._bucket_list:
                    if not bucket.sent:
                        # Reduce the bucket. Some parameters went unused and this bucket was not flushed
                        bucket.buffer.mul_(self.world_size_scaling)
                        bucket.sent = True
                        handle = dist.reduce(
                            tensor=bucket.buffer,
                            dst=bucket.destination,
                            group=self.process_group,
                            async_op=True,
                        )

                # Only wait on the last handle
                if handle:
                    handle.wait()
Example #19
    def _flush_buckets(self) -> None:
        if self._bucket_list is not None:
            last_handle = None
            for bucket in self._bucket_list:
                if not bucket.sent:
                    # Normalize the bucket in one go
                    bucket.buffer.mul_(self.world_size_scaling)

                    # Reduce the bucket
                    last_handle = dist.reduce(
                        tensor=bucket.buffer,
                        dst=bucket.destination,
                        group=self.process_group,
                        async_op=True,
                    )
                    bucket.sent = True

            if last_handle is not None:
                last_handle.wait()
Example #20
    def _flush_reduce_calls(self) -> None:
        if self._bucket_list is not None:
            for bucket in self._bucket_list:
                if not bucket.sent:
                    # Normalize the bucket in one go
                    bucket.buffer.mul_(self.world_size_scaling)

                    # Reduce the bucket
                    self._work_handles.append(
                        Workhandle(
                            handle=dist.reduce(
                                tensor=bucket.buffer, dst=bucket.destination, group=self.process_group, async_op=True,
                            ),
                            callback=None,
                        )
                    )
                    bucket.sent = True

        self._consume_work_handles()
Example #21
def get_train_stats_master(Lambda, Z_term, cols, num_classes, size, group):
    total_cost_tensor = torch.DoubleTensor([0.0])
    total_accu_tensor = torch.DoubleTensor([0.0])
    total_len_tensor = torch.DoubleTensor([0.0])
    dist.reduce(total_cost_tensor, 0, dist.reduce_op.SUM, group)
    dist.reduce(total_accu_tensor, 0, dist.reduce_op.SUM, group)
    dist.reduce(total_len_tensor, 0, dist.reduce_op.SUM, group)
    total_cost = total_cost_tensor
    train_cost = total_cost + (
        (Lambda / Variable(torch.DoubleTensor([2.0]).cuda())) *
        torch.dot(Z_term.view(cols * (num_classes)),
                  Z_term.view(cols * (num_classes)))).data.cpu()
    train_accu = total_accu_tensor / (size - 1)
    return train_accu, train_cost / total_len_tensor.numpy()[0]
Example #22
            def reduce(*_: Any) -> None:
                # Skip gradient reduction, do not alter status flags
                if not self.should_accumulate_grads and self._grad_to_be_reduced[
                        index]:
                    assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

                    if not self._bucket_flush_callback_set:
                        Variable._execution_engine.queue_callback(
                            self._flush_buckets)
                        self._bucket_flush_callback_set = True

                    # Make sure that this is not fired twice
                    self._grad_to_be_reduced[index] = False
                    bucket = self.buckets[param.device][dst_rank]
                    bucket.params_checked_in += 1

                    if bucket.full():
                        # Normalize the bucket in one go
                        bucket.buffer.mul_(self.world_size_scaling)

                        # Reduce the bucket
                        bucket.sent = True
                        self._work_handles.append(
                            Workhandle(
                                handle=dist.reduce(
                                    tensor=bucket.buffer,
                                    dst=bucket.destination,
                                    group=self.process_group,
                                    async_op=True,
                                ),
                                callback=None,
                            ))
                        self._reduced_grads += 1

                    # Opportunistically try to empty the queue
                    self._try_consume_work_handle()

                    # If all the reduce operations have been called,
                    # make sure that all the asynchronous calls have concluded before moving on
                    # and execute the delayed actions (release gradients, unroll the buckets)
                    if self._reduced_grads == self._reduced_grads_max:
                        self._consume_work_handles()
Example #23
            def reduce(*_: Any) -> None:
                # Skip gradient reduction, do not alter status flags
                if not self.should_accumulate_grads and self._grad_to_be_reduced[
                        index]:
                    assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

                    if not self._bucket_flush_callback_set:
                        Variable._execution_engine.queue_callback(
                            self._flush_reduce_calls)
                        self._bucket_flush_callback_set = True

                    # Make sure that this is not fired twice
                    self._grad_to_be_reduced[index] = False
                    param.grad.mul_(self.world_size_scaling)

                    if self.reduce_fp16:
                        param.grad.data = param.grad.data.half()

                    # Future work includes clearing up the buffer if possible
                    def cleanup() -> None:
                        if dst_rank != self.global_rank:
                            param.grad = None
                        else:
                            assert param.grad is not None
                            param.grad.data = param.grad.data.to(
                                dtype=param.dtype)

                    # Async reduce for this buffer, log the future
                    self._work_handles.append(
                        Workhandle(
                            handle=dist.reduce(
                                tensor=param.grad.data,
                                dst=self._local_to_global_rank[dst_rank],
                                group=self.process_group,
                                async_op=True,
                            ),
                            callback=cleanup,
                        ))

                    # Opportunistically try to empty the queue, free memory
                    self._try_consume_work_handle()
Example #24
    def reduce(self, collectiveArgs, retFlag=False):
        if collectiveArgs.reduce_qcomm != 32:
            quantized = _downcast(
                collectiveArgs.ipTensor, collectiveArgs.allreduce_qcomm
            )
        else:
            quantized = collectiveArgs.ipTensor
        retObj = dist.reduce(
            quantized,
            dst=collectiveArgs.dst,
            op=collectiveArgs.op,
            group=collectiveArgs.group,
            async_op=collectiveArgs.asyncOp,
        )  # synchronicity is maintained in runColl
        if collectiveArgs.asyncOp:
            retObj = retObj.get_future().then(_dequantize)
        else:
            retObj = _dequantize(quantized)

        if retFlag:
            return retObj
Example #25
        def reduce_bucket(*_: Any) -> None:
            # Skip gradient reduction, do not alter status flags
            if not self.should_accumulate_grads and self._grad_to_be_reduced[
                    index]:
                assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

                # Make sure that this is not fired twice
                self._grad_to_be_reduced[index] = False

                # Copy to the flat buffer, update the buffer state
                bucket = optimizer.buckets[param.device][dst_rank]

                assert bucket.append(
                    param, use_gradient=True
                ), "Bucket overflow: max %s - current %s - adding %s" % (
                    bucket.max_size,
                    bucket.current_offset,
                    param.grad.numel(),
                )

                if bucket.full():
                    bucket.buffer /= self.world_size

                    optimizer.work_handles.append(
                        Workhandle(
                            handle=dist.reduce(
                                tensor=bucket.buffer,
                                dst=dst_rank,
                                group=self.process_group,
                                async_op=True,
                            ),
                            callback=bucket.unroll,
                        ))

                    # If all the reduce operations have been called, add the gatekeeper
                    if len(optimizer.work_handles
                           ) == optimizer._max_work_handles:
                        gatekeeper()
Example #26
            def reduce(*_: Any) -> None:
                # Skip gradient reduction, do not alter status flags

                if not self.should_accumulate_grads and self._grad_to_be_reduced[
                        index]:
                    assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

                    if not self._bucket_flush_callback_set:
                        Variable._execution_engine.queue_callback(
                            self._flush_reduce_calls)
                        self._bucket_flush_callback_set = True

                    # Make sure that this is not fired twice
                    self._grad_to_be_reduced[index] = False
                    bucket = self.buckets[param.device][dst_rank]
                    bucket.params_checked_in += 1

                    if bucket.all_checked_in:
                        assert bucket.buffer is not None

                        # Normalize the bucket in one go
                        bucket.buffer.mul_(self.world_size_scaling)

                        # Reduce the bucket
                        bucket.sent = True
                        self._work_handles.append(
                            Workhandle(
                                handle=dist.reduce(
                                    tensor=bucket.buffer,
                                    dst=bucket.destination,
                                    group=self.process_group,
                                    async_op=True,
                                ),
                                callback=None,
                            ))

                    # Opportunistically try to empty the queue
                    self._try_consume_work_handle()
Example #27
    def _handle_trailing_buckets(self, flush_type: BucketFlush) -> None:
        """
        Go through the buckets, flush them if not already empty
        .. warning: Could be that a bucket flush was already requested, needs to be handled carefully
        """

        for bucket_list in self.buckets.values():
            for bucket in bucket_list:
                if bucket.current_offset > 0:
                    self.work_handles.append(
                        Workhandle(
                            handle=dist.broadcast(
                                tensor=bucket.buffer, src=bucket.global_ref_rank, group=self.group, async_op=True,
                            )
                            if flush_type == BucketFlush.Broadcast
                            else dist.reduce(
                                tensor=bucket.buffer, dst=bucket.global_ref_rank, group=self.group, async_op=True,
                            ),
                            callback=bucket.unroll,
                        )
                    )

        self._consume_work_handles()
Example #28
    def reduce(self, collectiveArgs, retFlag=False, pair=False):
        # pair=True mode does not support quantization
        if collectiveArgs.reduce_qcomm != 32 and not pair:
            assert collectiveArgs.ipTensor.dtype == torch.float32
            with paramProfile(
                timer=collectiveArgs.quant_time,
                description="# PARAM: Reduce quantization #",
            ):
                quantized = _downcast(
                    collectiveArgs.ipTensor, collectiveArgs.allreduce_qcomm
                )
        else:
            quantized = (
                collectiveArgs.ipTensor if not pair else collectiveArgs.ipTensor_pair
            )
        retObj = dist.reduce(
            quantized,
            dst=collectiveArgs.srcOrDst,
            op=collectiveArgs.op,
            group=collectiveArgs.group,
            async_op=collectiveArgs.asyncOp,
        )  # synchronicity is maintained in runColl
        if collectiveArgs.reduce_qcomm != 32 and not pair:
            if collectiveArgs.asyncOp:
                retObj = retObj.get_future().then(_dequantize)
            else:
                with paramProfile(
                    timer=collectiveArgs.dequant_time,
                    description="# PARAM: Reduce de-quantization #",
                ):
                    retObj = _dequantize(quantized)

        if collectiveArgs.asyncOp:
            collectiveArgs.waitObj.append(retObj)

        if retFlag:
            return retObj
Example #29
def run():
    size = dist.get_world_size()
    rank = dist.get_rank()

    model = Model()
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
    model.train()
    train_set, samples = create_data_loader()

    start = monotonic()
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        # for i, (data, target) in enumerate(train_set):
        for data, target in train_set:
            # if (rank == MASTER):
            #     print('Epoch: {}, Minibatch: {}'.format(
            #         epoch + 1, i + 1), end='\r')
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            criterion = nn.CrossEntropyLoss()
            loss = criterion(output, target)
            epoch_loss += loss
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # if (rank == MASTER):
        #     print()
    end = monotonic()
    avg_time = torch.Tensor([end - start])
    epoch_loss *= samples / (math.ceil(samples / BATCH_SIZE))
    samples = torch.Tensor([samples])
    dist.reduce(epoch_loss, MASTER, op=dist.reduce_op.SUM)
    dist.reduce(samples, MASTER, op=dist.reduce_op.SUM)
    dist.reduce(avg_time, MASTER, op=dist.reduce_op.SUM)
    if (rank == MASTER):
        # print(float(samples))
        # print(float(epoch_loss))
        epoch_loss = float(epoch_loss) / float(samples)
        avg_time = float(avg_time) / (EPOCHS * size)
        print('{:.4f}, {:.4f}'.format(avg_time, epoch_loss))
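The average_gradients call above is not defined in this snippet; a minimal sketch of the usual tutorial-style implementation (an assumption about this particular script) is:

def average_gradients(model):
    # All-reduce every gradient and divide by the number of processes.
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size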
Example #30
    def accumulate_metric(self, prediction, gt, accumulator, distributed=False):
        hist, correct_pixels, valid_pixels = compute_hist(prediction, gt, self.cfg.MODEL.NET1_CLASSES, 255)

        if distributed:  # gather metric results
            hist = torch.tensor(hist).cuda()
            correct_pixels = torch.tensor(correct_pixels).cuda()
            valid_pixels = torch.tensor(valid_pixels).cuda()

            # aggregate result to rank 0
            dist.reduce(hist, 0, dist.ReduceOp.SUM)
            dist.reduce(correct_pixels, 0, dist.ReduceOp.SUM)
            dist.reduce(valid_pixels, 0, dist.ReduceOp.SUM)

            hist = hist.cpu().numpy()
            correct_pixels = correct_pixels.cpu().item()
            valid_pixels = valid_pixels.cpu().item()

        accumulator['total_hist'] = accumulator.get('total_hist', 0.) + hist
        accumulator['total_correct_pixels'] = accumulator.get('total_correct_pixels', 0.) + correct_pixels
        accumulator['total_valid_pixels'] = accumulator.get('total_valid_pixels', 0.) + valid_pixels
        return accumulator
Example #31
def main(args):
    args = options.set_default_args(args)

    if args.ddp_backend == 'apex':
        from apex.parallel import DistributedDataParallel as DDP
    else:
        from torch.nn.parallel import DistributedDataParallel as DDP

    ############################################################################
    # Random seed
    ############################################################################
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    ############################################################################
    # Experiment & Logging
    ############################################################################
    if is_master(args):
        if args.resume:
            # rank-0 device creates experiment dir and log to the file
            logging = get_logger(os.path.join(args.expname, 'log.txt'),
                                 log_=not args.debug)
        else:
            # rank-0 device creates experiment dir and log to the file
            logging = create_exp_dir(args.expname, debug=args.debug)
    else:
        # other devices only log to console (print) but not the file
        logging = get_logger(log_path=None, log_=False)

    args.model_path = os.path.join(args.expname, 'model.pt')
    args.var_path = os.path.join(args.expname, 'var.pt')

    ############################################################################
    # Load data
    ############################################################################
    logging('Loading data..')
    tr_data, va_data = options.load_data(args)

    train_step = 0
    best_eval_ll = -float('inf')
    if args.resume:
        logging('Resuming from {}...'.format(args.resume))
        model, opt = torch.load(args.model_path, map_location='cpu')
        model = model.to(args.device)
        for state in opt.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(args.device)
        best_eval_ll, train_step = torch.load(args.var_path)
    else:
        logging('Building model..')
        if args.model_name in ['srnn', 'srnn_zforce', 'srnn_hier']:
            model = eval(args.model_name).Model(args.n_mix,
                                                args.d_data,
                                                args.d_emb,
                                                args.d_mlp,
                                                args.d_rnn,
                                                args.d_lat,
                                                dropout=args.dropout,
                                                n_layer=args.n_layer)
        elif args.model_name in ['rnn', 'rnn_hier']:
            model = eval(args.model_name).Model(args.n_mix,
                                                args.d_data,
                                                args.d_emb,
                                                args.d_rnn,
                                                dropout=args.dropout,
                                                n_layer=args.n_layer)
        else:
            raise ValueError('unsupported model type {}'.format(
                args.model_name))

        model = model.to(args.device)

        # create new optimizer
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    if not args.test_only:
        # criterion params and model params
        crit_params, model_params = [], []
        for n, p in model.named_parameters():
            if 'crit' in n:
                crit_params.append(p)
            else:
                model_params.append(p)

        ############################################################################
        # Distributed Data Parallel
        ############################################################################
        if args.distributed:
            if args.ddp_backend == 'apex':
                torch.cuda.set_device(args.distributed_rank)
                para_model = DDP(model)
            else:
                para_model = DDP(model,
                                 device_ids=[args.device_id],
                                 output_device=args.device_id)
        else:
            para_model = model

        ############################################################################
        # Log args
        ############################################################################
        args.n_crit_param = sum([p.nelement() for p in crit_params])
        args.n_model_param = sum([p.nelement() for p in model_params])
        args.n_param = args.n_crit_param + args.n_model_param
        if is_master(args):
            logging('=' * 100)
            for k, v in args.__dict__.items():
                logging('  - {} : {}'.format(k, v))
            logging('=' * 100)

        ############################################################################
        # Training
        ############################################################################
        # linear cosine annealing
        kld_weight = min(1., args.init_kld + train_step * args.kld_incr)

        loss_sum = torch.Tensor([0]).to(args.device)
        kld_sum = torch.Tensor([0]).to(args.device)
        nll_sum = torch.Tensor([0]).to(args.device)
        gnorm_sum = 0
        t = timeit.default_timer()
        for epoch in range(args.num_epochs):
            model.train()
            # make sure all data iterators use the same seed to shuffle data
            if args.distributed:
                np.random.seed(args.seed + epoch)

            # initialize the hidden state
            if args.pass_h:
                hidden = model.init_hidden(args.batch_size)
            else:
                hidden = None

            for x, y, mask in tr_data.get_masked_iter(shuffle=True):
                opt.zero_grad()
                ratio = 1. / torch.sum(mask)
                if args.kld:
                    nll_loss, kld_loss, hidden = para_model(x,
                                                            y,
                                                            mask=mask,
                                                            hidden=hidden)
                    nll_loss = nll_loss.sum() * ratio
                    kld_loss = kld_loss.sum() * ratio
                    train_loss = nll_loss - kld_loss * kld_weight
                    train_loss.backward()

                    total_loss = nll_loss.detach() - kld_loss.detach()
                    kld_sum += -kld_loss.detach()
                    nll_sum += nll_loss.detach()
                else:
                    nll_loss, hidden = para_model(x,
                                                  y,
                                                  mask=mask,
                                                  hidden=hidden)
                    train_loss = nll_loss.sum() * ratio
                    train_loss.backward()

                    total_loss = train_loss.detach()

                if args.clip > 0:
                    gnorm = nn.utils.clip_grad_norm_(model.parameters(),
                                                     args.clip)
                else:
                    gnorm = 0
                    for n, p in model.named_parameters():
                        param_gnorm = p.grad.data.norm(2)
                        gnorm += param_gnorm.item()**2
                    gnorm = gnorm**(1. / 2)

                opt.step()

                gnorm_sum += gnorm
                loss_sum += total_loss
                train_step += 1

                # lr & kl annealling
                kld_weight = min(1., kld_weight + args.kld_incr)
                adjust_lr(opt, train_step, args.max_step, args.lr, args.end_lr)

                # log training
                if train_step % args.log_interval == 0:
                    if args.distributed:
                        dist.reduce(loss_sum, dst=0, op=dist.ReduceOp.SUM)
                        loss_sum = loss_sum.div_(args.distributed_world_size)
                        dist.reduce(nll_sum, dst=0, op=dist.ReduceOp.SUM)
                        nll_sum = nll_sum.div_(args.distributed_world_size)
                        dist.reduce(kld_sum, dst=0, op=dist.ReduceOp.SUM)
                        kld_sum = kld_sum.div_(args.distributed_world_size)

                    if is_master(args):
                        cur_loss = loss_sum.item() / args.log_interval
                        cur_nll = nll_sum.item() / args.log_interval
                        cur_kld = kld_sum.item() / args.log_interval
                        elapsed = (timeit.default_timer() - t) / 3600
                        logging('| total hrs [{:.2f}] | epoch {} step {} ' \
                                '| lr {:8.6f}, klw {:7.5f} | LL {:>9.4f} ' \
                                '| nll_loss {:>7.4f}, kld_loss {:>8.4f} ' \
                                '| gnorm {:.4f}'.format(
                          elapsed, epoch, train_step, opt.param_groups[0]['lr'],
                          kld_weight, -cur_loss, cur_nll, cur_kld,
                          gnorm_sum / args.log_interval))

                    loss_sum = torch.Tensor([0]).to(args.device)
                    kld_sum = torch.Tensor([0]).to(args.device)
                    nll_sum = torch.Tensor([0]).to(args.device)
                    gnorm_sum = 0

                # validation
                if train_step % args.eval_interval == 0:
                    eval_ll = evaluate(va_data, model, args)
                    if is_master(args):
                        logging('-' * 120)
                        logging('Eval [{}] at step: {} | valid LL: {:>8.4f}'.
                                format(train_step // args.eval_interval,
                                       train_step, eval_ll))
                        if eval_ll > best_eval_ll:
                            best_eval_ll = eval_ll
                            if not args.debug:
                                logging('Save checkpoint. ' \
                                        'Best valid LL {:>9.4f}'.format(eval_ll))
                                torch.save([model, opt], args.model_path)
                                torch.save([best_eval_ll, train_step],
                                           args.var_path)
                        logging('-' * 120)

                # Reach maximum training step
                if train_step == args.max_step:
                    break
            if train_step == args.max_step:
                break

    eval_ll = evaluate(va_data, model, args)
    if is_master(args):
        logging('-' * 120)
        logging('Eval [{}] | step: {}, LL: {:>8.4f}'.format(
            train_step // args.eval_interval, train_step, eval_ll))
        logging('-' * 120)

    # evaluate the current model
    test_loss = evaluate(te_data, model, args)
    if is_master(args):
        logging('Test -- LL: {:>8.4f}'.format(test_loss))
Example #32
def train():
    #Launch recv td
    print("worker_id(rank)", worker_id, "  size:", str(worker_num),
          " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')

    print("Worker End Connection Initialized")
    global sub_net, sub_optimizer, device
    is_cpu_mode = False
    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    #with torch.autograd.profiler.emit_nvtx():
    while iter_n <= iteration_num:
        inputs = fake_input.to(device)
        targets = fake_target.to(device)
        outputs = sub_net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        comm_time_sta = time.time()
        para_num = 0
        ps_id = 0
        for name, parameters in sub_net.named_parameters():
            if (parameters.grad is not None):
                grad_content = parameters.grad.to("cpu")
                para_num += grad_content.numel()
                dist.reduce(tensor=grad_content,
                            dst=ps_id,
                            op=dist.ReduceOp.SUM)
                if worker_id == ps_id:
                    grad_content = grad_content / worker_num
                dist.broadcast(tensor=grad_content, src=ps_id)
                parameters.grad = grad_content.to(device)
                ps_id = (ps_id + 1) % worker_num
        comm_time_ed = time.time()
        sub_optimizer.step()
        sub_optimizer.zero_grad()
        #print("iter=",iter_n," comm_time=",str(comm_time_ed-comm_time_ed))
        iter_n = iter_n + 1

        if iter_n % 10 == 0:
            ed = time.time()
            print("iter_n=", iter_n, " time=", (ed - sta * 1.0), "comm_num=",
                  para_num)
        if (iter_n > 0 and iter_n % 10 == 0):
            cpu_node_idx = (iter_n // 10) % worker_num
            if worker_id == cpu_node_idx and is_cpu_mode == False:
                print("switch to cpu")
                os.environ["CUDA_VISIBLE_DEVICES"] = ''
                device = 'cpu'
                sub_net = sub_net.to(device)
                sub_optimizer = optim.SGD(sub_net.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          weight_decay=5e-4)
                is_cpu_mode = True
            elif (not worker_id == cpu_node_idx) and is_cpu_mode == True:
                print("switch to cuda")
                os.environ["CUDA_VISIBLE_DEVICES"] = '1'
                device = 'cuda'
                sub_net = sub_net.to(device)
                sub_optimizer = optim.SGD(sub_net.parameters(),
                                          lr=args.lr,
                                          momentum=0.9,
                                          weight_decay=5e-4)
                is_cpu_mode = False
        if iter_n == iteration_num:
            exit(0)
Example #33
elif rank == 1:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.recv(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.reduce(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.reduce(tensor, 0)
dist.barrier()

if rank == 0:
    print_header("all reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)