    def __init__(self,
                 args,
                 model,
                 criterion,
                 device_ids=None,
                 multiprocessing_method='spawn'):
        if device_ids is None:
            device_ids = tuple(range(torch.cuda.device_count()))
        super().__init__(device_ids, multiprocessing_method)

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')
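        # Move the model's parameters and buffers into shared memory so that
        # every worker process operates on the same underlying storage.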
        model = model.share_memory()
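        # Create a NCCL unique id so that all replicas join the same communicator.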
        nccl_uid = nccl.get_unique_id()
        self.criterion = criterion

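        # Asynchronously initialize one model replica per device and block
        # until every replica has finished its setup.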
        Future.gen_list([
            self.call_async(rank,
                            '_async_init',
                            args=args,
                            model=model,
                            criterion=criterion,
                            nccl_uid=nccl_uid)
            for rank in range(self.num_replicas)
        ])

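        # Gradient buffers have not been set up yet; this flag is flipped
        # once they are initialized.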
        self._grads_initialized = False
Example #2
    def __init__(self,
                 args,
                 model,
                 device_ids=None,
                 multiprocessing_method='spawn',
                 src_dict=None,
                 dst_dict=None):
        if device_ids is None:
            device_ids = tuple(range(torch.cuda.device_count()))
        super().__init__(device_ids, multiprocessing_method)

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')
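        # Share the model across processes and create a NCCL unique id so
        # that all replicas can join the same communicator.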
        model = model.share_memory()
        nccl_uid = nccl.get_unique_id()

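        # Asynchronously initialize a model replica on each device, handing the
        # source and target dictionaries to every worker, and wait for all of them.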
        Future.gen_list([
            self.call_async(rank,
                            '_async_init',
                            args=args,
                            model=model,
                            src_dict=src_dict,
                            dst_dict=dst_dict,
                            nccl_uid=nccl_uid)
            for rank in range(self.num_replicas)
        ])

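        # Record whether RL-based training is enabled and keep the parsed
        # arguments around for later use.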
        self.enable_rl = args.enable_rl
        self.args = args