class CollectiveCommunicator(object):
    def __init__(self):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib()
            self._ftlib.init(consensus="gossip", commlib="pytorch")
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected"
            )
            self._ftlib = None

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s"
                % (op, _SUPPORTED_ALLREDUCE_OPS)
            )
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            res = self._ftlib.allreduce_average(data)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(
                "FTLib is not installed. "
                "Default to succeeded for testing purposes"
            )
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def broadcast(self, data, root_ip):
        if self._ftlib is not None:
            res = self._ftlib.broadcast(data, root_ip)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(
                "FTLib is not installed. "
                "Default to succeeded for testing purposes"
            )
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        return CollectiveCommunicatorStatus.SUCCEEDED

    def has_new_worker_joining(self):
        return True
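For reference, below is a minimal usage sketch of the class above, assuming FTLib is installed and that the payload passed to allreduce is a numpy array; the names communicator and grads are illustrative and not part of the original code.

import numpy as np

# Illustrative sketch only: average a local gradient array across workers
# and inspect the returned status (the payload type is an assumption here).
communicator = CollectiveCommunicator()
grads = np.ones((4, 4), dtype=np.float32)

status, averaged = communicator.allreduce(grads, op="MEAN")
if status == CollectiveCommunicatorStatus.SUCCEEDED:
    print("allreduce succeeded")
else:
    print("allreduce failed; falling back to local gradients")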
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1

    # initialize the fault-tolerant library with consensus and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='pytorch')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            time.sleep(0.5)
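The training snippet above uses a train_loader that is not defined in the excerpt. Since the model and optimizer settings match the standard PyTorch MNIST example, a plausible loader is sketched below; treat it as an assumption rather than part of the original script.

# Assumed data loader (not shown in the excerpt): a standard torchvision
# MNIST pipeline consistent with the Net/Adadelta/StepLR setup above.
from torchvision import datasets, transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST("./data", train=True, download=True, transform=transform),
    batch_size=64,
    shuffle=True,
)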
def dummy_update():
    logging.info("dummy update")
    time.sleep(0.5)


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1
    dl = dummy_dataloader(10)

    # initialize the fault-tolerant library with consensus and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='dummy_NCCL')

    for _ in range(epochs):
        for batch in dl:
            dummy_forward()
            dummy_backward()
            if ftlib.skip_allreduce:
                logging.info("skip allreduce")
                dummy_update()
                continue
            else:
                res = ftlib.wait_weights_ready()
                if res == FTAllReduceStatus.NO_NEED:
                    logging.critical(
                        "cannot use average_gradient when there is no need"
                    )
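The helpers dummy_dataloader, dummy_forward, and dummy_backward are referenced but not defined in this excerpt. A minimal sketch, assuming they are no-op stand-ins like dummy_update above, could be:

# Assumed no-op helpers (not shown in the excerpt); they only log and sleep,
# mirroring dummy_update() so the example exercises FTLib without real training.
def dummy_dataloader(num_batches):
    return range(num_batches)


def dummy_forward():
    logging.info("dummy forward")
    time.sleep(0.5)


def dummy_backward():
    logging.info("dummy backward")
    time.sleep(0.5)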