Example #1
    def __init__(self):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib()
            self._ftlib.init(consensus="shared_storage",
                             framework="dummy_NCCL")
        else:
            self._ftlib = None
Example #2
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            connection_try_num = 0
            while True:
                try:
                    peer_list = list(self._get_peer_set(service_name))
                except Exception:
                    if (connection_try_num * 5 >
                            _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS):
                        logger.error(
                            "Cannot connect to FTLib consensus service in %s "
                            "seconds",
                            str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                        )
                        self._ftlib = None
                        return
                    # sleep for 5s and try again
                    logger.warning(
                        "Cannot connect to FTLib consensus service, "
                        "trying again.")
                    connection_try_num += 1
                    time.sleep(5)
                else:
                    break

            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": peer_list,
                    "custom_bind_addr":
                    socket.gethostbyname(socket.gethostname()),
                },
            )
            connection_try_num = 0
            while peer_list and not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                try:
                    self._ftlib.manual_join(
                        known_addr_list=list(self._get_peer_set(service_name)))
                except Exception:
                    if (connection_try_num * 5 >
                            _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS):
                        logger.error(
                            "Cannot join FTLib consensus service in %s "
                            "seconds",
                            str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                        )
                        self._ftlib = None
                        return
                    logger.warning("Cannot join FTLib consensus service, "
                                   "trying again.")
                    connection_try_num += 1
                    time.sleep(5)
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None
Example #3
    def __init__(self):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib()
            self._ftlib.init(consensus="gossip", commlib="pytorch")
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None
Example #4
class CollectiveCommunicator(object):
    def __init__(self):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib()
            self._ftlib.init(consensus="gossip", commlib="pytorch")
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s" %
                (op, _SUPPORTED_ALLREDUCE_OPS))
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            res = self._ftlib.allreduce_average(data)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning("FTLib is not installed. "
                           "Default to succeeded for testing purposes")
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def broadcast(self, data, root_ip):
        if self._ftlib is not None:
            res = self._ftlib.broadcast(data, root_ip)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning("FTLib is not installed. "
                           "Default to succeeded for testing purposes")
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        return CollectiveCommunicatorStatus.SUCCEEDED

    def has_new_worker_joining(self):
        return True
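A minimal usage sketch for the class shown in this example: the CollectiveCommunicator class, the CollectiveCommunicatorStatus enum, and the "MEAN" op come from the snippet above, while the numpy payload and the print statements are illustrative assumptions only. When FTLib is absent, the constructor's else branch leaves the communicator as a stub that logs a warning and reports success.

import numpy as np

# build the communicator; without FTLib it falls back to the
# "default to succeeded" behaviour described in the warnings above
comm = CollectiveCommunicator()

# average a local gradient-like array across workers
local_grads = np.array([0.1, 0.2, 0.3], dtype=np.float32)
status, averaged = comm.allreduce(local_grads, op="MEAN")

if status == CollectiveCommunicatorStatus.SUCCEEDED:
    print("allreduce result:", averaged)
else:
    print("allreduce failed, keeping local gradients")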
Example #5
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": list(self._get_peer_set(service_name)),
                    "custom_bind_addr":
                    socket.gethostbyname(socket.gethostname()),
                },
            )
            while not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name)))
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None
Example #6
class CollectiveCommunicator(object):
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            connection_try_num = 0
            while True:
                try:
                    peer_list = list(self._get_peer_set(service_name))
                except Exception:
                    if (connection_try_num * 5 >
                            _FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS):
                        logger.error(
                            "Cannot connect to FTLib consensus service in %s "
                            "seconds",
                            str(_FTLIB_CONSENSUS_CONNECTION_TIMEOUT_SECS),
                        )
                        self._ftlib = None
                        return
                    # sleep for 5s and try again
                    logger.info("Cannot connect to FTLib consensus service, "
                                "trying again.")
                    connection_try_num += 1
                    time.sleep(5)
                else:
                    break

            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": peer_list,
                    "custom_bind_addr":
                    socket.gethostbyname(socket.gethostname()),
                },
            )
            while peer_list and not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name)))
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None

    def tf_allreduce(self, grads, op="MEAN"):
        if grads is None:
            logger.error("Grads is required for tf_allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, grads
        # convert tf.Tensor to numpy
        numpy_data = [g.numpy() for g in grads]
        return self.allreduce(numpy_data, op)

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s" %
                (op, _SUPPORTED_ALLREDUCE_OPS))
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            status, res = self._ftlib.wait_gradients_ready(params=data)
            if (status == FTCollectiveStatus.SUCCESS
                    and res == CommLibStatus.SUCCESS
                    or status == FTCollectiveStatus.NO_NEED):
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def tf_broadcast(self, params, src_rank):
        for p in params:
            data = p.numpy()
            status, data = self.broadcast(data, src_rank)
            if status == CollectiveCommunicatorStatus.SUCCEEDED:
                p.assign(data)
            else:
                return status
        return CollectiveCommunicatorStatus.SUCCEEDED

    def broadcast(self, data, src_rank):
        if self._ftlib is not None:
            status, _ = self._ftlib.broadcast(data, src_rank)
            if status == FTCollectiveStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        if self._ftlib is not None:
            status, _ = self._ftlib.barrier()
            if status == FTCollectiveStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED
            else:
                return CollectiveCommunicatorStatus.FAILED
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED

    def is_initialized(self):
        """This will be `False` under three occasions:
           * New workers report joining in
           * Collective-communication operations fail or time out
           * Liveness probe fails for existing workers
        """
        if self._ftlib is not None:
            return self._ftlib.initialized
        else:
            return True

    def _get_peer_set(self, svc_name):
        if svc_name is None:
            return None
        my_ip = socket.gethostbyname(socket.gethostname())
        temp_set = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
        peer_set = {peer[-1][0] for peer in temp_set if peer[-1][0] != my_ip}
        return peer_set
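A hypothetical sketch of driving the tf_allreduce and tf_broadcast wrappers of the class above from TensorFlow 2.x eager mode; the tensor values, the variable shapes, and the "worker-svc" service name are assumptions for illustration, and running it requires FTLib plus a resolvable peer service.

import tensorflow as tf

# the service name must resolve to the peer worker IPs (e.g. a headless service)
comm = CollectiveCommunicator(service_name="worker-svc")

# average per-worker gradients; eager tensors expose .numpy() for the conversion
grads = [tf.constant([0.5, 1.5]), tf.constant([2.0])]
status, averaged = comm.tf_allreduce(grads, op="MEAN")

# broadcast trainable variables from rank 0 so every worker starts identical
params = [tf.Variable([0.0, 0.0]), tf.Variable([0.0])]
status = comm.tf_broadcast(params, src_rank=0)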
Example #7
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


if __name__ == "__main__":
    logging.info("start!")

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(consensus="shared_storage", commlib="pytorch")

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            time.sleep(0.5)
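The fragment above stops right after the loss is computed. As a sketch only, the step could continue with the skip_allreduce / wait_gradients_ready pattern used in Examples #11 and #12; note that the exact wait_gradients_ready signature varies across the FTLib versions shown in this listing (no arguments in Example #11, explicit parameters in Example #13), so the no-argument form below is an assumption rather than the original continuation.

            loss.backward()

            # while membership is still changing, skip the collective step
            if ftlib.skip_allreduce:
                logging.info("skip allreduce")
                optimizer.step()
                continue

            # otherwise average gradients across workers before applying them
            res = ftlib.wait_gradients_ready()
            if res == FTAllReduceStatus.SUCCESS:
                optimizer.step()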
Example #8
if __name__ == "__main__":

    args = parser.parse_args()

    logging.info("start!")
    logging.info("joining: {}".format(args.svc_name))

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(
        consensus="gossip",
        commlib="pytorch",
        consensus_init_kwargs={
            "known_addr_list": list(get_peer_set(args.svc_name))
        },
    )

    a_ground_truth = np.double(1.2)
    b_ground_truth = np.double(-3.7)
    c_ground_truth = np.double(4.9)

    target_func = (
        lambda x: a_ground_truth * x * x + b_ground_truth * x + c_ground_truth)

    train_loader = torch.utils.data.DataLoader(
        SyntheticData(
            lambda x: target_func(x) + 10.0 *
            (np.double(np.random.rand()) - 0.5),
Example #9
    args = parser.parse_args()
    known_addr_list = (
        args.known_nodes.split(",") if args.known_nodes != "" else []
    )

    logging.info("start!")
    logging.info("joining: {}".format(known_addr_list))

    epochs = 1

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(
        consensus="gossip",
        commlib="pytorch",
        consensus_init_kwargs={"known_addr_list": known_addr_list},
    )

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # move data to device (CPU or GPU)
            data, target = data.to(device), target.to(device)
            # clear gradients
Example #10
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1

    # initialize the fault-tolerant library with consensus and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='pytorch')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
Example #11

def dummy_update():
    logging.info("dummy update")
    time.sleep(0.5)


if __name__ == "__main__":
    logging.info("start!")

    epochs = 1
    dl = dummy_dataloader(10)

    # initialize the fault-tolerant library with consensus
    # and framework options
    ftlib = BasicFTLib(consensus="shared_storage", commlib="NCCL")

    for _ in range(epochs):
        for batch in dl:
            dummy_forward()
            dummy_backward()

            if ftlib.skip_allreduce:
                logging.info("skip allreduce")
                dummy_update()
                continue
            else:
                res = ftlib.wait_gradients_ready()
            if res == FTAllReduceStatus.NO_NEED:
                logging.critical(
                    "cannot use average_gradient when there is no need")
Example #12
    time.sleep(5)


def dummy_update():
    logging.info("dummy update")
    time.sleep(0.5)


if __name__ == '__main__':
    logging.info("start!")

    epochs = 1
    dl = dummy_dataloader(10)

    # initialize the fault-tolerant library with consensus and framework options
    ftlib = BasicFTLib()
    ftlib.init(consensus='shared_storage', framework='dummy_NCCL')

    for _ in range(epochs):
        for batch in dl:
            dummy_forward()
            dummy_backward()

            if ftlib.skip_allreduce:
                logging.info("skip allreduce")
                dummy_update()
                continue
            else:
                res = ftlib.wait_weights_ready()
            if res == FTAllReduceStatus.NO_NEED:
                logging.critical(
Example #13
class CollectiveCommunicator(object):
    def __init__(self, service_name=None):
        if _FTLIB_INSTALLED:
            self._ftlib = BasicFTLib(
                consensus="gossip",
                commlib="pytorch",
                consensus_init_kwargs={
                    "known_addr_list": list(self._get_peer_set(service_name)),
                    "custom_bind_addr":
                    socket.gethostbyname(socket.gethostname()),
                },
            )
            while not self._ftlib.consensus_joined():
                logger.warning("Retry building consensus...")
                self._ftlib.manual_join(
                    known_addr_list=list(self._get_peer_set(service_name)))
        else:
            logger.warning(
                "FTLib is not installed. The CollectiveCommunicator "
                "may not work as expected")
            self._ftlib = None

    def allreduce(self, data, op="MEAN"):
        if data is None:
            logger.error("Data is required for allreduce operation")
            return CollectiveCommunicatorStatus.FAILED, data
        if op not in _SUPPORTED_ALLREDUCE_OPS:
            logger.error(
                "%s is not in list of supported allreduce operations: %s" %
                (op, _SUPPORTED_ALLREDUCE_OPS))
            return CollectiveCommunicatorStatus.FAILED, data
        if self._ftlib is not None:
            res = self._ftlib.wait_gradients_ready(data)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def broadcast(self, data, src_rank):
        if self._ftlib is not None:
            res = self._ftlib.broadcast(data, src_rank)
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED, data
            else:
                return CollectiveCommunicatorStatus.FAILED, data
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED, data

    def barrier(self):
        if self._ftlib is not None:
            res = self._ftlib.barrier()
            if res == FTAllReduceStatus.SUCCESS:
                return CollectiveCommunicatorStatus.SUCCEEDED
            else:
                return CollectiveCommunicatorStatus.FAILED
        else:
            logger.warning(_FTLIB_UNINSTALLED_DEFAULT_STATUS_MESSAGE)
            return CollectiveCommunicatorStatus.SUCCEEDED

    def is_initialized(self):
        """This will be `False` under three occasions:
           * New workers report joining in
           * Collective-communication operations fail or time out
           * Liveness probe fails for existing workers
        """
        if self._ftlib is not None:
            return self._ftlib.initialized
        else:
            return True

    def _get_peer_set(self, svc_name):
        if svc_name is None:
            return None
        my_ip = socket.gethostbyname(socket.gethostname())
        temp_set = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
        peer_set = {peer[-1][0] for peer in temp_set if peer[-1][0] != my_ip}
        return peer_set
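For reference, the peer-discovery logic from _get_peer_set above as a standalone sketch with comments explaining the getaddrinfo idiom; the "worker-svc" name is a placeholder (for example, a Kubernetes headless service whose DNS record lists every worker pod IP) and is not taken from the original code.

import socket

def get_peer_set(svc_name):
    """Return every IP that svc_name resolves to, excluding this host."""
    if svc_name is None:
        return None
    my_ip = socket.gethostbyname(socket.gethostname())
    # getaddrinfo yields (family, type, proto, canonname, sockaddr) tuples;
    # sockaddr[0] is the peer's IP address
    addr_info = socket.getaddrinfo(svc_name, 0, proto=socket.IPPROTO_TCP)
    return {info[-1][0] for info in addr_info if info[-1][0] != my_ip}

# e.g. peers = get_peer_set("worker-svc")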