def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             config=None,
             batch_size=16):
    """Initializes the runner.

    Args:
        model_creator (dict -> torch.nn.Module): see pytorch_trainer.py.
        data_creator (dict -> Dataset, Dataset): see pytorch_trainer.py.
        optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
            see pytorch_trainer.py.
        config (dict): see pytorch_trainer.py.
        batch_size (int): see pytorch_trainer.py.
    """
    self.model_creator = model_creator
    self.data_creator = data_creator
    self.optimizer_creator = optimizer_creator
    # Never share a mutable default; fall back to a fresh dict.
    self.config = config if config is not None else {}
    self.batch_size = batch_size
    self.verbose = True
    self.epoch = 0
    # One timer per tracked phase; window_size=1 keeps only the latest sample.
    timed_phases = ("setup_proc", "setup_model", "get_state", "set_state",
                    "validation", "training")
    self._timers = {
        phase: utils.TimerStat(window_size=1)
        for phase in timed_phases
    }
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             train_function=None,
             validation_function=None,
             config=None,
             dataloader_config=None,
             batch_size=16):
    """Initializes the runner.

    Args:
        model_creator (dict -> torch.nn.Module): see pytorch_trainer.py
        data_creator (int, dict -> Dataset, Dataset): see
            pytorch_trainer.py.
        optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
            see pytorch_trainer.py.
        loss_creator (dict -> loss | Loss class): see pytorch_trainer.py.
        train_function: see pytorch_trainer.py
        validation_function: see pytorch_trainer.py
        config (dict): see pytorch_trainer.py.
        dataloader_config (dict): See pytorch_trainer.py.
        batch_size (int): see pytorch_trainer.py.
    """
    self.model_creator = model_creator
    self.data_creator = data_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.config = config if config is not None else {}
    # Default loader settings: background workers + pinned host memory.
    if dataloader_config is None:
        dataloader_config = {"num_workers": 2, "pin_memory": True}
    self.dataloader_config = dataloader_config
    # Fall back to the stock train/validate loops when no override is given.
    self.train_function = train_function or pytorch_utils.train
    self.validation_function = (validation_function
                                or pytorch_utils.validate)
    self.batch_size = batch_size
    self.verbose = True
    self.epoch = 0
    # One timer per tracked phase; window_size=1 keeps only the latest sample.
    timed_phases = ("setup_proc", "setup_model", "get_state", "set_state",
                    "validation", "training")
    self._timers = {
        phase: utils.TimerStat(window_size=1)
        for phase in timed_phases
    }
    # Populated later during setup().
    self.models = None
    self.optimizers = None
    self.criterion = None
    self.train_loader = None
    self.validation_loader = None
def train(train_iterator, model, criterion, optimizer):
    """Runs 1 training epoch"""
    batch_timer = utils.AverageMeter()
    load_timer = utils.AverageMeter()
    loss_meter = utils.AverageMeter()
    phase_timers = {
        phase: utils.TimerStat()
        for phase in ("d2h", "fwd", "grad", "apply")
    }

    # switch to train mode (enables dropout/batchnorm training behavior)
    model.train()

    last_tick = time.time()
    for features, target in train_iterator:
        # measure data loading time
        load_timer.update(time.time() - last_tick)

        # Create non_blocking tensors for distributed training
        with phase_timers["d2h"]:
            if torch.cuda.is_available():
                features = features.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

        # compute output
        with phase_timers["fwd"]:
            output = model(features)
            loss = criterion(output, target)

        # measure accuracy and record loss
        loss_meter.update(loss.item(), features.size(0))

        # compute gradients in a backward pass
        with phase_timers["grad"]:
            optimizer.zero_grad()
            loss.backward()

        # Call step of optimizer to update model params
        with phase_timers["apply"]:
            optimizer.step()

        # measure elapsed time
        batch_timer.update(time.time() - last_tick)
        last_tick = time.time()

    stats = {
        "batch_time": batch_timer.avg,
        "batch_processed": loss_meter.count,
        "train_loss": loss_meter.avg,
        "data_time": load_timer.avg,
    }
    for phase, timer in phase_timers.items():
        stats[phase] = timer.mean
    return stats
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             scheduler_creator=None,
             train_function=None,
             validation_function=None,
             config=None,
             dataloader_config=None,
             batch_size=16,
             use_fp16=False,
             apex_args=None,
             scheduler_step_freq="batch"):
    """Initializes the runner; creators are invoked later during setup."""
    self.model_creator = model_creator
    self.data_creator = data_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.scheduler_creator = scheduler_creator
    self.config = config if config is not None else {}
    # Default loader settings: two background workers.
    if dataloader_config is None:
        dataloader_config = {"num_workers": 2}
    self.dataloader_config = dataloader_config
    # Fall back to the stock train/validate loops when no override is given.
    self.train_function = train_function or pytorch_utils.train
    self.validation_function = (validation_function
                                or pytorch_utils.validate)
    self.batch_size = batch_size
    self.verbose = True
    self.epoch = 0
    # One timer per tracked phase; window_size=1 keeps only the latest sample.
    timed_phases = ("setup_proc", "setup_model", "get_state", "set_state",
                    "validation", "training")
    self._timers = {
        phase: utils.TimerStat(window_size=1)
        for phase in timed_phases
    }
    # Populated later during setup().
    self.models = None
    self.optimizers = None
    self.criterion = None
    self.schedulers = None
    self.train_loader = None
    self.validation_loader = None
    self.use_fp16 = use_fp16
    self.apex_args = apex_args or {}
    # fp16 requires NVIDIA apex's amp module to be importable.
    if use_fp16 and not amp:
        raise ImportError(
            "Please install apex from "
            "https://www.github.com/nvidia/apex to use fp16 training.")
    self.scheduler_step_freq = scheduler_step_freq
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             train_function=None,
             validation_function=None,
             initialization_hook=None,
             config=None,
             dataloader_config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto"):
    """Sets up the trainer and immediately starts the worker pool."""
    # TODO: add support for mixed precision
    # torch.distributed is unavailable on macOS; only single-replica works.
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))
    self.model_creator = model_creator
    self.data_creator = data_creator
    self.train_function = train_function
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.validation_function = validation_function
    self.initialization_hook = initialization_hook
    self.config = config if config is not None else {}
    self.dataloader_config = dataloader_config
    self.optimizer_timer = utils.TimerStat(window_size=1)
    # Pick the fastest sensible default backend for the hardware.
    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"
    logger.info("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu
    self.batch_size = batch_size
    self.max_replicas = num_replicas
    # Scratch directory used for checkpoint/recovery state.
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")
    self._start_workers(self.max_replicas)
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             scheduler_creator=None,
             train_function=None,
             validation_function=None,
             initialization_hook=None,
             config=None,
             dataloader_config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto",
             use_fp16=False,
             apex_args=None,
             scheduler_step_freq="batch"):
    """Sets up the trainer, validates options, and starts the workers."""
    # torch.distributed is unavailable on macOS; only single-replica works.
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))
    self.model_creator = model_creator
    self.data_creator = data_creator
    self.train_function = train_function
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.scheduler_creator = scheduler_creator
    self.validation_function = validation_function
    self.initialization_hook = initialization_hook
    self.config = config if config is not None else {}
    self.dataloader_config = dataloader_config
    self.optimizer_timer = utils.TimerStat(window_size=1)
    # Pick the fastest sensible default backend for the hardware.
    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"
    logger.info("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu
    self.batch_size = batch_size
    self.max_replicas = num_replicas
    self.use_fp16 = use_fp16
    # apex_args is forwarded as keyword arguments, so it must be a dict.
    if apex_args and not isinstance(apex_args, dict):
        raise ValueError("apex_args needs to be a dict object.")
    self.apex_args = apex_args
    # Scratch directory used for checkpoint/recovery state.
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")
    # Reject scheduler step frequencies the runners do not understand.
    if scheduler_step_freq and (
            scheduler_step_freq not in pytorch_utils.VALID_SCHEDULER_STEP):
        raise ValueError(
            "Scheduler step freq must be in {}. Got {}".format(
                pytorch_utils.VALID_SCHEDULER_STEP, scheduler_step_freq))
    self.scheduler_step_freq = scheduler_step_freq
    self._start_workers(self.max_replicas)
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             train_function=None,
             validation_function=None,
             initialization_hook=None,
             config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto"):
    """Sets up the PyTorch trainer.

    Args:
        model_creator (dict -> torch.nn.Module): creates the model
            using the config.
        data_creator (int, dict -> DataLoader, DataLoader): Function that
            takes in (batch_size, config) and returns two Torch DataLoader
            objects.
        optimizer_creator (torch.nn.Module, dict -> optimizer):
            creates the loss and optimizer using the model and the config.
        loss_creator (dict -> loss): Creates the loss function/criterion
            using the config.
        train_function: Trains a model for a epoch. This takes in (
            model, train_dataloader, criterion, optimizer, config), and
            returns a dict of training stats.
        validation_function: Runs validation. This takes in (
            model, val_dataloader, criterion, config) and returns a dict of
            validation stats.
        initialization_hook (function): A function to call on all training
            workers when they are first initialized.
        config (dict): configuration passed to "model_creator",
            "data_creator", "optimizer_creator", and "loss_creator".
        num_replicas (int): the number of workers used in distributed
            training.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU
            if true.
        batch_size (int): batch size for an update.
        backend (string): backend used by distributed PyTorch.
    """
    # TODO: add support for mixed precision
    # TODO: add support for callbacks
    # torch.distributed is unavailable on macOS; only single-replica works.
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    self.model_creator = model_creator
    self.train_function = train_function
    self.validation_function = validation_function
    self.config = {} if config is None else config
    self.optimizer_timer = utils.TimerStat(window_size=1)

    # Pick the fastest sensible default backend for the hardware.
    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.info("Using {} as backend.".format(backend))

    if num_replicas == 1:
        # Generate actor class
        Runner = ray.remote(
            num_cpus=1, num_gpus=int(use_gpu))(PyTorchRunner)
        # Start workers
        self.workers = [
            Runner.remote(
                model_creator,
                data_creator,
                optimizer_creator,
                loss_creator,
                train_function=train_function,
                validation_function=validation_function,
                config=self.config,
                batch_size=batch_size)
        ]
        if initialization_hook:
            self.apply_all_workers(initialization_hook)
        # Get setup tasks in order to throw errors on failure
        ray.get(self.workers[0].setup.remote())
    else:
        # Generate actor class
        Runner = ray.remote(
            num_cpus=1, num_gpus=int(use_gpu))(DistributedPyTorchRunner)
        # Compute batch size per replica
        batch_size_per_replica = batch_size // num_replicas
        if batch_size % num_replicas > 0:
            new_batch_size = batch_size_per_replica * num_replicas
            logger.warning(
                ("Changing batch size from {old_batch_size} to "
                 "{new_batch_size} to evenly distribute batches across "
                 "{num_replicas} replicas.").format(
                     old_batch_size=batch_size,
                     new_batch_size=new_batch_size,
                     num_replicas=num_replicas))
        # Start workers
        self.workers = [
            Runner.remote(
                model_creator,
                data_creator,
                optimizer_creator,
                loss_creator,
                backend=backend,
                train_function=train_function,
                validation_function=validation_function,
                config=self.config,
                batch_size=batch_size_per_replica)
            for i in range(num_replicas)
        ]
        if initialization_hook:
            self.apply_all_workers(initialization_hook)

        # Compute URL for initializing distributed PyTorch
        ip = ray.get(self.workers[0].get_node_ip.remote())
        port = ray.get(self.workers[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        # Get setup tasks in order to throw errors on failure
        ray.get([
            worker.setup.remote(address, i, len(self.workers))
            for i, worker in enumerate(self.workers)
        ])
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator=pytorch_utils.sgd_mse_optimizer,
             config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto"):
    """Sets up the PyTorch trainer.

    Args:
        model_creator (dict -> torch.nn.Module): creates the model
            using the config.
        data_creator (dict -> Dataset, Dataset): creates the training
            and validation data sets using the config.
        optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
            creates the loss and optimizer using the model and the config.
        config (dict): configuration passed to 'model_creator',
            'data_creator', and 'optimizer_creator'.
        num_replicas (int): the number of workers used in distributed
            training.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU
            if true.
        batch_size (int): batch size for an update.
        backend (string): backend used by distributed PyTorch.
    """
    # TODO: add support for mixed precision
    # TODO: add support for callbacks
    # torch.distributed is unavailable on macOS; only single-replica works.
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    self.model_creator = model_creator
    self.config = {} if config is None else config
    self.optimizer_timer = utils.TimerStat(window_size=1)

    # Pick the fastest sensible default backend for the hardware.
    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.info("Using {} as backend.".format(backend))

    if num_replicas == 1:
        # Generate actor class
        Runner = ray.remote(
            num_cpus=1, num_gpus=int(use_gpu))(PyTorchRunner)
        # Start workers
        self.workers = [
            Runner.remote(model_creator, data_creator, optimizer_creator,
                          self.config, batch_size)
        ]
        # Get setup tasks in order to throw errors on failure
        ray.get(self.workers[0].setup.remote())
    else:
        # Generate actor class
        Runner = ray.remote(
            num_cpus=1, num_gpus=int(use_gpu))(DistributedPyTorchRunner)
        # Compute batch size per replica
        batch_size_per_replica = batch_size // num_replicas
        if batch_size % num_replicas > 0:
            new_batch_size = batch_size_per_replica * num_replicas
            logger.warning(
                ("Changing batch size from {old_batch_size} to "
                 "{new_batch_size} to evenly distribute batches across "
                 "{num_replicas} replicas.").format(
                     old_batch_size=batch_size,
                     new_batch_size=new_batch_size,
                     num_replicas=num_replicas))
        # Start workers
        self.workers = [
            Runner.remote(model_creator, data_creator, optimizer_creator,
                          self.config, batch_size_per_replica, backend)
            for i in range(num_replicas)
        ]
        # Compute URL for initializing distributed PyTorch
        ip = ray.get(self.workers[0].get_node_ip.remote())
        port = ray.get(self.workers[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        # Get setup tasks in order to throw errors on failure
        ray.get([
            worker.setup.remote(address, i, len(self.workers))
            for i, worker in enumerate(self.workers)
        ])
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             train_function=None,
             validation_function=None,
             initialization_hook=None,
             config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto"):
    """Sets up the PyTorch trainer.

    Args:
        model_creator (dict -> torch.nn.Module): creates the model
            using the config.
        data_creator (int, dict -> DataLoader, DataLoader): Function that
            takes in (batch_size, config) and returns two Torch DataLoader
            objects.
        optimizer_creator (torch.nn.Module, dict -> optimizer):
            creates the loss and optimizer using the model and the config.
        loss_creator (dict -> loss): Creates the loss function/criterion
            using the config.
        train_function: Trains a model for a epoch. This takes in (
            model, train_dataloader, criterion, optimizer, config), and
            returns a dict of training stats.
        validation_function: Runs validation. This takes in (
            model, val_dataloader, criterion, config) and returns a dict of
            validation stats.
        initialization_hook (function): A function to call on all training
            workers when they are first initialized.
        config (dict): configuration passed to "model_creator",
            "data_creator", "optimizer_creator", and "loss_creator".
        num_replicas (int): the number of workers used in distributed
            training.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU
            if true.
        batch_size (int): batch size for an update.
        backend (string): backend used by distributed PyTorch.
    """
    # TODO: add support for mixed precision
    # TODO: add support for callbacks
    # torch.distributed is unavailable on macOS; only single-replica works.
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    self.model_creator = model_creator
    self.data_creator = data_creator
    self.train_function = train_function
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.validation_function = validation_function
    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    self.optimizer_timer = utils.TimerStat(window_size=1)

    # Pick the fastest sensible default backend for the hardware.
    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.info("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu
    self.batch_size = batch_size
    self.max_replicas = num_replicas
    # Scratch directory used for checkpoint/recovery state.
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")
    self._start_workers(self.max_replicas)