def _evaluate(self, dataloader):
    """Run the fitted model over ``dataloader`` and return averaged metrics.

    The criterion is resolved from ``self._loss``, which may be a ``TLoss``
    subclass (instantiated with no arguments), a ``TLoss`` instance (used
    as is), or any other callable (invoked with an empty dict and expected
    to return the criterion).

    Args:
        dataloader: Iterable of batches. Each batch is unpacked as
            ``(*features, target)`` and the features are passed
            positionally to the model.

    Returns:
        The summary of an ``AverageMeterCollection`` that received
        ``"val_loss"`` and ``"num_samples"`` for every batch.

    Raises:
        Exception: If ``self._trainer`` is None ("Must call fit first").
        TypeError: If ``self._loss`` is none of the supported kinds.
    """
    if self._trainer is None:
        raise Exception("Must call fit first")

    if inspect.isclass(self._loss) and issubclass(self._loss, TLoss):
        # it is the loss class
        criterion = self._loss()
    elif isinstance(self._loss, TLoss):
        # it is the loss instance
        criterion = self._loss
    elif callable(self._loss):
        # it is the loss create function
        criterion = self._loss({})
    else:
        # Without this branch ``criterion`` would be unbound and the first
        # batch would fail with an opaque UnboundLocalError.
        raise TypeError(
            f"Unsupported loss {self._loss!r}: expected a TLoss subclass, "
            "a TLoss instance, or a callable that creates the criterion")

    model = self.get_model()
    model.eval()

    metric_meters = AverageMeterCollection()
    with torch.no_grad():
        for batch in dataloader:
            # unpack features into list to support multiple inputs model
            *features, target = batch
            output = model(*features)
            loss = criterion(output, target)
            num_samples = target.size(0)
            metrics = {"val_loss": loss.item(), "num_samples": num_samples}
            # NOTE(review): no per-batch weight is passed here, so every
            # batch presumably counts equally in the average regardless of
            # its size -- confirm against AverageMeterCollection.update.
            metric_meters.update(metrics)
    return metric_meters.summary()
def validate(self, val_iterator, info):
    """Run a single validation pass over ``val_iterator``.

    The model is put into evaluation mode and every batch is processed
    under ``torch.no_grad()``. Each batch is delegated to
    ``self.validate_batch`` together with a per-batch info dict holding
    ``"batch_idx"`` plus every entry of ``info``.

    Subclasses overriding this method can reach the model through
    ``self.model`` and are free not to call ``validate_batch`` at all.

    Args:
        val_iterator (iter): Iterable built from the validation dataloader.
        info (dict): Extra information merged into each batch's info dict.

    Returns:
        dict: The summary of the meter collection. Each batch's metrics
        are weighted by the ``NUM_SAMPLES`` entry returned by
        ``validate_batch`` (weight 1 when that entry is absent).
    """
    meters = AverageMeterCollection()

    # switch to evaluate mode
    self.model.eval()

    with torch.no_grad():
        for index, batch in enumerate(val_iterator):
            per_batch_info = {"batch_idx": index}
            per_batch_info.update(info)
            batch_metrics = self.validate_batch(batch, per_batch_info)
            # The sample count is a weight, not a metric: remove it from
            # the dict before the remaining values are recorded.
            weight = batch_metrics.pop(NUM_SAMPLES, 1)
            meters.update(batch_metrics, n=weight)

    return meters.summary()
def train_epoch(self, iterator, info):
    """Train the model for one epoch over sampled graph mini-batches.

    For every ``(input_nodes, seeds, blocks)`` triple the blocks are
    optionally moved to the GPU, the model is run on the source features
    of the first block, and an NLL loss against the destination labels of
    the last block is back-propagated. Every 20 steps a progress line with
    loss, accuracy, throughput and peak GPU memory is printed.

    Args:
        iterator (iter): Yields ``(input_nodes, seeds, blocks)`` triples.
        info (dict): Must contain ``"epoch_idx"``, used in the progress
            line.

    Returns:
        dict: Summary of the meter collection, which tracks
        ``"train_loss"`` weighted by the number of seeds in each batch.
    """
    meter_collection = AverageMeterCollection()
    iter_tput = []
    model = self.model
    optimizer = self.optimizer
    device = 0

    # An earlier evaluation pass may have left the model in eval mode;
    # make sure layers such as dropout behave as in training.
    model.train()

    for step, (input_nodes, seeds, blocks) in enumerate(iterator):
        tic_step = time.time()

        if self.use_gpu:
            blocks = [block.int().to(device) for block in blocks]
        batch_inputs = blocks[0].srcdata["features"]
        batch_labels = blocks[-1].dstdata["labels"]

        batch_pred = model(blocks, batch_inputs)
        loss = F.nll_loss(batch_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        num_seeds = len(seeds)
        iter_tput.append(num_seeds / (time.time() - tic_step))
        # Record the loss so the returned summary actually carries the
        # epoch's training metrics instead of being empty.
        meter_collection.update({"train_loss": loss.item()}, n=num_seeds)

        if step % 20 == 0:
            acc = compute_acc(batch_pred, batch_labels)
            gpu_mem_alloc = (torch.cuda.max_memory_allocated() / 1000000
                             if torch.cuda.is_available() else 0)
            # The first three measurements are left out of the average
            # (presumably as warm-up). At step 0 that slice is empty and
            # np.mean would yield nan with a RuntimeWarning, so fall back
            # to all measurements collected so far.
            measured = iter_tput[3:] or iter_tput
            print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | "
                  "Train Acc {:.4f} | Speed (samples/sec) {:.4f} | GPU "
                  "{:.1f} MB".format(info["epoch_idx"] + 1, step,
                                     loss.item(), acc.item(),
                                     np.mean(measured), gpu_mem_alloc))

    status = meter_collection.summary()
    return status
def evaluate(self, df):
    """Evaluate the fitted model on ``df`` and return averaged metrics.

    The dataframe is converted with ``toPandas()``, wrapped in a
    ``PandasDataset`` using the estimator's feature/label settings, and
    served through a ``torch.utils.data.DataLoader``. The criterion is
    resolved from ``self._loss``, which may be a ``TLoss`` subclass
    (instantiated with no arguments), a ``TLoss`` instance (used as is),
    or any other callable (invoked with an empty dict).

    Args:
        df: Dataframe exposing ``toPandas()``.

    Returns:
        The summary of an ``AverageMeterCollection`` that received
        ``"val_loss"``, ``"val_accuracy"`` and ``"num_samples"`` for every
        batch. Accuracy compares the arg-max over dimension 1 of the model
        output with the target.

    Raises:
        Exception: If ``self._trainer`` is None ("Must call fit first").
        TypeError: If ``self._loss`` is none of the supported kinds.
    """
    super(TorchEstimator, self).evaluate(df)
    if self._trainer is None:
        raise Exception("Must call fit first")

    pdf = df.toPandas()
    dataset = PandasDataset(pdf, self._feature_columns,
                            self._feature_shapes, self._feature_types,
                            self._label_column, self._label_type)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             self._batch_size,
                                             shuffle=self._shuffle)

    if inspect.isclass(self._loss) and issubclass(self._loss, TLoss):
        # it is the loss class
        criterion = self._loss()
    elif isinstance(self._loss, TLoss):
        # it is the loss instance
        criterion = self._loss
    elif callable(self._loss):
        # it is the loss create function
        criterion = self._loss({})
    else:
        # Without this branch ``criterion`` would be unbound and the first
        # batch would fail with an opaque UnboundLocalError.
        raise TypeError(
            f"Unsupported loss {self._loss!r}: expected a TLoss subclass, "
            "a TLoss instance, or a callable that creates the criterion")

    model = self.get_model()
    model.eval()

    metric_meters = AverageMeterCollection()
    with torch.no_grad():
        for batch in dataloader:
            # unpack features into list to support multiple inputs model
            *features, target = batch
            output = model(*features)
            loss = criterion(output, target)
            _, predicted = torch.max(output.data, 1)
            num_correct = (predicted == target).sum().item()
            num_samples = target.size(0)
            metrics = {
                "val_loss": loss.item(),
                "val_accuracy": num_correct / num_samples,
                "num_samples": num_samples,
            }
            # NOTE(review): no per-batch weight is passed here, so every
            # batch presumably counts equally in the average regardless of
            # its size -- confirm against AverageMeterCollection.update.
            metric_meters.update(metrics)
    return metric_meters.summary()
def validate(self, val_iterator, info):
    """Run one validation epoch, driving the model's validation hooks.

    Gradients are disabled globally for the duration of the pass and are
    re-enabled afterwards, including when validation raises. Each batch is
    delegated to ``self.validate_batch``; ``None`` results are dropped.

    If the model overrides ``validation_epoch_end``, that hook receives
    the list of per-batch ``"raw_output"`` values and its non-None result
    becomes the return value. Otherwise the per-batch outputs are reduced
    with an ``AverageMeterCollection``.

    Args:
        val_iterator (iter): Iterable over validation batches.
        info (dict): Extra information merged into each batch's info dict
            next to ``"batch_idx"``.

    Returns:
        dict: ``{"val_loss": tensor}`` when the hook returns a Tensor, the
        hook's dict when it returns a dict, or the meter summary when the
        hook is not overridden or returns None.

    Raises:
        ValueError: If ``validation_epoch_end`` returns a ``Result``.
        TypeError: If ``validation_epoch_end`` returns any other
            unsupported type.
    """
    self.model.zero_grad()
    self.model.eval()

    torch.set_grad_enabled(False)
    try:
        model = self.get_model()
        if self.is_function_implemented("on_validation_epoch_start", model):
            model.on_validation_epoch_start()

        val_outputs = []
        for batch_idx, batch in enumerate(val_iterator):
            batch_info = {"batch_idx": batch_idx}
            batch_info.update(info)
            batch_output = self.validate_batch(batch, batch_info)
            if batch_output is not None:
                val_outputs.append(batch_output)

        processed_outputs = None
        if self.is_overridden("validation_epoch_end", model):
            raw_outputs = [vo["raw_output"] for vo in val_outputs]
            # Call the validation hook that was just checked for, not the
            # training one.
            processed_outputs = model.validation_epoch_end(raw_outputs)

        if processed_outputs is not None:
            if isinstance(processed_outputs, torch.Tensor):
                return_output = {"val_loss": processed_outputs}
            elif isinstance(processed_outputs, Result):
                raise ValueError("Result objects are not supported. Please "
                                 "return a dictionary instead.")
            elif isinstance(processed_outputs, dict):
                return_output = processed_outputs
            else:
                raise TypeError("validation_epoch_end returned an invalid "
                                "type. It must return a Tensor, Result, "
                                "or dict.")
        else:
            # validation_epoch_end is not overridden or returned None.
            # Use AverageMeterCollection util to reduce results.
            meter_collection = AverageMeterCollection()
            for v in val_outputs:
                num_samples = v.pop(NUM_SAMPLES, 1)
                raw_output = v["raw_output"]
                if isinstance(raw_output, dict):
                    meter_collection.update(raw_output, num_samples)
                elif isinstance(raw_output, torch.Tensor):
                    meter_collection.update({
                        "val_loss": raw_output.item()
                    }, num_samples)
            return_output = meter_collection.summary()

        if self.is_function_implemented("on_validation_epoch_end", model):
            model.on_validation_epoch_end()

        return return_output
    finally:
        # Set back to True so training will work, even when a hook or a
        # batch raised during validation.
        torch.set_grad_enabled(True)
def validate(self, val_iterator, info=None):
    """Run a single validation pass over ``val_iterator``.

    The model is switched to evaluation mode and all batches are processed
    under ``torch.no_grad()``. Every batch is handed to
    ``self.validate_batch`` along with an info dict that holds
    ``"batch_idx"`` plus every entry of ``info``. When overriding this
    method there is no need to call ``validate_batch``.

    Args:
        val_iterator (iter): Iterable built from the validation
            dataloader.
        info (Optional[dict]): Extra information for custom validation
            operations. A falsy value is treated as an empty dict.

    Returns:
        dict: The summary of the meter collection. Each batch's metrics
        are weighted by the ``NUM_SAMPLES`` entry returned by
        ``validate_batch`` (weight 1 when that entry is absent).

    Raises:
        RuntimeError: If ``self.model`` has not been set.
    """
    if not hasattr(self, "model"):
        raise RuntimeError("Either set self.model in setup function or "
                           "override this method to implement a custom "
                           "validation loop.")

    extra_info = info if info else {}
    network = self.model
    meters = AverageMeterCollection()

    # switch to evaluate mode
    network.eval()

    with torch.no_grad():
        for index, batch in enumerate(val_iterator):
            per_batch_info = {"batch_idx": index}
            per_batch_info.update(extra_info)
            batch_metrics = self.validate_batch(batch, per_batch_info)
            # The sample count is a weight, not a metric: remove it from
            # the dict before the remaining values are recorded.
            weight = batch_metrics.pop(NUM_SAMPLES, 1)
            meters.update(batch_metrics, n=weight)

    return meters.summary()
def train_epoch(self, iterator, info):
    """Run one standard training pass over the training iterator.

    Every batch is handed to ``self.train_batch`` together with an info
    dict holding ``"batch_idx"``, ``"global_step"`` and every entry of
    ``info``. If the operator has a scheduler it is stepped after each
    batch or once after the epoch, depending on
    ``self.scheduler_step_freq``. When tqdm is enabled and this worker has
    world rank 0, a progress bar is shown.

    You do not need to call ``train_batch`` when overriding this method
    with a custom training routine. ``AverageMeterCollection`` is handy
    for such overrides:

    .. code-block:: python

        def train_epoch(self, ...):
            meter_collection = AverageMeterCollection()
            self.model.train()
            for batch in iterator:
                # do some processing
                metrics = {"metric_1": 1, "metric_2": 3}  # dict of metrics

                # This keeps track of all metrics across multiple batches
                meter_collection.update(metrics, n=len(batch))

            # Returns stats of the meters.
            stats = meter_collection.summary()
            return stats

    Args:
        iterator (iter): Iterator over the training data for the entire
            epoch. It is consumed entirely.
        info (dict): Information for custom training operations. When the
            progress bar is shown it is indexed with ``NUM_STEPS``.

    Returns:
        dict: The summary of the metrics collected during training.

    Raises:
        RuntimeError: If ``self.model`` has not been set.
    """
    if not hasattr(self, "model"):
        raise RuntimeError("Either set self.model in setup function or "
                           "override this method to implement a custom "
                           "training loop.")

    def _shows_progress():
        # Only the rank-0 worker draws the bar, and only if tqdm is on.
        return self.use_tqdm and self.world_rank == 0

    network = self.model
    lr_scheduler = self.scheduler if hasattr(self, "scheduler") else None

    if _shows_progress():
        label = ""
        if info is not None and "epoch_idx" in info:
            label = (f"{info['epoch_idx'] + 1}/{info['num_epochs']}e"
                     if "num_epochs" in info else
                     f"{info['epoch_idx'] + 1}e")

        # TODO: Implement len for Dataset?
        step_total = info[NUM_STEPS]
        if step_total is None and hasattr(iterator, "__len__"):
            step_total = len(iterator)

        _progress_bar = tqdm(total=step_total,
                             desc=label,
                             unit="batch",
                             leave=False)

    meters = AverageMeterCollection()

    network.train()
    for index, batch in enumerate(iterator):
        per_batch_info = {
            "batch_idx": index,
            "global_step": self.global_step,
        }
        per_batch_info.update(info)
        batch_metrics = self.train_batch(batch, batch_info=per_batch_info)

        if _shows_progress():
            _progress_bar.n = index + 1
            if "train_loss" in batch_metrics:
                shown = {"loss": batch_metrics["train_loss"]}
            else:
                shown = {}
            _progress_bar.set_postfix(shown)

        if lr_scheduler and self.scheduler_step_freq == SCHEDULER_STEP_BATCH:
            lr_scheduler.step()

        # The sample count is a weight, not a metric: remove it from the
        # dict before the remaining values are recorded.
        weight = batch_metrics.pop(NUM_SAMPLES, 1)
        meters.update(batch_metrics, n=weight)
        self.global_step += 1

    if lr_scheduler and self.scheduler_step_freq == SCHEDULER_STEP_EPOCH:
        lr_scheduler.step()

    return meters.summary()
def train_epoch(self, iterator, info):
    """Run one training epoch, driving the model's training hooks.

    Train mode and gradients are enabled, ``on_train_epoch_start`` is
    invoked if implemented, and every batch is handed to
    ``self.train_batch``. Schedulers whose ``"interval"`` is
    ``SCHEDULER_STEP_BATCH`` are stepped after each batch, those with
    ``SCHEDULER_STEP_EPOCH`` once at the end. The loop stops early when a
    batch output carries ``"signal" == -1``.

    If the model overrides ``training_epoch_end``, the hook receives the
    per-batch ``"raw_output"`` values and its non-None result becomes the
    return value; otherwise the batch outputs are reduced with an
    ``AverageMeterCollection``.

    Args:
        iterator (iter): Iterator over the training batches.
        info (dict): Extra information merged into each batch's info
            dict. When the progress bar is shown it is indexed with
            ``NUM_STEPS``.

    Returns:
        dict: ``{"train_loss": tensor}`` when the hook returns a Tensor,
        the hook's dict when it returns a dict, or the meter summary.

    Raises:
        ValueError: If ``training_epoch_end`` returns a ``Result``.
        TypeError: If ``training_epoch_end`` returns any other
            unsupported type.
    """

    def _shows_progress():
        # Only the rank-0 worker draws the bar, and only if tqdm is on.
        return self.use_tqdm and self.world_rank == 0

    def _step_schedulers(interval):
        # Step every scheduler configured for the given interval.
        for spec, sched in zip(self.scheduler_dicts, self.schedulers):
            if spec["interval"] == interval:
                sched.step()

    model = self.get_model()

    # Enable train mode.
    self.model.train()

    # Enable gradients.
    torch.set_grad_enabled(True)

    if self.is_function_implemented("on_train_epoch_start", model):
        model.on_train_epoch_start()

    if _shows_progress():
        label = ""
        if info is not None and "epoch_idx" in info:
            label = (f"{info['epoch_idx'] + 1}/{info['num_epochs']}e"
                     if "num_epochs" in info else
                     f"{info['epoch_idx'] + 1}e")

        # TODO: Implement len for Dataset?
        step_total = info[NUM_STEPS]
        if step_total is None and hasattr(iterator, "__len__"):
            step_total = len(iterator)

        _progress_bar = tqdm(total=step_total,
                             desc=label,
                             unit="batch",
                             leave=False)

    # Output for each batch.
    collected = []

    for index, batch in enumerate(iterator):
        per_batch_info = {
            "batch_idx": index,
            "global_step": self.global_step,
        }
        per_batch_info.update(info)
        step_output = self.train_batch(batch, batch_info=per_batch_info)
        # batch output for each optimizer.
        collected.append(step_output)

        stop_requested = step_output["signal"] == -1

        if _shows_progress():
            _progress_bar.n = index + 1
            if "training_loss" in step_output:
                shown = {"loss": step_output["training_loss"]}
            else:
                shown = {}
            _progress_bar.set_postfix(shown)

        _step_schedulers(SCHEDULER_STEP_BATCH)

        self.global_step += 1

        if stop_requested:
            break

    hook_result = None
    if is_overridden("training_epoch_end", model):
        hook_result = model.training_epoch_end(
            [entry["raw_output"] for entry in collected])

    if hook_result is None:
        # User did not override training_epoch_end
        assert isinstance(collected, list)
        # Use AverageMeterCollection util to reduce results.
        meters = AverageMeterCollection()
        for entry in collected:
            weight = entry.pop(NUM_SAMPLES, 1)
            raw = entry["raw_output"]
            if isinstance(raw, dict):
                meters.update(raw, weight)
            elif isinstance(raw, torch.Tensor):
                meters.update({"train_loss": entry["training_loss"]},
                              weight)
        return_output = meters.summary()
    elif isinstance(hook_result, torch.Tensor):
        return_output = {"train_loss": hook_result}
    elif isinstance(hook_result, Result):
        # Checked before the plain dict case on purpose: the order of
        # these type tests is part of the contract.
        raise ValueError("Result objects are not supported. Please "
                         "return a dictionary instead.")
    elif isinstance(hook_result, dict):
        return_output = hook_result
    else:
        raise TypeError("training_epoch_end returned an invalid "
                        "type. It must return a Tensor, Result, "
                        "or dict.")

    if self.is_function_implemented("on_train_epoch_end", model):
        model.on_train_epoch_end(
            [entry.get("raw_output") for entry in collected])

    _step_schedulers(SCHEDULER_STEP_EPOCH)

    return return_output
def train_epoch(self, iterator, info):
    """Run one standard training pass over the training iterator.

    Reporters are notified at the start of the epoch and after every
    batch. Each batch is handed to ``self.train_batch`` with an info dict
    holding ``"batch_idx"``, ``"global_step"`` and every entry of
    ``info``. If ``self.scheduler`` is set, it is stepped after each batch
    or once after the epoch, depending on the ``SCHEDULER_STEP`` entry of
    the info dict.

    You do not need to call ``train_batch`` when overriding this method
    with a custom training routine. ``AverageMeterCollection`` is handy
    for such overrides:

    .. code-block:: python

        def train_epoch(self, ...):
            meter_collection = AverageMeterCollection()
            self.model.train()
            for batch in iterator:
                # do some processing
                metrics = {"metric_1": 1, "metric_2": 3}  # dict of metrics

                # This keeps track of all metrics across multiple batches
                meter_collection.update(metrics, n=len(batch))

            # Returns stats of the meters.
            stats = meter_collection.summary()
            return stats

    Args:
        iterator (iter): Iterator over the training data for the entire
            epoch. It is consumed entirely.
        info (dict): Information for custom training operations.

    Returns:
        dict: The summary of the metrics collected during training.
    """
    for reporter in self.reporters:
        reporter.on_epoch_begin(info, self)

    meters = AverageMeterCollection()

    self.model.train()
    for index, batch in enumerate(iterator):
        per_batch_info = {
            "batch_idx": index,
            "global_step": self.global_step,
        }
        per_batch_info.update(info)
        batch_metrics = self.train_batch(batch, batch_info=per_batch_info)

        # Reporters see the metrics before the sample count is removed.
        for reporter in self.reporters:
            reporter.on_batch_end(per_batch_info, batch_metrics, self)

        if self.scheduler:
            step_mode = per_batch_info.get(SCHEDULER_STEP)
            if step_mode == SCHEDULER_STEP_BATCH:
                self.scheduler.step()

        # The sample count is a weight, not a metric: remove it from the
        # dict before the remaining values are recorded.
        weight = batch_metrics.pop(NUM_SAMPLES, 1)
        meters.update(batch_metrics, n=weight)
        self.global_step += 1

    if self.scheduler:
        if info.get(SCHEDULER_STEP) == SCHEDULER_STEP_EPOCH:
            self.scheduler.step()

    return meters.summary()
def validate(self, validation_loader, info):
    """Run layer-wise inference over the whole graph and report accuracy.

    For every layer in ``self.convs`` the layer output is computed for all
    nodes of ``self.g`` in mini-batches produced by a one-hop full
    neighbor sampler; the result of one layer is the input of the next.
    Intermediate layers have their output flattened from dimension 1,
    while the last layer is averaged over dimension 1 and passed through
    ``log_softmax``.

    ``validation_loader`` and ``info`` are not used; batches are drawn
    from ``self.g`` directly.

    Args:
        validation_loader: Unused.
        info: Unused.

    Returns:
        dict: Summary of a meter collection holding ``"val_acc"`` and
        ``"test_acc"``, weighted by the number of predicted rows.
    """
    meters = AverageMeterCollection()
    model = self.model

    layer_count = self.config["n_layers"]
    hidden_size = self.config["n_hidden"]
    head_count = self.config["n_heads"]
    batch_size = self.config["batch_size"]
    worker_count = self.config["sampling_num_workers"]

    graph = self.g
    train_nid = self.train_nid
    val_nid = self.val_nid
    test_nid = self.test_nid
    device = 0

    model.eval()
    with torch.no_grad():
        layer_input = graph.ndata["features"]
        for layer_index, layer in enumerate(self.convs):
            is_last = layer_index == len(self.convs) - 1

            # Width of this layer's output buffer.
            if is_last:
                width = self.n_classes
            elif layer_index < layer_count - 1:
                width = hidden_size * head_count
            else:
                width = hidden_size
            layer_output = torch.zeros(graph.number_of_nodes(), width)

            sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
            collator = NodeCollator(
                graph, torch.arange(graph.number_of_nodes()), sampler)
            dataloader = DataLoader(
                collator.dataset,
                collate_fn=collator.collate,
                batch_size=batch_size,
                shuffle=False,
                drop_last=False,
                num_workers=worker_count)

            for input_nodes, output_nodes, blocks in dataloader:
                block = blocks[0].int().to(device)
                src_feats = layer_input[input_nodes].to(device)
                dst_feats = layer_input[output_nodes].to(device)

                out = layer(block, (src_feats, dst_feats))
                if is_last:
                    out = out.mean(1).log_softmax(dim=-1)
                else:
                    out = out.flatten(1)

                # Results are gathered on the CPU buffer.
                layer_output[output_nodes] = out.cpu()

            layer_input = layer_output
        pred = layer_output

    labels = graph.ndata["labels"]
    # The train accuracy is computed but not reported.
    _ = compute_acc(pred[train_nid], labels[train_nid])
    val_acc = compute_acc(pred[val_nid], labels[val_nid])
    test_acc = compute_acc(pred[test_nid], labels[test_nid])

    row_count = pred.size(0)
    meters.update(
        {
            "val_acc": val_acc.item(),
            "test_acc": test_acc.item(),
        },
        n=row_count)

    return meters.summary()