def train(args):
    from os import path

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Detector().to(device)

    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)

    # Load the training data and a single fixed validation batch.
    training_data = load_detection_data(TRAINING_DATA_PATH, batch_size=BATCH_SIZE)
    val_data, val_labels, _ = next(iter(load_detection_data(TEST_DATA_PATH, batch_size=16)))
    val_data, val_labels = val_data.to(device), val_labels.to(device)

    # Optimizer & loss
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-6)
    loss_func = torch.nn.BCEWithLogitsLoss(reduction='none')

    # Logger
    logger = tb.SummaryWriter('/logs/train/1')

    for epoch in range(EPOCHS):
        print(epoch)
        model.train()
        for data, labels, extra in training_data:
            data = data.to(device)
            labels = labels.to(device)
            results = model(data)

            # Focal loss (gamma = 2): down-weight already well-classified pixels.
            bce = loss_func(results, labels)
            pt = torch.exp(-bce)
            focal_loss = ((1 - pt) ** 2 * bce).mean()

            optimizer.zero_grad()
            focal_loss.backward()
            optimizer.step()

        # Log validation predictions for this epoch.
        model.eval()
        with torch.no_grad():
            results = model(val_data)
        log(logger, val_data, val_labels, results, epoch)

        # Intermediate checkpoints.
        if epoch == 100:
            save_model(model, name='det1.th')
        if epoch == 125:
            save_model(model, name='det2.th')

    save_model(model)
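# Standalone restatement of the focal-loss computation from the training loop
# above (gamma = 2, no alpha weighting), as a minimal sketch; the function
# name and defaults are illustrative, not part of the original snippet.
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0):
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    pt = torch.exp(-bce)  # model's probability for the true class
    return ((1 - pt) ** gamma * bce).mean()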
def logger(self, epoch, x_train, train_loss, val_loss, k):
    """Write training and validation metrics to TensorBoard."""
    print('Epoch in TensorBoard:', epoch)

    # One-time setup in the first epoch: create the writers and log the
    # model graph plus static run information.
    if epoch == 0:
        tb_path = './runs/' + self.model_name_save_dir
        print('tb_path', tb_path)
        self.writer['train'] = tb.SummaryWriter(log_dir=tb_path + '/train' + str(k))
        self.writer['val'] = tb.SummaryWriter(log_dir=tb_path + '/val' + str(k))
        # [batch_size x seq_length x embedding_dim]
        sample_data = next(iter(self.trainloader))[0]
        self.writer['train'].add_graph(self.model, sample_data.to(self.device))
        self.writer['train'].add_text('Model:', str(self.model))
        self.writer['train'].add_text('Input shape:', str(x_train.shape))
        self.writer['train'].add_text('Data Preprocessing:', 'None, One-hot')
        self.writer['train'].add_text('Optimiser', str(self.optimizer))
        self.writer['train'].add_text('Batch Size:', str(self.config['DATA']['BATCH_SIZE']))
        self.writer['train'].add_text('Epochs:', str(self.config['TRAINER']['epochs']))

    for measure, value in self.metrics['train'].items():
        self.writer['train'].add_scalar('Train/' + measure, value, epoch)
    self.writer['train'].add_scalar('Loss', train_loss, epoch)

    for measure, value in self.metrics['val'].items():
        self.writer['val'].add_scalar('Val/' + measure, value, epoch)
    self.writer['val'].add_scalar('Loss', val_loss, epoch)
def create_summary_writers(net, device, log_dir):
    # `log_dir` is expected to be a pathlib.Path; `net` and `device` are
    # accepted for interface compatibility but unused here.
    train_path = log_dir / "train"
    val_path = log_dir / "validation"
    train_writer = tensorboard.SummaryWriter(train_path, flush_secs=60)
    val_writer = tensorboard.SummaryWriter(val_path, flush_secs=60)
    return train_writer, val_writer
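# A minimal usage sketch for create_summary_writers above; the "runs/demo"
# directory is an arbitrary example. Since `net` and `device` are unused by
# the helper, None placeholders suffice here.
from pathlib import Path

train_writer, val_writer = create_summary_writers(None, None, Path("runs/demo"))
train_writer.add_scalar("Loss", 0.5, 0)
train_writer.close()
val_writer.close()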
def get_loggers(params):
    if on_colab:
        train_logger = tb.SummaryWriter(
            os.path.join(params.log_dir, 'train/%s' % params.run_name))
        valid_logger = tb.SummaryWriter(
            os.path.join(params.log_dir, 'valid/%s' % params.run_name))
    else:
        train_logger = None
        valid_logger = None
    return train_logger, valid_logger
def train(args):
    from os import path

    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = Detector().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss = torch.nn.BCEWithLogitsLoss()
    num_epoch = 50

    train_data = load_detection_data('dense_data/train')
    valid_data = load_detection_data('dense_data/valid')

    global_step = 0
    best_loss = float('inf')
    for epoch in range(num_epoch):
        print(epoch)
        model.train()
        acc_vals = []
        loss_vals = []
        for img, label, ec in train_data:
            img, label = img.to(device), label.to(device)
            logit = model(img)
            loss_val = loss(logit, label)
            loss_vals.append(loss_val.item())
            acc_vals.append(accuracy(logit, label))
            if train_logger is not None:
                train_logger.add_scalar('loss', loss_val, global_step)
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()
            global_step += 1

        # Checkpoint whenever the average training loss improves.
        avg_loss = sum(loss_vals) / len(loss_vals)
        if avg_loss < best_loss:
            print("saving!")
            best_loss = avg_loss
            save_model(model)
def train(args):
    from os import path
    import torch
    import torch.utils.tensorboard as tb

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = TCN().to(device)

    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                momentum=0.9, weight_decay=1e-5)
    loss = torch.nn.CrossEntropyLoss()

    train_data = SpeechDataset('data/train.txt', transform=one_hot, max_len=args.sequence_length)
    valid_data = SpeechDataset('data/valid.txt', transform=one_hot, max_len=args.sequence_length)

    model.train()
    for iteration in range(args.iteration_num):
        batch = make_batch(args.batch_size, train_data)
        batch_data = batch[:, :, :-1].to(device)   # inputs: all but the last character
        batch_label = batch.argmax(dim=1).to(device)

        o = model(batch_data)
        loss_val = loss(o, batch_label)
        if train_logger is not None:
            train_logger.add_scalar('train/loss', loss_val, global_step=iteration)

        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()
    save_model(model)
def train(args):
    from os import path

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = FCN().to(device)

    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)

    # Load the training data and a single fixed validation example.
    training_data = load_dense_data(TRAINING_DATA_PATH, batch_size=BATCH_SIZE)
    testing_data = load_dense_data(TEST_DATA_PATH, batch_size=BATCH_SIZE)
    val_data, val_labels = next(iter(load_dense_data(TEST_DATA_PATH, batch_size=1)))
    val_data, val_labels = val_data.to(device), val_labels.to(device)

    # Optimizer & loss: inverse-frequency class weights counter class imbalance.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-1)
    dense_weights = (1.0 / torch.FloatTensor(DENSE_CLASS_DISTRIBUTION)).to(device)
    loss_func = torch.nn.CrossEntropyLoss(weight=dense_weights)

    converter = dense_transforms.ToTensor()
    result_tracker = ConfusionMatrix()

    for epoch in range(EPOCHS):
        model.train()
        print(epoch)
        for data, labels in training_data:
            data = data.to(device)
            labels = labels.long().to(device)

            # Loss on the model's dense predictions.
            results = model(data)
            loss = loss_func(results, labels)

            # Update the parameters from the gradients.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Track IoU and global accuracy on the validation example.
        model.eval()
        with torch.no_grad():
            results = model(val_data)
        result_tracker.add(results.argmax(1), val_labels)
        print(result_tracker.iou, result_tracker.global_accuracy)

    save_model(model)
def train(args):
    from os import path

    model = Detector()
    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)
    """
    Your code here, modify your HW3 code
    """
    raise NotImplementedError('train')
    save_model(model)
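# A minimal sketch of one way the `train` stub above could be filled in,
# modeled on the filled-in detector snippets earlier in this file; the
# load_detection_data path, learning rate, and epoch count are assumptions,
# not part of the stub.
def train_detector_sketch(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Detector().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    train_data = load_detection_data('dense_data/train')
    for epoch in range(50):
        model.train()
        for img, label, _ in train_data:
            img, label = img.to(device), label.to(device)
            loss = loss_fn(model(img), label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    save_model(model)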
def __init__(self, model, resume, config, iters_per_epoch, val_logger=None, train_logger=None):
    self.model = model
    self.config = config
    self.val_logger = val_logger
    self.train_logger = train_logger
    self.logger = logging.getLogger(self.__class__.__name__)
    self.do_validation = self.config['trainer']['val']
    self.start_epoch = 1
    self.improved = False

    # SETTING THE DEVICE
    self.device, available_gpus = self._get_available_devices(self.config['n_gpu'])
    self.model = torch.nn.DataParallel(self.model, device_ids=available_gpus)
    self.model.to(self.device)

    # CONFIGS
    cfg_trainer = self.config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']

    # OPTIMIZER: the backbone gets a 10x smaller learning rate than the rest.
    trainable_params = [
        {'params': filter(lambda p: p.requires_grad, self.model.module.get_other_params())},
        {'params': filter(lambda p: p.requires_grad, self.model.module.get_backbone_params()),
         'lr': config['optimizer']['args']['lr'] / 10},
    ]
    self.optimizer = get_instance(torch.optim, 'optimizer', config, trainable_params)
    model_params = sum(p.numel() for p in model.parameters())
    opt_params = sum(p.numel() for group in self.optimizer.param_groups for p in group['params'])
    assert opt_params == model_params, 'some params are missing in the opt'

    self.lr_scheduler = getattr(utils.lr_scheduler, config['lr_scheduler'])(
        optimizer=self.optimizer, num_epochs=self.epochs, iters_per_epoch=iters_per_epoch)

    # MONITORING
    self.monitor = cfg_trainer.get('monitor', 'off')
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = -math.inf if self.mnt_mode == 'max' else math.inf
        self.early_stopping = cfg_trainer.get('early_stop', math.inf)

    # CHECKPOINTS & TENSORBOARD
    date_time = datetime.datetime.now().strftime('%m-%d_%H-%M')
    run_name = config['experim_name']
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], run_name)
    helpers.dir_exists(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(self.config, handle, indent=4, sort_keys=True)

    writer_dir = os.path.join(cfg_trainer['log_dir'], run_name)
    self.writer = tensorboard.SummaryWriter(writer_dir)
    self.html_results = HTML(web_dir=config['trainer']['save_dir'],
                             exp_name=config['experim_name'],
                             save_name=config['experim_name'],
                             config=config, resume=resume)

    if resume:
        self._resume_checkpoint(resume)
def train(epochs):
    print("Train start")
    writer = tensorboard.SummaryWriter(log_dir='./log', comment='Train loop')
    for ep in range(1, epochs + 1):
        epoch_loss, epoch_accuracy, epoch_precision, epoch_f1 = 0, 0, 0, 0
        num_batches = 0
        for inp, label in train_loader:
            optimizer.zero_grad()
            op = model(inp)
            loss = criterion(op, label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_accuracy += accuracy(op, label)
            epoch_precision += precision(op, label)
            epoch_f1 += f1(op, label)
            num_batches += 1
        # Average the accumulated metrics over the number of batches.
        writer.add_scalars('Training', {
            'Accuracy': epoch_accuracy / num_batches,
            'Precision': epoch_precision / num_batches,
            'F1': epoch_f1 / num_batches,
        }, ep)
        writer.add_scalars('Loss', {'Training': epoch_loss / num_batches}, ep)
    writer.close()
    torch.save(model.state_dict(), PATH)
    print("Done training")
def __init__(
    self,
    job_dir,
    num_examples,
    learning_rate,
    batch_size,
    epochs,
    num_workers,
    seed,
):
    super(PyTorchModel, self).__init__(job_dir=job_dir, seed=seed)
    self.num_examples = num_examples
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.epochs = epochs
    self.summary_writer = tensorboard.SummaryWriter(log_dir=self.job_dir)
    self.logger = utils.setup_logger(name=__name__ + "." + self.__class__.__name__,
                                     distributed_rank=0)
    self.trainer = engine.Engine(self.train_step)
    self.evaluator = engine.Engine(self.tune_step)
    self._network = None
    self._optimizer = None
    self._metrics = None
    self.num_workers = num_workers
    self.device = distributed.device()
    self.best_state = None
    self.counter = 0
def main():
    # Random seed initialization
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Define your dataloader here
    loader_train = None
    loader_eval = None

    writer = tensorboard.SummaryWriter(
        log_dir=path.join('..', 'experiment', cfg.save)
    )

    if torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.manual_seed_all(seed)
    else:
        device = torch.device('cpu')

    # Make a CNN
    net = simple.Simple()
    net = net.to(device)

    # Will be supported later...
    '''
    writer.add_graph(
        net,
        input_to_model=torch.randn(1, 3, 64, 64).to(device),
    )
    '''

    # Set up an optimizer
    params = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=1e-4)

    # Set up a learning rate scheduler
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(0.5 * cfg.epochs), int(0.75 * cfg.epochs)],
        gamma=0.5,
    )

    def do_train(epoch: int):
        net.train()
        for batch, (x, t) in enumerate(loader_train):
            x = x.to(device)
            t = t.to(device)
            # Define your training loop here

    def do_eval(epoch: int):
        net.eval()
        for x, t in loader_eval:
            x = x.to(device)
            t = t.to(device)
            # Define your evaluation loop here

    # Outer loop
    for i in tqdm.trange(cfg.epochs):
        do_train(i + 1)
        do_eval(i + 1)
def perform_experiment(store_dir=None, test_model=False, test_dir=None):
    model = FullyConnectedNN([84, 42, 21], 4).to(device)
    optimizer = optim.Adagrad(model.parameters(), lr=0.0005)  # weight_decay=0.1
    scheduler = {
        "scheduler": lambda o: optim.lr_scheduler.MultiStepLR(o, [20, 30], gamma=0.1),
        "epoch": 35,
    }
    tensorboard = tb.SummaryWriter(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tb_logs", "nn"))
    runner = NeuralNetworkRunner(model, optimizer=optimizer, tensorboard=tensorboard,
                                 loss_fn=nn.CrossEntropyLoss(weight=weight_vector))
    runner.train(lr_setup=scheduler)
    runner.get_metrics().plot_confusion_matrix(
        tensorboard=tensorboard,
        labels=["normal", "bacteria", "virus", "covid"],
        tag="nn")

    if store_dir is not None:
        store_path = os.path.join(store_dir, "nn.torch")
        model.save(store_path)

    if test_model:
        output_path = os.path.join(test_dir, "nn.txt")
        runner_output_test(runner, output_path)
def main():
    writer = tensorboard.SummaryWriter(log_dir='./logs')
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    vggloss = VGGLosses(device=device).to(device)
    dataset = Dataset(root='dataset/Shinkai', style_transform=transform, smooth_transform=transform)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

    G = Generator().to(device)
    D = PatchDiscriminator().to(device)
    G.apply(weights_init)
    D.apply(weights_init)

    optimizer_G = optim.Adam(G.parameters(), lr=0.0001)  # based on the paper
    optimizer_D = optim.Adam(D.parameters(), lr=0.0004)  # based on the paper

    init_train(20, lr=0.1, con_weight=1.0)
    train(epoch=10, con_weight=1.2, gra_weight=2., col_weight=10.)


if __name__ == '__main__':
    main()
def __init__(self, max_epoch, batch_size, step, model, metric='val_kappa', current_epoch=1,
             optimizer=None, warmup_scheduler=None, lr_scheduler=None, weighted_sampler=None):
    self.callbacks = []
    self.max_epoch = max_epoch
    self.epoch = current_epoch
    self.batch_size = batch_size
    self.step = step
    self.losses = {}
    self.args = args  # module-level args
    self.model_path = args.model_path
    self.model = model
    self.ckpt_path = args.checkpoint
    self.writer = tsb.SummaryWriter(self.ckpt_path)
    self.warmup_scheduler = warmup_scheduler
    self.lr_scheduler = lr_scheduler
    self.optimizer = optimizer
    self.metric = metric
    self.history = 0  # record of the best accuracy
    self.weighted_sampler = weighted_sampler

    # Pick the comparison and initial best value from the metric:
    # losses are minimized, everything else is maximized.
    if 'loss' in metric:
        self.opt = np.less
        self.best = np.inf
    else:
        self.opt = np.greater
        self.best = -np.inf
def __init__(
    self,
    config,  # type: ignore
    output_folder: str,
    model: nn.Module,
    loss_fn: nn.Module,
    train_loader: DataLoader,  # type: ignore
    val_loader: DataLoader,  # type: ignore
    test_loader: DataLoader,  # type: ignore
    early_stopping_patience: int = 5,
) -> None:
    super().__init__()
    self.optim = Adam(model.parameters(), config["lr"])  # type: ignore
    self.train_loader: DataLoader = train_loader  # type: ignore
    self.val_loader: DataLoader = val_loader  # type: ignore
    self.model = model
    self.loss_fn = loss_fn
    self.current_epoch: int = 0
    self.epochs: int = config["epochs"]
    self.output_folder = output_folder
    self.writer = tensorboard.SummaryWriter(output_folder)
    self.best_val_loss = float('inf')
    self.early_stopping_patience = early_stopping_patience
    self.patience = early_stopping_patience
    self.is_training: bool = True
def train(args):
    from os import path
    import inspect
    import torch

    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = Planner().to(device)
    if args.continue_training:
        model.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)), 'cnn.th')))

    # optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=1e-5)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    loss = torch.nn.BCEWithLogitsLoss()

    # Build the transform pipeline from its string form, exposing only the
    # classes defined in dense_transforms.
    transform = eval(args.transform, {
        k: v for k, v in inspect.getmembers(dense_transforms) if inspect.isclass(v)
    })
    train_data = load_data('2x2x1250z', transform=transform, num_workers=4)

    global_step = 0
    for epoch in range(args.num_epoch):
        acc = []
        losses = []
        for img, label in train_data:
            img, label = img.to(device), label.to(device)
            logit = model(img)
            loss_val = loss(logit, label)
            accuracy = ((logit > 0).long() == label).detach().cpu().numpy()
            acc.extend(accuracy)
            if train_logger is not None:
                train_logger.add_scalar('loss', loss_val, global_step)
            optimizer.zero_grad()
            loss_val.backward()
            losses.append(loss_val.detach().cpu().numpy())
            optimizer.step()
            global_step += 1
        if train_logger is not None:
            train_logger.add_scalar('accuracy', np.mean(acc), global_step)
        print('epoch %-3d \t loss = %0.3f \t acc = %0.3f' %
              (epoch, np.mean(losses), np.mean(acc)))
        save_model(model)
    save_model(model)
def __init__(self, save_dir: Optional[str] = None) -> None:
    if is_master():
        from torch.utils import tensorboard

        self._save_dir = Path(save_dir or ".")
        self._save_dir.mkdir(exist_ok=True, parents=True)
        self.writer = tensorboard.SummaryWriter(save_dir)
        self.writer.add_text("exec", ' '.join(get_args()))
def restore_checkpoint(self, epoch=None):
    """Restores the Trainer's state using self.log_dir.

    Args:
        epoch: Epoch from which to restore the Trainer's state. If None,
            uses the latest available epoch.
    """
    epoch = epoch or self._find_latest_epoch()
    checkpoint = torch.load(self._path(f"trainer_state_{epoch}.ckpt"))
    self.model.load_state_dict(checkpoint["model"])
    self.optimizer.load_state_dict(checkpoint["optimizer"])
    self._step = checkpoint["step"]
    self._epoch = checkpoint["epoch"]
    self._examples_processed = checkpoint["examples_processed"]
    self._time_taken = checkpoint["time_taken"]
    if self.lr_scheduler is not None:
        self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
    # NOTE(eugenhotaj): We need to replace the SummaryWriter and ensure any
    # logs written after the last saved checkpoint are purged.
    self._summary_writer.close()
    self._summary_writer = tensorboard.SummaryWriter(
        self.log_dir, max_queue=100, purge_step=self._step)
def train(args):
    from os import path
    import torch.utils.tensorboard as tb

    model = TCN()
    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)
    """
    Your code here, modify your code from prior assignments
    Hint: SGD might need a fairly high learning rate to work well here
    """
    raise NotImplementedError('train')
    save_model(model)
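# A minimal sketch of one way the TCN `train` stub above could be filled in,
# following its hint about a fairly high SGD learning rate. It reuses the
# SpeechDataset / one_hot / make_batch helpers from the filled-in TCN snippet
# earlier in this file; the lr, iteration count, and batch size are guesses.
def train_tcn_sketch(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TCN().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    loss_fn = torch.nn.CrossEntropyLoss()
    train_data = SpeechDataset('data/train.txt', transform=one_hot)
    model.train()
    for iteration in range(10000):
        batch = make_batch(32, train_data).to(device)
        logits = model(batch[:, :, :-1])  # inputs: all but the last character
        loss = loss_fn(logits, batch.argmax(dim=1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    save_model(model)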
def _report_results(self, tensorboard_log_dir, checkpoint_path, results):
    step = self._get_step_from_checkpoint(checkpoint_path)
    with tensorboard.SummaryWriter(tensorboard_log_dir) as writer:
        for sampler, sampler_results in results.items():
            for metric, value in sampler_results.items():
                writer.add_scalar(f'{sampler}__{metric}', value, global_step=step)
def _get_writer(self):
    """Get writer and initialize if possible."""
    if (self._writer is None and self._logdir is not None
            and self._global_tag is not None and self.name is not None):
        self._writer = tb.SummaryWriter(os.path.join(self._logdir, self.name))
    return self._writer
def train(args):
    from os import path

    model = FCN()
    train_logger, valid_logger = None, None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'), flush_secs=1)
        valid_logger = tb.SummaryWriter(path.join(args.log_dir, 'valid'), flush_secs=1)
    """
    Your code here, modify your HW1 / HW2 code
    Hint: Use ConfusionMatrix, ConfusionMatrix.add(logit.argmax(1), label),
          ConfusionMatrix.iou to compute the overall IoU, where label are the
          batch labels, and logit are the logits of your classifier.
    Hint: If you found good data augmentation parameters for the CNN, use them
          here too. Use dense_transforms
    Hint: Use the log function below to debug and visualize your model
    """
    save_model(model)
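# A minimal sketch of the ConfusionMatrix-based IoU evaluation the FCN stub
# above hints at; the load_dense_data path is an assumption borrowed from the
# filled-in FCN snippet earlier in this file.
def evaluate_fcn_sketch(model, device):
    matrix = ConfusionMatrix()
    model.eval()
    with torch.no_grad():
        for img, label in load_dense_data('dense_data/valid'):
            img, label = img.to(device), label.to(device).long()
            logit = model(img)
            matrix.add(logit.argmax(1), label)  # per-pixel predictions vs. labels
    return matrix.iou, matrix.global_accuracy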
def __init__(self, model, data, optimizer_cls, loss_fn_cls, log_name: str):
    self.model = model
    self.data = data
    self.optimizer = optimizer_cls(model.parameters())
    self.loss_fn = loss_fn_cls(ignore_index=self.data.tag_pad_idx)
    self.writer = tensorboard.SummaryWriter(log_name)
    self.train_global = 0
    self.test_global = 0
def __init__(self, args, cfg):
    super(Tester, self).__init__(args, cfg)
    args = self.args

    # Testing only supports batch size 1 and sequence size 1.
    if self.batch_size != 1:
        self.logger.info("batch size in the testing mode should be set to one.")
        self.logger.info("setting batch size (batch-size = 1).")
        self.batch_size = 1
    if self.seq_size != 1:
        self.logger.info("setting sequence size (s=1)")
        raise ValueError("Sequence size must be equal to 1 in test mode.")

    # Create the folder for saving training checkpoints.
    self.checkpoint_dir = self.out_dir
    Path(self.checkpoint_dir).mkdir(parents=True, exist_ok=True)

    # Prepare dataset and dataloaders.
    transform = None
    self.model = nets.get_model(
        input_shape=(self.n_channels, self.im_height_model, self.im_width_model),
        cfg=self.cfg, device=self.device)
    self.criterion = get_loss_function(self.cfg, args.device)
    self.has_lidar = self.model.lidar_feat_net is not None
    self.has_imu = self.model.imu_feat_net is not None
    self.test_dataset = ds.Kitti(config=self.cfg, transform=transform, ds_type='test',
                                 has_imu=self.has_imu, has_lidar=self.has_lidar)
    self.test_dataloader = torch.utils.data.DataLoader(
        self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers,
        shuffle=False, worker_init_fn=worker_init_fn, collate_fn=ds.deeplio_collate)
    self.data_permuter = DataCombiCreater(combinations=self.combinations, device=self.device)

    self.tensor_writer = tensorboard.SummaryWriter(log_dir=self.runs_dir)

    # Debugging and visualization.
    self.logger.print("System Training Configurations:")
    self.logger.print("args: {}".format(self.args))
    self.logger.print(yaml.dump(self.cfg))
    self.logger.print(self.test_dataset)
def train(env, agent, n_episodes: int = 1000, score_threshold: float = 32) -> list:
    """
    Params
    ======
        n_episodes (int): maximum number of training episodes
    """
    scores = []
    scores_window: Deque[float] = deque(maxlen=100)
    best_score = float("-inf")
    writer = tensorboard.SummaryWriter(f"runs/{int(time())}")

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        start = time()
        while True:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += np.mean(reward)
            if np.any(done):
                break

        time_for_episode = time() - start
        writer.add_scalar("train/time", time_for_episode, i_episode)
        scores_window.append(score)
        scores.append(score)
        window_score = np.mean(scores_window)
        writer.add_scalar("train/reward", score, i_episode)
        writer.add_scalar("train/window", window_score, i_episode)
        writer.add_scalar("train/memory_size", len(agent.memory), i_episode)
        print(f'\rEpisode {i_episode}\tAverage Score: {window_score:.2f}\t'
              f'Time: {time_for_episode:.2f}', end="")
        if i_episode % 100 == 0:
            print(f'\rEpisode {i_episode}\tAverage Score: {window_score:.2f}')
        if window_score >= score_threshold and best_score < score_threshold:
            print(f'\nEnvironment solved in {i_episode:d} episodes!\t'
                  f'Average Score: {window_score:.2f}')
        if window_score > best_score and window_score >= score_threshold:
            best_score = window_score
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pt')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pt')

    print(f"Best average score: {best_score}")
    writer.close()
    return scores
def initialize(parameters, action_encoder, distributed_config=None):
    task = parameters.get('task', 'train')
    if task == 'train' and _train_utils.is_leader():
        savedir = _train_utils.get_save_dir(parameters)
        # Save parameters
        with open(os.path.join(savedir, 'params.json'), 'w') as fp:
            json.dump(parameters, fp, sort_keys=True, indent=4)

    Chem.disable_log('rdApp.*')

    batch_size = parameters.get('batch_size', 128)
    if distributed_config:
        batch_size = batch_size // distributed_config.world_size

    config = joint_network.JointClassificationNetworkConfiguration(
        action_encoder.get_num_atom_insert_locations(),
        action_encoder.num_insert_bond_locations,
        hidden_size=384,
        depth=parameters.get('message_depth', 5))
    model = joint_network.JointClassificationNetwork(batch_size, config)

    model_path = parameters.get('model_path')
    if model_path:
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

    if distributed_config:
        print("Creating model on GPU {0}".format(distributed_config.local_rank))
        gpu_id = distributed_config.local_rank
        model = modules.SingleDeviceDistributedParallel(model.cuda(gpu_id), gpu_id)
    else:
        model = model.cuda()

    if task == 'train':
        model.train()
    else:
        model.eval()

    def save_model(ep, it):
        if task == 'train' and _train_utils.is_leader():
            model_filename = os.path.join(
                savedir, "joint_model_ep_{0}_it_{1:04d}.pth".format(ep, it))
            torch.save(model.state_dict(), model_filename)

    if _train_utils.is_leader() and task == 'train':
        from torch.utils import tensorboard
        summary_dir = os.path.join(savedir, 'summary')
        writer = tensorboard.SummaryWriter(log_dir=summary_dir)
    else:
        writer = None

    return model, save_model, writer
def __init__(self):
    self.writer = tensorboard.SummaryWriter("checkpoint/tensorboard")
    self.rect_color = (0, 255, 255)
    self.landmarks_color = (0, 255, 0)
    self.rect_width = 3
    self.landmarks_radius = 1
    self.winname = "image"
    self.crop_resize_shape = (400, 400)
    self.user_press = None
def init_saver(self):
    trainercore.init_saver(self)
    if self.args.training and self.args.test_file is not None:
        self._aux_saver = tensorboard.SummaryWriter(self.args.log_directory + "/test/")
    else:
        self._aux_saver = None
def __init__(self, config: Config, rank_print=0):
    """Only active on the process whose local rank matches `rank_print`."""
    super().__init__()
    self.config = config
    self.isWork = (rank_print == self.config.local_rank)
    # ANCHOR tensorboard init
    if self.isWork:
        self.tx_writer = tensorboard.SummaryWriter(config.log_file)