import numpy as np
import tensorflow as tf
from collections import defaultdict
from tensorboardX import SummaryWriter


class TrainingLogger(object):
    def __init__(self, log_dir):
        self._writer = SummaryWriter(log_dir)
        self._metrics = defaultdict(tf.metrics.Mean)

    def __getitem__(self, item):
        return self._metrics[item]

    def __setitem__(self, key, value):
        self._metrics[key] = value

    def log_evaluation_summary(self, summary, step):
        for k, v in summary.items():
            self._writer.add_scalar(k, float(v), step)
        self._writer.flush()

    def log_metrics(self, step):
        print("Training step {} summary:".format(step))
        for k, v in self._metrics.items():
            print("{:<40} {:<.2f}".format(k, float(v.result())))
            self._writer.add_scalar(k, float(v.result()), step)
            v.reset_states()
        self._writer.flush()

    def log_video(self, images, step):
        # add_video expects (N, T, C, H, W); images arrive as (T, H, W, C)
        video = np.expand_dims(np.transpose(images, [0, 3, 1, 2]), axis=0)
        self._writer.add_video('Evaluation policy', video, step)
        self._writer.flush()
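# A minimal usage sketch for the TrainingLogger above (values and the
# './logs' path are illustrative; assumes TensorFlow 2.x, where calling a
# tf.metrics.Mean instance with a value accumulates it into a running mean):
logger = TrainingLogger('./logs')
for step in range(100):
    logger['train/policy_loss'](0.5)  # accumulate into the running mean
    if step % 10 == 0:
        logger.log_metrics(step)  # print + write the means, then reset them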
import os

import matplotlib.pyplot as plt
import numpy as np
import wandb
from tensorboardX import SummaryWriter


def test_tensorboardX(run_manager):
    wandb.tensorboard.patch(tensorboardX=True)
    fig = plt.figure()
    c1 = plt.Circle((0.2, 0.5), 0.2, color='r')
    ax = plt.gca()
    ax.add_patch(c1)
    plt.axis('scaled')
    writer = SummaryWriter()
    writer.add_figure('matplotlib', fig, 0)
    writer.add_video('video', np.random.random(size=(1, 5, 3, 28, 28)), 0)
    writer.add_scalars('data/scalar_group', {'foo': 10, 'bar': 100}, 1)
    writer.close()
    run_manager.test_shutdown()
    rows = run_manager.run.history.rows
    events = []
    for root, dirs, files in os.walk(run_manager.run.dir):
        print("ROOT", root, files)
        for file in files:
            if "tfevent" in file:
                events.append(file)
    assert rows[0]["matplotlib"]['width'] == 640
    assert rows[0]["matplotlib"]['height'] == 480
    assert rows[0]["matplotlib"]['_type'] == 'images'
    assert rows[0]["video"]['_type'] == 'videos'
    assert rows[1]["data/scalar_group/foo"] == 10
    assert rows[1]["data/scalar_group/bar"] == 100
    assert len(events) == 3
class TrainingLogger:
    """
    Copy-pasted from 'Berkeley CS285'
    (https://github.com/yardenas/berkeley-deep-rl/tree/f741338c085ee5b329f3c9dd05e93e89bc43574a)
    and used for dumping statistics to a TensorBoard-readable file.
    """
    def __init__(self, log_dir, fps):
        self._log_dir = log_dir
        self.fps = fps
        logger.info('Logging training data to: ' + log_dir)
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step)

    def log_scalars(self, scalar_dict, group_name, step):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}'.format(group_name), scalar_dict,
                                      step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step):
        assert len(video_frames.shape) == 5, \
            "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step,
                                    fps=self.fps)

    def log_figures(self, figure, name, step):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, \
            "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}'.format(name), figure, step)

    def log_figure(self, figure, name, step):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}'.format(name), figure, step)

    def log_graph(self, graph, step, phase):
        """graph: model graph to add to the event file"""
        self._summ_writer.add_graph(graph)

    def log_histogram(self, data, name, step):
        self._summ_writer.add_histogram(name, data, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") \
            if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
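# Hedged usage sketch for log_video above: it expects an [N, T, C, H, W]
# array, e.g. a single 30-frame RGB rollout at 64x64 (random data, purely
# illustrative; a module-level `logger` is assumed to be configured):
import numpy as np

tl = TrainingLogger('./logs', fps=10)
rollout = np.random.randint(0, 255, size=(1, 30, 3, 64, 64), dtype=np.uint8)
tl.log_video(rollout, 'eval/rollout', step=0)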
import torch
from clearml import Task
from tensorboardX import SummaryWriter


def main():
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name="examples",
                     task_name="pytorch with video tensorboardX")
    writer = SummaryWriter("runs")
    writer.add_text("TEXT", "This is some text", 0)

    # Make a video that simply fades between grey levels
    video = (torch.sin(torch.arange(0, 1000) / 100) + 1) / 2 * 255
    video = video.byte().view(1, -1, 1, 1, 1).expand(1, -1, 3, 64, 64)
    writer.add_video("my_video", video, 0, fps=50)
class TBXLoggerCallback(TrainingSingleWorkerLoggingCallback):
    """Logs Train results in TensorboardX format.

    Args:
        logdir (Optional[str]): Path to directory where the results file
            should be. If None, will be set by the Trainer.
        worker_to_log (int): Worker index to log. By default, will log the
            worker with index 0.
    """

    VALID_SUMMARY_TYPES: Tuple[type] = (int, float, np.float32, np.float64,
                                        np.int32, np.int64)
    IGNORE_KEYS: Set[str] = {PID, TIMESTAMP, TIME_TOTAL_S, TRAINING_ITERATION}

    def start_training(self, logdir: str, **info):
        super().start_training(logdir)
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            if log_once("tbx-install"):
                warnings.warn(
                    "pip install 'tensorboardX' to see TensorBoard files.")
            raise
        self._file_writer = SummaryWriter(self.logdir, flush_secs=30)

    def handle_result(self, results: List[Dict], **info):
        result = results[self._workers_to_log]
        step = result[TRAINING_ITERATION]
        result = {k: v for k, v in result.items() if k not in self.IGNORE_KEYS}
        flat_result = flatten_dict(result, delimiter="/")
        path = ["ray", "train"]

        # same logic as in ray.tune.logger.TBXLogger
        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if (isinstance(value, self.VALID_SUMMARY_TYPES)
                    and not np.isnan(value)):
                self._file_writer.add_scalar(full_attr, value,
                                             global_step=step)
            elif ((isinstance(value, list) and len(value) > 0)
                  or (isinstance(value, np.ndarray) and value.size > 0)):
                # Must be video
                if isinstance(value, np.ndarray) and value.ndim == 5:
                    self._file_writer.add_video(full_attr, value,
                                                global_step=step, fps=20)
                    continue

                try:
                    self._file_writer.add_histogram(full_attr, value,
                                                    global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        warnings.warn(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value,
                                             type(self).__name__))
        self._file_writer.flush()

    def finish_training(self, error: bool = False, **info):
        self._file_writer.close()
import wandb
import numpy as np
from tensorboardX import SummaryWriter

wandb.init(tensorboard=True)
writer = SummaryWriter()
writer.add_video("video", np.random.random(size=(1, 5, 3, 28, 28)))
wandb.log({"acc": 1})
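# The (1, 5, 3, 28, 28) array above follows tensorboardX's (N, T, C, H, W)
# video layout; an equivalent call with an explicit step and frame rate
# (illustrative values only) would be:
writer.add_video("video_10fps", np.random.random(size=(1, 5, 3, 28, 28)),
                 global_step=0, fps=10)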
class BaseLogger(object):
    def __init__(self, args, dataset_len, pixel_dict):
        def round_down(x, m):
            """Round x down to a multiple of m."""
            return int(m * round(float(x) / m))

        self.args = args
        self.batch_size = args.batch_size
        self.dataset_len = dataset_len
        self.device = args.device
        self.img_format = args.img_format
        self.save_dir = args.save_dir if args.is_training else args.results_dir
        self.do_classify = args.do_classify
        self.num_visuals = args.num_visuals
        self.log_path = os.path.join(self.save_dir,
                                     '{}.log'.format(args.name))
        log_dir = os.path.join(
            'logs', args.name + '_' + datetime.now().strftime('%b%d_%H%M'))
        self.summary_writer = SummaryWriter(log_dir=log_dir)

        self.epoch = args.start_epoch
        # Current iteration in epoch (i.e., # examples seen in the current epoch)
        self.iter = 0
        # Current iteration overall (i.e., total # of examples seen)
        self.global_step = round_down((self.epoch - 1) * dataset_len,
                                      args.batch_size)
        self.iter_start_time = None
        self.epoch_start_time = None
        self.pixel_dict = pixel_dict

    def _log_scalars(self, scalar_dict, print_to_stdout=True):
        """Log all values in a dict as scalars to TensorBoard."""
        for k, v in scalar_dict.items():
            if print_to_stdout:
                self.write('[{}: {:.3g}]'.format(k, v))
            k = k.replace('_', '/')  # Group in TensorBoard by phase
            self.summary_writer.add_scalar(k, v, self.global_step)

    def _plot_curves(self, curves_dict):
        """Plot all curves in a dict as RGB images to TensorBoard."""
        for name, curve in curves_dict.items():
            fig = plt.figure()
            ax = plt.gca()

            plot_type = name.split('_')[-1]
            ax.set_title(plot_type)
            if plot_type == 'PRC':
                precision, recall, _ = curve
                ax.step(recall, precision, color='b', alpha=0.2, where='post')
                ax.fill_between(recall, precision, step='post', alpha=0.2,
                                color='b')
                ax.set_xlabel('Recall')
                ax.set_ylabel('Precision')
            elif plot_type == 'ROC':
                false_positive_rate, true_positive_rate, _ = curve
                ax.plot(false_positive_rate, true_positive_rate, color='b')
                ax.plot([0, 1], [0, 1], 'r--')
                ax.set_xlabel('False Positive Rate')
                ax.set_ylabel('True Positive Rate')
            else:
                ax.plot(curve[0], curve[1], color='b')

            ax.set_ylim([0.0, 1.05])
            ax.set_xlim([0.0, 1.0])

            fig.canvas.draw()
            # The RGB buffer is interleaved (H, W, 3); convert it to the
            # (3, H, W) layout that add_image expects by default.
            curve_img = np.frombuffer(fig.canvas.tostring_rgb(),
                                      dtype=np.uint8)
            w, h = fig.canvas.get_width_height()
            curve_img = curve_img.reshape((h, w, 3)).transpose(2, 0, 1)
            self.summary_writer.add_image(name.replace('_', '/'), curve_img,
                                          global_step=self.global_step)

    def visualize(self, inputs, cls_logits, targets_dict, phase,
                  unique_id=None):
        """Visualize predictions and targets in TensorBoard.

        Args:
            inputs: Inputs to the model.
            cls_logits: Classification logits predicted by the model.
            targets_dict: Dictionary of information about the target labels.
            phase: One of 'train', 'val', or 'test'.
            unique_id: A unique ID to append to every image title. Allows
                for displaying all visualizations separately on TensorBoard.

        Returns:
            Number of examples visualized to TensorBoard.
        """
        if self.pixel_dict is None:
            # A pixel_dict of None bypasses visualization
            return 0

        cls_logits = cls_logits.detach().to('cpu')
        cls_probs = torch.sigmoid(cls_logits).numpy()
        is_3d = inputs.dim() > 4
        num_visualized = 0
        for i in range(self.num_visuals):
            if i >= inputs.shape[0]:
                break

            input_np = util.un_normalize(inputs[i], self.img_format,
                                         self.pixel_dict)
            input_np = input_np.astype(np.float32) / 255.

            mask_np = None
            output_np = None
            label = 'abnormal' if targets_dict['is_abnormal'][i] else 'normal'

            visuals_np = input_np
            title = 'input'
            tag = '{}/{}/{}_{}_{:.4f}'.format(phase, title, label,
                                              targets_dict['dset_path'][i],
                                              cls_probs[i][0])
            if unique_id is not None:
                tag += '_{}'.format(unique_id)

            # Reshaping to B, C, T, H, W
            visuals_np = np.expand_dims(visuals_np, 0)
            if is_3d:
                self.summary_writer.add_video(tag, visuals_np,
                                              self.global_step)
            else:
                self.summary_writer.add_image(tag, visuals_np,
                                              self.global_step)
            num_visualized += 1

        return num_visualized

    def write(self, message, print_to_stdout=True):
        """Write a message to the log. If print_to_stdout is True,
        also print to stdout."""
        with open(self.log_path, 'a') as log_file:
            log_file.write(message + '\n')
        if print_to_stdout:
            print(message)

    def start_iter(self):
        """Log info for start of an iteration."""
        raise NotImplementedError

    def end_iter(self):
        """Log info for end of an iteration."""
        raise NotImplementedError

    def start_epoch(self):
        """Log info for start of an epoch."""
        raise NotImplementedError

    def end_epoch(self, metrics, curves):
        """Log info for end of an epoch.
        Save model parameters and update learning rate."""
        raise NotImplementedError
class Logger(object):
    def __init__(self, log_dir, use_tb=True, config='rl'):
        self._log_dir = log_dir
        if use_tb:
            tb_dir = os.path.join(log_dir, 'tb')
            if os.path.exists(tb_dir):
                shutil.rmtree(tb_dir)
            self._sw = SummaryWriter(tb_dir)
        else:
            self._sw = None
        self._train_mg = MetersGroup(
            os.path.join(log_dir, 'train.log'),
            formating=FORMAT_CONFIG[config]['train'])
        self._eval_mg = MetersGroup(
            os.path.join(log_dir, 'eval.log'),
            formating=FORMAT_CONFIG[config]['eval'])

    def _try_sw_log(self, key, value, step):
        if self._sw is not None:
            self._sw.add_scalar(key, value, step)

    def _try_sw_log_image(self, key, image, step):
        if self._sw is not None:
            assert image.dim() == 3
            grid = torchvision.utils.make_grid(image.unsqueeze(1))
            self._sw.add_image(key, grid, step)

    def _try_sw_log_video(self, key, frames, step):
        if self._sw is not None:
            frames = torch.from_numpy(np.array(frames))
            frames = frames.unsqueeze(0)
            self._sw.add_video(key, frames, step, fps=30)

    def _try_sw_log_histogram(self, key, histogram, step):
        if self._sw is not None:
            self._sw.add_histogram(key, histogram, step)

    def log(self, key, value, step, n=1):
        assert key.startswith('train') or key.startswith('eval')
        if type(value) == torch.Tensor:
            value = value.item()
        self._try_sw_log(key, value / n, step)
        mg = self._train_mg if key.startswith('train') else self._eval_mg
        mg.log(key, value, n)

    def log_param(self, key, param, step):
        self.log_histogram(key + '_w', param.weight.data, step)
        if hasattr(param.weight, 'grad') and param.weight.grad is not None:
            self.log_histogram(key + '_w_g', param.weight.grad.data, step)
        if hasattr(param, 'bias') and param.bias is not None:
            self.log_histogram(key + '_b', param.bias.data, step)
            if hasattr(param.bias, 'grad') and param.bias.grad is not None:
                self.log_histogram(key + '_b_g', param.bias.grad.data, step)

    def log_image(self, key, image, step):
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_image(key, image, step)

    def log_video(self, key, frames, step):
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_video(key, frames, step)

    def log_histogram(self, key, histogram, step):
        assert key.startswith('train') or key.startswith('eval')
        self._try_sw_log_histogram(key, histogram, step)

    def dump(self, step):
        self._train_mg.dump(step, 'train')
        self._eval_mg.dump(step, 'eval')
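# Illustrative usage of the Logger above, assuming MetersGroup and
# FORMAT_CONFIG from the same module are available (the 'train'/'eval' key
# prefix is enforced by the asserts):
log = Logger('./runs', use_tb=True, config='rl')
log.log('train/actor_loss', 0.23, step=100)
log.log('eval/episode_reward', 812.0, step=100)
log.dump(step=100)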
class TBXLogger(Logger):
    """TensorBoardX Logger.

    Note that hparams will be written only after a trial has terminated.
    This logger automatically flattens nested dicts to show on TensorBoard:
        {"a": {"b": 1, "c": 2}} -> {"a/b": 1, "a/c": 2}
    """

    VALID_HPARAMS = (str, bool, np.bool8, int, np.integer, float, list,
                     type(None))

    def _init(self):
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            if log_once("tbx-install"):
                logger.info(
                    "pip install 'ray[tune]' to see TensorBoard files.")
            raise
        self._file_writer = SummaryWriter(self.logdir, flush_secs=30)
        self.last_result = None

    def on_result(self, result: Dict):
        step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

        tmp = result.copy()
        for k in [
                "config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION
        ]:
            if k in tmp:
                del tmp[k]  # not useful to log these

        flat_result = flatten_dict(tmp, delimiter="/")
        path = ["ray", "tune"]
        valid_result = {}

        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if (isinstance(value, tuple(VALID_SUMMARY_TYPES))
                    and not np.isnan(value)):
                valid_result[full_attr] = value
                self._file_writer.add_scalar(full_attr, value,
                                             global_step=step)
            elif ((isinstance(value, list) and len(value) > 0)
                  or (isinstance(value, np.ndarray) and value.size > 0)):
                valid_result[full_attr] = value

                # Must be video
                if isinstance(value, np.ndarray) and value.ndim == 5:
                    self._file_writer.add_video(full_attr, value,
                                                global_step=step, fps=20)
                    continue

                try:
                    self._file_writer.add_histogram(full_attr, value,
                                                    global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        logger.warning(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value,
                                             type(self).__name__))

        self.last_result = valid_result
        self._file_writer.flush()

    def flush(self):
        if self._file_writer is not None:
            self._file_writer.flush()

    def close(self):
        if self._file_writer is not None:
            if self.trial and self.trial.evaluated_params and self.last_result:
                flat_result = flatten_dict(self.last_result, delimiter="/")
                scrubbed_result = {
                    k: value
                    for k, value in flat_result.items()
                    if isinstance(value, tuple(VALID_SUMMARY_TYPES))
                }
                self._try_log_hparams(scrubbed_result)
            self._file_writer.close()

    def _try_log_hparams(self, result):
        # TBX currently errors if the hparams value is None.
        flat_params = flatten_dict(self.trial.evaluated_params)
        scrubbed_params = {
            k: v
            for k, v in flat_params.items()
            if isinstance(v, self.VALID_HPARAMS)
        }

        removed = {
            k: v
            for k, v in flat_params.items()
            if not isinstance(v, self.VALID_HPARAMS)
        }
        if removed:
            logger.info(
                "Removed the following hyperparameter values when "
                "logging to tensorboard: %s", str(removed))

        from tensorboardX.summary import hparams
        try:
            experiment_tag, session_start_tag, session_end_tag = hparams(
                hparam_dict=scrubbed_params, metric_dict=result)
            self._file_writer.file_writer.add_summary(experiment_tag)
            self._file_writer.file_writer.add_summary(session_start_tag)
            self._file_writer.file_writer.add_summary(session_end_tag)
        except Exception:
            logger.exception("TensorboardX failed to log hparams. "
                             "This may be due to an unsupported type "
                             "in the hyperparameter values.")
class BaseAgent:
    def __init__(self, ENV, logdir_root='logs', n_experience_episodes=1,
                 gamma=0.999, epochs=1, lr=0.001, hidden_layer_neurons=128,
                 EPISODES=2000, eval_period=50, algorithm='REINFORCE',
                 noise=1.0, gif_to_board=False, fps=50, batch_size=128,
                 LOSS_CLIPPING=LOSS_CLIPPING, ENTROPY_LOSS=ENTROPY_LOSS):
        self.LOSS_CLIPPING = LOSS_CLIPPING
        self.ENTROPY_LOSS = ENTROPY_LOSS
        self.hidden_layer_neurons = hidden_layer_neurons
        self.batch_size = batch_size
        self.fps = fps
        self.gif_to_board = gif_to_board
        self.noise = noise
        self.last_eval = 0
        self.best_return = -np.inf
        self.eval_period = eval_period
        self.writer = None
        self.epsilon = 1e-12
        self.logdir_root = logdir_root
        self.EPISODES = EPISODES
        self.n_experience_episodes = n_experience_episodes
        self.episode = 0
        self.gamma = gamma
        self.epochs = epochs
        self.lr = lr
        self.logdir = self.get_log_name(ENV, algorithm, logdir_root)
        self.env = gym.make(ENV)
        if type(self.env.action_space) != gym.spaces.box.Box:
            self.nA = self.env.action_space.n
        else:
            print('Warning: the action space is continuous')
            self.nA = self.env.action_space.shape[0]
        self.logdir = self.logdir + '_' + str(self.noise)
        if type(self.env.observation_space) == gym.spaces.box.Box:
            self.nS = self.env.observation_space.shape[0]
        else:
            print('Warning: the observation space is not continuous')
        self.model_train, self.model_predict = self.get_policy_model(
            lr=lr, hidden_layer_neurons=hidden_layer_neurons,
            input_shape=[self.nS], output_shape=self.nA)
        state_space_samples = np.array(
            [self.env.observation_space.sample() for x in range(10000)])
        self.scaler = sklearn.preprocessing.StandardScaler()
        self.scaler.fit(state_space_samples)
        self.reset_env()

    def get_policy_model(self, lr=0.001, hidden_layer_neurons=128,
                         input_shape=[4], output_shape=2):
        pass

    def get_log_name(self, ENV, algorithm, logdir_root):
        name = logdir_root + '/'
        name += ENV + '/' + algorithm + '/'
        name += str(self.n_experience_episodes) + '_'
        name += str(self.epochs) + '_'
        name += str(self.batch_size) + '_'
        name += str(self.gamma) + '_'
        name += str(self.lr) + '_' + str(int(time()))
        return name

    def reset_env(self):
        # Increment the episode count
        self.episode += 1
        # Observe the first state
        self.observation = self.env.reset()
        # Reset the list of rewards
        self.reward = []

    def get_experience_episodes(self, return_ts=False):
        # Make sure the env has been reset before calling this function
        last_observations = []
        observations = []
        observations_list = []
        actions = []
        actions_list = []
        predictions = []
        predictions_list = []
        rewards = []
        rewards_list = []
        discounted_rewards = []
        episodes_returns = []
        episodes_lengths = []
        time_steps = []
        time_steps_list = []
        exp_episodes = 0
        ts_count = 0
        # Play n_experience_episodes episodes
        while exp_episodes < self.n_experience_episodes:
            # Get an action
            action, action_one_hot, prediction = self.get_action(eval=False)
            # Execute the action
            observation, reward, done, info = self.env.step(action)
            # Store the reward obtained by the action
            self.reward.append(reward)
            # Note that the previous observation is stored
            observations.append(self.observation)
            actions.append(action_one_hot)
            predictions.append(prediction.flatten())
            rewards.append(reward)
            self.observation = observation
            ts_count += 1
            time_steps.append(ts_count)
            if done:
                observations.append(self.observation)
                exp_episodes += 1
                discounted_reward = self.get_discounted_rewards(self.reward)
                discounted_rewards.append(
                    np.array(discounted_reward).reshape(-1, 1))
                rewards_list.append(np.array(rewards).reshape(-1, 1))
                observations_list.append(np.array(observations))
                actions_list.append(np.array(actions))
                predictions_list.append(np.array(predictions))
                time_steps_list.append(np.array(time_steps).reshape(-1, 1))
                ep_len = len(discounted_reward)
                episodes_lengths.append(ep_len)
                episodes_returns = episodes_returns + [discounted_reward[0]]
                last_observations.append(self.observation)
                self.reset_env()
                ts_count = 0
                rewards = []
                observations = []
                actions = []
                predictions = []
                time_steps = []
        if return_ts:
            return (observations_list, actions_list, predictions_list,
                    discounted_rewards, rewards_list,
                    np.array(episodes_returns), np.array(episodes_lengths),
                    time_steps_list)
        else:
            return (observations_list, actions_list, predictions_list,
                    discounted_rewards, rewards_list,
                    np.array(episodes_returns), np.array(episodes_lengths))

    def log_data(self, episode, loss, ep_len_mean, entropy, rv, actor_loss,
                 deltaT, ep_return, critic_loss=None):
        if self.writer is None:
            self.writer = SummaryWriter(self.logdir)
            print(f"run from the command line: "
                  f"tensorboard --logdir {self.logdir_root}/")
        print(f'\rEpisode: {episode}', end='')
        self.writer.add_scalar('loss', loss, episode)
        self.writer.add_scalar('episode_len', ep_len_mean, episode)
        if entropy is not None:
            self.writer.add_scalar('entropy', entropy, episode)
        self.writer.add_scalar('running_var', rv, episode)
        self.writer.add_scalar('episode_return', ep_return, episode)
        if actor_loss is not None:
            self.writer.add_scalar('actor_loss', actor_loss, episode)
        self.writer.add_scalar('time', deltaT, episode)
        if critic_loss is not None:
            self.writer.add_scalar('critic_loss', critic_loss, episode)
        if self.episode - self.last_eval >= self.eval_period:
            if self.gif_to_board:
                (obs, actions, preds, disc_sum_rews, rewards, ep_returns,
                 ep_len, frames) = self.get_eval_episode(
                     return_frames=self.gif_to_board)
            else:
                (obs, actions, preds, disc_sum_rews, rewards, ep_returns,
                 ep_len) = self.get_eval_episode(
                     return_frames=self.gif_to_board)
            if self.best_return <= ep_returns[-1]:
                self.best_weights = self.model_predict.get_weights()
                self.model_predict.save(self.logdir + '.hdf5')
                print()
                print(f'Model on episode {self.episode - 1} improved from '
                      f'{self.best_return} to {ep_returns[-1]}. Saved!')
                self.best_return = ep_returns[-1]
                if self.gif_to_board:
                    video = frames.reshape((1, ) + frames.shape)
                    gif_name = self.logdir.replace('logs/', '').replace(
                        '/', '_') + '_' + str(self.episode) + '_' + str(
                            int(self.best_return * 100) / 100)
                    # roll the channel axis forward: (B, T, H, W, C) -> (B, T, C, H, W)
                    self.writer.add_video(gif_name, np.rollaxis(video, 4, 2),
                                          fps=self.fps)
            else:
                print()
                print(f'Model on episode {self.episode - 1} did not improve '
                      f'({ep_returns[-1]}). Best saved: {self.best_return}')
                # print('Loading best_weights model')
                # self.model_predict.set_weights(self.best_weights)
            self.writer.add_scalar('eval_episode_steps', len(obs),
                                   self.episode)
            self.writer.add_scalar('eval_episode_return', ep_returns[-1],
                                   episode)
            self.last_eval = self.episode
        self.writer.flush()

    def get_eval_episode(self, gif_name=None, fps=50, return_frames=False):
        frames = []
        self.reset_env()
        observations = []
        actions = []
        predictions = []
        rewards = []
        discounted_rewards = []
        episodes_returns = []
        episodes_lengths = []
        exp_episodes = 0
        if gif_name is not None or return_frames:
            frames.append(self.env.render(mode='rgb_array'))
        while True:
            # Play episodes until a minimum buffer size is gathered
            action, action_one_hot, prediction = self.get_action(eval=True)
            observation, reward, done, info = self.env.step(action)
            self.reward.append(reward)
            # Note that the previous observation is stored
            observations.append(self.observation)
            actions.append(action_one_hot)
            predictions.append(prediction.flatten())
            rewards.append(reward)
            self.observation = observation
            if gif_name is not None or return_frames:
                frames.append(self.env.render(mode='rgb_array'))
            if done:
                exp_episodes += 1
                discounted_reward = self.get_discounted_rewards(self.reward)
                discounted_rewards = np.hstack(
                    [discounted_rewards, discounted_reward])
                ep_len = len(discounted_reward)
                episodes_lengths.append(ep_len)
                episodes_returns = episodes_returns + \
                    [discounted_reward[0]] * ep_len
                self.reset_env()
                if gif_name is not None:
                    clip = mpy.ImageSequenceClip(frames, fps=fps)
                    clip.write_gif(gif_name, fps=fps, verbose=False,
                                   logger=None)
                if return_frames:
                    return (np.array(observations), np.array(actions),
                            np.array(predictions),
                            np.array(discounted_rewards), np.array(rewards),
                            np.array(episodes_returns),
                            np.array(episodes_lengths), np.array(frames))
                return (np.array(observations), np.array(actions),
                        np.array(predictions), np.array(discounted_rewards),
                        np.array(rewards), np.array(episodes_returns),
                        np.array(episodes_lengths))
import torch
from tensorboardX import SummaryWriter
from torchvision import datasets

writer = SummaryWriter()

dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
writer.add_embedding(features, global_step=1, tag='noMetadata')

dataset = datasets.MNIST('mnist', train=True, download=True)
images_train = dataset.train_data[:100].float()
labels_train = dataset.train_labels[:100]
features_train = images_train.view(100, 784)

all_features = torch.cat((features, features_train))
all_labels = torch.cat((label, labels_train))
all_images = torch.cat((images, images_train))
dataset_label = ['test'] * 100 + ['train'] * 100
all_labels = list(zip(all_labels, dataset_label))

writer.add_embedding(all_features,
                     metadata=all_labels,
                     label_img=all_images.unsqueeze(1),
                     metadata_header=['digit', 'dataset'],
                     global_step=2)

# VIDEO
vid_images = dataset.train_data[:16 * 48]
# BxCxTxHxW; note that newer tensorboardX releases expect (N, T, C, H, W)
vid = vid_images.view(16, 1, 48, 28, 28)

writer.add_video('video', vid_tensor=vid)
writer.close()
val_dataloader = DataLoader(val_dataset,
                            batch_size=args.bs,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

if args.ckpt:
    pass
else:
    # save graph and clips_order samples
    for data in train_dataloader:
        # tuple_clips, tuple_orders, tuple_clips_random, tuple_orders_random, idx = data
        tuple_clips, tuple_orders, idx = data
        for i in range(args.tl):
            writer.add_video('train/tuple_clips',
                             tuple_clips[:, i, :, :, :, :], i, fps=8)
            writer.add_text('train/tuple_orders',
                            str(tuple_orders[:, i].tolist()), i)
        tuple_clips = tuple_clips.to(device)
        # writer.add_graph(tcg, tuple_clips)
        break
    # save init params at step 0
    for name, param in tcg.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

n_data = len(train_dataset)
torch.backends.cudnn.benchmark = True

### loss function, optimizer and scheduler ###
class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        self._n_logged_samples = n_logged_samples
        if summary_writer is not None:
            self._summ_writer = summary_writer
        else:
            self._summ_writer = SummaryWriter(log_dir)

    def _loop_batch(self, fn, name, val, *argv, **kwargs):
        """Loops the logging function n times."""
        for log_idx in range(min(self._n_logged_samples, len(val))):
            name_i = os.path.join(name, "_%d" % log_idx)
            fn(name_i, val[log_idx], *argv, **kwargs)

    @staticmethod
    def _check_size(val, size):
        if isinstance(val, torch.Tensor) or isinstance(val, np.ndarray):
            assert len(val.shape) == size, \
                "Size of tensor does not fit required size, {} vs {}".format(
                    len(val.shape), size)
        elif isinstance(val, list):
            assert len(val[0].shape) == size - 1, \
                "Size of list element does not fit required size, {} vs {}".format(
                    len(val[0].shape), size - 1)
        else:
            raise NotImplementedError(
                "Input type {} not supported for dimensionality check!".format(
                    type(val)))
        if (val[0].shape[1] > 10000) or (val[0].shape[2] > 10000):
            raise ValueError("This might be a bit too much")

    def log_scalar(self, scalar, name, step, phase):
        self._summ_writer.add_scalar('{}_{}'.format(name, phase), scalar,
                                     step)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_images(self, image, name, step, phase):
        self._check_size(image, 4)  # [N, C, H, W]
        self._loop_batch(self._summ_writer.add_image,
                         '{}_{}'.format(name, phase), image, step)

    def log_video(self, video_frames, name, step, phase):
        assert len(video_frames.shape) == 4, \
            "Need [T, C, H, W] input tensor for single video logging!"
        if not isinstance(video_frames, torch.Tensor):
            video_frames = torch.tensor(video_frames)
        # tbX requires [C, T, H, W]
        video_frames = torch.transpose(video_frames, 0, 1)
        # add an extra dimension to get a grid of size 1
        video_frames = video_frames.unsqueeze(0)
        self._summ_writer.add_video('{}_{}'.format(name, phase),
                                    video_frames, step)

    def log_videos(self, video_frames, name, step, phase, fps=3):
        assert len(video_frames.shape) == 5, \
            "Need [N, T, C, H, W] input tensor for video logging!"
        # add an extra dimension after batch to get a grid of size 1
        video_frames = video_frames.unsqueeze(1)
        self._loop_batch(self._summ_writer.add_video,
                         '{}_{}'.format(name, phase), video_frames, step,
                         fps=fps)

    def log_image(self, images, name, step, phase):
        self._summ_writer.add_image('{}_{}'.format(name, phase), images, step)

    def log_image_grid(self, images, name, step, phase, nrow=8):
        assert len(images.shape) == 4, \
            "Image grid logging requires input shape [batch, C, H, W]!"
        img_grid = torchvision.utils.make_grid(images, nrow=nrow)
        # make_grid returns a single [C, H, W] image, so log it directly
        self.log_image(img_grid, name, step, phase)

    def log_video_grid(self, video_frames, name, step, phase, fps=3):
        assert len(video_frames.shape) == 5, \
            "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}_{}'.format(name, phase),
                                    video_frames, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, \
            "Figure logging requires input shape [batch x figures]!"
        self._loop_batch(self._summ_writer.add_figure,
                         '{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure,
                                     step)

    def log_graph(self, array, name, step, phase):
        """array: plot data to render as an image"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") \
            if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)
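# Hedged example for the single-video path above: log_video takes one clip
# as [T, C, H, W] and reorders it internally before handing it to
# tensorboardX (random tensor, purely illustrative):
import torch

lg = Logger('./logs')
clip = torch.rand(30, 3, 64, 64)  # 30 RGB frames
lg.log_video(clip, 'rollout', step=0, phase='val')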
seq_test, gt_seq_test = seq_test.to(device), gt_seq_test.to(device)
# feed into the model for inference
test_output = model(seq_test, future=num_frame)
# compute the losses
test_loss = loss_L1_L2(test_output[:, -num_frame:, :, :, :],
                       gt_seq_test[:, -num_frame:, :, :, :])
test_metric = loss_SSIM(test_output[:, -num_frame:, :, :, :],
                        gt_seq_test[:, -num_frame:, :, :, :])
step_time = time.time() - step_time

# store the useful information in TensorBoard
if (step + 1) % print_freq == 0:
    writer.add_video('train_seq/feed_seq', seq,
                     epoch * train_lenth + step + 1)
    writer.add_video('train_seq/gt_seq', seq_target,
                     epoch * train_lenth + step + 1)
    writer.add_video('train_seq/pred_seq', layer_output,
                     epoch * train_lenth + step + 1)
    writer.add_video('test_seq/feed_seq', seq_test,
                     epoch * train_lenth + step + 1)
    writer.add_video('test_seq/gt_seq', gt_seq_test,
                     epoch * train_lenth + step + 1)
    writer.add_video('test_seq/pred_seq', test_output,
                     epoch * train_lenth + step + 1)
    writer.add_scalars(
        'loss/merge', {
            "train_loss": train_loss,
            "test_loss": test_loss,
            "train_metric": train_metric,
            "test_metric": test_metric,
        }, epoch * train_lenth + step + 1)
def train_deepq(name, env, nb_actions, Q_network, preprocess_fn=None,
                batch_size=32, replay_start_size=50000,
                replay_memory_size=50000, agent_history_length=4,
                target_network_update_frequency=10000, discount_factor=0.99,
                learning_rate=1e-5, update_frequency=4,
                initial_exploration=1, final_exploration=0.1,
                final_exploration_step=int(1e6), nb_timesteps=int(1e7),
                tensorboard_freq=50, demo_tensorboard=False):
    # SAVE/LOAD MODEL
    DIRECTORY_MODELS = './models/'
    if not os.path.exists(DIRECTORY_MODELS):
        os.makedirs(DIRECTORY_MODELS)
    PATH_SAVE = DIRECTORY_MODELS + name + '_' + time.strftime('%Y%m%d-%H%M')

    # GPU/CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('RUNNING ON', device)

    # TENSORBOARDX
    writer = SummaryWriter(comment=name)

    replay_memory = init_replay_memory(env, replay_memory_size,
                                       replay_start_size, preprocess_fn)

    print('#### TRAINING ####')
    print('see more details on tensorboard')

    done = True  # reset environment
    eps_schedule = ScheduleExploration(initial_exploration,
                                       final_exploration,
                                       final_exploration_step)
    Q_network = Q_network.to(device)
    Q_hat = copy.deepcopy(Q_network).to(device)
    loss = SmoothL1Loss()
    optimizer = RMSprop(Q_network.parameters(), lr=learning_rate, alpha=0.95,
                        eps=0.01, centered=True)

    episode = 1
    rewards_episode, total_reward_per_episode = list(), list()
    for timestep in tqdm(range(nb_timesteps)):
        # if an episode has ended
        if done:
            total_reward_per_episode.append(np.sum(rewards_episode))
            rewards_episode = list()

            phi_t = env.reset()
            if preprocess_fn:
                phi_t = preprocess_fn(phi_t)

            if episode % tensorboard_freq == 0:
                assert len(total_reward_per_episode) == tensorboard_freq
                # tensorboard
                writer.add_scalar('rewards/train_reward',
                                  np.mean(total_reward_per_episode), episode)
                total_reward_per_episode = list()
                writer.add_scalar('other/replay_memory_size',
                                  len(replay_memory), episode)
                writer.add_scalar('other/eps_exploration',
                                  eps_schedule.get_eps(), episode)
                if demo_tensorboard:
                    demos, demo_rewards = play(env, Q_network, preprocess_fn,
                                               nb_episodes=1,
                                               eps=eps_schedule.get_eps())
                    writer.add_scalar('rewards/demo_reward',
                                      np.mean(demo_rewards), episode)
                    for demo in demos:
                        demo = demo.permute([3, 0, 1, 2]).unsqueeze(0)
                        writer.add_video(name, demo, episode, fps=25)

                # save model
                torch.save(Q_network.state_dict(), PATH_SAVE)
            episode += 1

        a_t = get_action(phi_t, env, Q_network, eps_schedule)
        phi_t_1, r_t, done, info = env.step(a_t)
        rewards_episode.append(r_t)
        if preprocess_fn:
            phi_t_1 = preprocess_fn(phi_t_1)
        replay_memory.push([phi_t, a_t, r_t, phi_t_1, done])
        phi_t = phi_t_1

        # training
        if timestep % update_frequency == 0:
            # get training data
            phi_t_training, actions_training, y = get_training_data(
                Q_hat, replay_memory, batch_size, discount_factor)
            # forward
            phi_t_training = phi_t_training.to(device)
            Q_values = Q_network(phi_t_training)
            mask = torch.zeros([batch_size, nb_actions]).to(device)
            for j in range(len(actions_training)):
                mask[j, actions_training[j]] = 1
            Q_values = Q_values * mask
            Q_values = torch.sum(Q_values, dim=1)
            output = loss(Q_values, y)
            # backward and gradient descent
            optimizer.zero_grad()
            output.backward()
            optimizer.step()

        if timestep % target_network_update_frequency == 0:
            Q_hat = copy.deepcopy(Q_network).to(device)
class Logger(object):
    def __init__(self, log_folder, tensorboard_dir, log_interval):
        log_file = str(log_folder / 'log')
        self.logger = logzero.setup_logger(
            name='main',
            logfile=log_file,
            level=20,
            fileLoglevel=10,
            formatter=None,
        )

        self.metrics = {
            "epoch": 0,
            "iteration": 1,
            "loss_gen": 0.,
            "loss_idis": 0.,
            "loss_vdis": 0.,
            "elapsed_time": 0,
        }
        self.log_interval = log_interval
        self.writer = SummaryWriter(str(tensorboard_dir))
        self.start_time = time.time()

        self.display_metric_names()

    def display_metric_names(self):
        log_string = ""
        for name in self.metrics.keys():
            log_string += "{:>12} ".format(name)
        self.logger.info(log_string)

    def init(self):
        targets = ["loss_gen", "loss_idis", "loss_vdis"]
        for name in targets:
            self.metrics[name] = 0.

    def update(self, name, value):
        self.metrics[name] += value

    def log(self):
        # display and save logs
        self.metrics["elapsed_time"] = time.time() - self.start_time

        metric_strings = []
        for name, value in self.metrics.items():
            if name in ["epoch", "iteration"]:
                s = "{}".format(value)
            elif name in ["loss_gen", "loss_idis", "loss_vdis"]:
                s = "{:0.3f}".format(value / self.log_interval)
            elif name in ["elapsed_time"]:
                value = int(value)
                s = "{:02d}:{:02d}:{:02d}".format(
                    value // 3600, (value % 3600) // 60, value % 60)
            else:
                raise Exception("Unsupported metric is added")
            metric_strings.append(s)

        log_string = ""
        for s in metric_strings:
            log_string += "{:>12} ".format(s)
        self.logger.info(log_string)

    def tf_log(self):
        step = self.metrics["iteration"]
        for name in ["loss_gen", "loss_idis", "loss_vdis"]:
            value = self.metrics[name] / self.log_interval
            self.writer.add_scalar(name, value, step)

    def tf_log_video(self, name, videos, step):
        self.writer.add_video(name, videos, fps=8, global_step=step)

    def tf_log_histogram(self, var, tag, step):
        var = var.clone().cpu().data.numpy()
        self.writer.add_histogram(tag, var, step)
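# Illustrative driver loop for the Logger above (hypothetical values;
# log_folder is assumed to be a pathlib.Path since it is joined with '/'):
from pathlib import Path

lg = Logger(Path('./out'), './out/tb', log_interval=100)
for it in range(1, 201):
    lg.update("loss_gen", 0.5)
    lg.update("loss_idis", 0.3)
    lg.update("loss_vdis", 0.2)
    if it % 100 == 0:
        lg.metrics["iteration"] = it
        lg.log()      # pretty-print the averaged metrics
        lg.tf_log()   # write them to TensorBoard
        lg.init()     # reset the loss accumulators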
class FPTrainer():
    def __init__(self, opt):
        self.opt = opt
        self.save_dir = opt.log_dir
        self.dataset = opt.dataset
        self.batch_size = opt.batch_size
        self.patch_size = opt.patch_size
        self.dtype = torch.cuda.FloatTensor
        self.start_epoch = 0
        self.eta = 1.0
        self.total_iter = 0
        num_iters = int(opt.total_epoch * opt.epoch_size / 2)
        self.delta = float(1) / num_iters
        self.seq_len = opt.seq_len
        self.pre_len = opt.pre_len
        self.eval_len = opt.eval_len
        self.input_nc = opt.input_nc
        self.output_nc = opt.output_nc
        self.epoch_size = opt.epoch_size
        self.shape = [
            int(opt.image_width / opt.patch_size),
            int(opt.image_height / opt.patch_size)
        ]
        self.rnn_size = opt.rnn_size
        self.rnn_nlayer = opt.rnn_nlayer
        self.filter_size = opt.filter_size
        ic = self.input_nc * opt.patch_size**2
        oc = self.output_nc * opt.patch_size**2

        # ---------------- visualization with tensorboardX ----------------
        train_log_dir = os.path.join(self.save_dir, 'runs/train')
        if not os.path.exists(train_log_dir):
            os.makedirs(train_log_dir)
        self.writer_train = SummaryWriter(log_dir=train_log_dir)
        test_log_dir = os.path.join(self.save_dir, 'runs/test')
        if not os.path.exists(test_log_dir):
            os.makedirs(test_log_dir)
        self.writer_test = SummaryWriter(log_dir=test_log_dir)

        # setting dataset
        train_data, valid_data, test_data = utils.load_dataset(opt)
        self.train_loader = DataLoader(train_data,
                                       num_workers=opt.data_threads,
                                       batch_size=opt.batch_size,
                                       shuffle=True,
                                       drop_last=True,
                                       pin_memory=True)
        self.test_loader = DataLoader(test_data,
                                      num_workers=opt.data_threads,
                                      batch_size=opt.batch_size,
                                      shuffle=False,
                                      drop_last=False,
                                      pin_memory=True)
        self.valid_loader = DataLoader(valid_data,
                                       num_workers=opt.data_threads,
                                       batch_size=opt.batch_size,
                                       shuffle=False,
                                       drop_last=False,
                                       pin_memory=True)

        def get_training_batch():
            while True:
                for sequence in self.train_loader:
                    batch = utils.normalize_data(opt, self.dtype, sequence)
                    yield batch

        self.training_batch_generator = get_training_batch()

        def get_testing_batch():
            while True:
                for sequence in self.test_loader:
                    batch = utils.normalize_data(opt, self.dtype, sequence)
                    yield batch

        self.testing_batch_generator = get_testing_batch()

        # set model
        self.model = get_convrnn_model(opt.model,
                                       input_chans=ic,
                                       output_chans=oc,
                                       hidden_size=self.rnn_size,
                                       filter_size=self.filter_size,
                                       num_layers=self.rnn_nlayer,
                                       img_size=opt.image_height //
                                       opt.patch_size)
        self.model.cuda()

        # set optimizer
        if opt.optimizer == 'adam':
            optimizer = optim.Adam
        elif opt.optimizer == 'rmsprop':
            optimizer = optim.RMSprop
        elif opt.optimizer == 'sgd':
            optimizer = optim.SGD
        elif opt.optimizer == 'adamw':
            optimizer = optim.AdamW
        else:
            raise ValueError('Unknown optimizer: %s' % opt.optimizer)
        self.optimizer = optimizer(self.model.parameters(), lr=opt.lr,
                                   betas=(opt.beta1, 0.999))
        self.scheduler = utils.get_scheduler(
            self.optimizer, self.opt, (opt.total_epoch - self.start_epoch))

        # load model
        if opt.resume:
            if not os.path.isfile(opt.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    opt.resume))
            checkpoint = torch.load(opt.resume)
            self.start_epoch = checkpoint['epoch']
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.eta = checkpoint['eta']
            self.total_iter = self.start_epoch * self.epoch_size
            print("=> loaded checkpoint '{}' (epoch {})".format(
                opt.resume, checkpoint['epoch']))

        # criterion
        if opt.criterion == "L1":
            self.criterion = torch.nn.L1Loss(size_average=False,
                                             reduce=True).cuda()
            print('The criterion type is L1!')
        elif opt.criterion == "BCE":
            self.criterion = torch.nn.BCELoss(size_average=False,
                                              reduce=True).cuda()
            print('The criterion type is BCE!')
        elif opt.criterion == "MSE&L1":
            self.criterion = MSEL1Loss(size_average=False, reduce=True,
                                       alpha=1.0).cuda()
            print('The criterion type is MSE + L1!')
        else:
            self.criterion = torch.nn.MSELoss(size_average=False,
                                              reduce=True).cuda()
            print('The criterion type is MSE!')

        # Print networks
        print('---------- Networks initialized -------------')
        utils.print_network(self.model, opt.model)

    def name(self):
        return 'Frame Prediction Trainer'

    def set_input(self, input):
        # X: len, batchsize, inchains, size(0), size(1)
        if self.patch_size > 1:
            x = [utils.reshape_patch(img, self.patch_size) for img in input]
        else:
            x = input
        reverse_x = x[::-1]
        random_flip = np.random.random_sample(
            (self.pre_len - 1, self.batch_size))
        true_token = (random_flip < self.eta)
        one = torch.FloatTensor(1, x[0].size(1), x[0].size(2),
                                x[0].size(3)).fill_(1.0).cuda()
        zero = torch.FloatTensor(1, x[0].size(1), x[0].size(2),
                                 x[0].size(3)).fill_(0.0).cuda()
        masks = []
        for t in range(self.pre_len - 1):
            masks_b = []
            for i in range(self.batch_size):
                if true_token[t, i]:
                    masks_b.append(one)
                else:
                    masks_b.append(zero)
            mask = torch.cat(masks_b, 0)  # along batchsize
            masks.append(mask)
        return x, reverse_x, masks

    def forward(self, x, mask):
        gen_ims = []
        x_gen = self.model(x[0], init_hidden=True)
        for t in range(1, self.seq_len + self.pre_len - 1):
            if t < self.seq_len:
                inputs = x[t]
            else:
                inputs = mask[t - self.seq_len] * x[t] + (
                    1 - mask[t - self.seq_len]) * x_gen
            x_gen = self.model(inputs, init_hidden=False)
            gen_ims.append(x_gen)
        gen_ims = torch.stack(gen_ims, 1)
        images = torch.stack(x[2:], 1)
        loss = self.criterion(gen_ims, images)
        loss /= 2.0
        return loss

    def save_checkpoint(self, checkpoint, network_label, epoch_label):
        save_filename = '%s_%s_net_%s.pth.tar' % (self.dataset,
                                                  network_label, epoch_label)
        save_path = os.path.join(self.save_dir, save_filename)
        torch.save(checkpoint, save_path)

    def train_epoch(self, epoch):
        self.model.train()
        # epoch_rec = 0.0
        info_dict = {'loss': 0.0}
        describe = '[' + self.opt.model + ',' + self.opt.dataset + ']:' + \
            'Epoch ' + str(epoch)
        pbar = tqdm(total=self.epoch_size, desc=describe)
        for i in range(self.epoch_size):
            x = next(self.training_batch_generator)
            loss = self.optimize_parameters(x)
            # epoch_rec += loss
            with open(os.path.join(
                    self.save_dir, 'train_loss_%s_%s.txt' %
                    (self.opt.model, self.opt.dataset)), mode='a') as f:
                f.write('%0.8f \n' % (loss))
            self.total_iter += 1
            self.writer_train.add_scalar('Train/loss', loss, self.total_iter)
            self.writer_train.add_scalar('Train/Eta', self.eta,
                                         self.total_iter)
            self.eta -= self.delta
            self.eta = max(self.eta, 0.0)
            info_dict['loss'] = loss
            pbar.set_postfix(info_dict)
            pbar.update(1)
        pbar.close()
        # save epoch
        self.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'eta': self.eta,
            }, self.opt.model, 'last')
        self.scheduler.step()
        lr = self.scheduler.get_last_lr()[0]
        print('learning rate = %.7f' % lr)

    def test(self, epoch):
        self.model.eval()
        rec = 0
        result_path = os.path.join(self.save_dir, 'results', str(epoch))
        if not os.path.exists(result_path):
            os.makedirs(result_path)
        psnr = np.zeros(self.eval_len)
        ssim = np.zeros(self.eval_len)
        mae = np.zeros(self.eval_len)
        sharp = np.zeros(self.eval_len)
        mse = np.zeros(self.eval_len)
        index = 0
        total_index = 0
        describe = '[Testing]:Epoch ' + str(epoch)
        pbar = tqdm(total=len(self.test_loader), desc=describe)
        for batch in self.test_loader:
            x = utils.normalize_data(self.opt, self.dtype, batch)
            rec += self.evaluation(x)
            index += 1
            total_index += x[0].size(0)  # bs
            gt = []
            pred = []
            for i in range(self.eval_len):
                x1 = x[i + self.seq_len].data.cpu().numpy()
                x2 = self.preds[i + self.seq_len].data.cpu().numpy()
                gt.append(x1)
                pred.append(x2)
            mse_, mae_, ssim_, psnr_, sharp_ = utils.eval_seq_batch(gt, pred)
            mse += mse_
            mae += mae_
            ssim += ssim_
            psnr += psnr_
            sharp += sharp_
            if index < 11:
                path = os.path.join(result_path, str(index))
                if not os.path.exists(path):
                    os.mkdir(path)
                for i in range(self.seq_len + self.eval_len):
                    name = 'gt' + str(i + 1) + '.png'
                    file_name = os.path.join(path, name)
                    img_gt = x[i][0].data.cpu()
                    img_gt = img_gt.transpose(0, 1).transpose(1, 2).numpy()
                    img_gt = np.uint8(img_gt * 255)
                    if 2 in img_gt.shape:
                        cv2.imwrite(file_name, img_gt[:, :, :1])
                        continue
                    cv2.imwrite(file_name, img_gt)
                for i in range(self.eval_len):
                    name = 'pd' + str(i + self.seq_len + 1) + '.png'
                    file_name = os.path.join(path, name)
                    img_pd = self.preds[i + self.seq_len][0].data.cpu()
                    img_pd = img_pd.transpose(0, 1).transpose(1, 2).clamp(
                        0, 1).numpy()
                    img_pd = np.uint8(img_pd * 255)
                    if 2 in img_pd.shape:
                        cv2.imwrite(file_name, img_pd[:, :, :1])
                        continue
                    cv2.imwrite(file_name, img_pd)
            if index == 1:
                gt = torch.stack(x, dim=1)  # B, T, C, H, W
                pd = torch.stack(self.preds, dim=1)
                gif = torch.cat([gt, pd], dim=0)  # cat along batch
                gif = gif.data.cpu().clamp(0, 1)
                self.writer_test.add_video('Test/gt&pred', gif, epoch)
            pbar.update(1)
        pbar.close()
        rec = rec / index
        mse /= total_index
        mae /= total_index
        ssim /= total_index
        psnr /= total_index
        sharp /= total_index
        # ----------- log the frame-wise measurement
        with open(os.path.join(self.save_dir,
                               'test_result_%s.txt' % (self.dataset)),
                  mode='a') as f:
            f.write('####################### frame-wise results at '
                    'epoch: %04d ####################### \n' % (epoch))
            f.write('- mse: mean %04f -' % (np.mean(mse)))
            for t in range(self.eval_len):
                f.write('-[%d: %04f]-' % (t + self.pre_len, mse[t]))
            f.write('\n')
            f.write('- mae: mean %04f -' % (np.mean(mae)))
            for t in range(self.eval_len):
                f.write('[%d: %04f] ' % (t + self.pre_len, mae[t]))
            f.write('\n')
            f.write('- ssim: mean %04f -' % (np.mean(ssim)))
            for t in range(self.eval_len):
                f.write('[%d: %04f] ' % (t + self.pre_len, ssim[t]))
            f.write('\n')
            f.write('- psnr: mean %04f -' % (np.mean(psnr)))
            for t in range(self.eval_len):
                f.write('[%d: %04f] ' % (t + self.pre_len, psnr[t]))
            f.write('\n')
            f.write('- sharp: mean %04f -' % (np.mean(sharp)))
            for t in range(self.eval_len):
                f.write('[%d: %04f] ' % (t + self.pre_len, sharp[t]))
            f.write('\n')
        self.writer_test.add_scalar('Test/rec_loss', rec, epoch)
        self.writer_test.add_scalar('Test/mse', np.mean(mse), epoch)
        self.writer_test.add_scalar('Test/mae', np.mean(mae), epoch)
        self.writer_test.add_scalar('Test/ssim', np.mean(ssim), epoch)
        self.writer_test.add_scalar('Test/psnr', np.mean(psnr), epoch)
        self.writer_test.add_scalar('Test/sharp', np.mean(sharp), epoch)

    def evaluation(self, x):
        # patch
        if self.patch_size > 1:
            x = [utils.reshape_patch(img, self.patch_size) for img in x]
        loss = 0.0
        self.preds = [x[0], x[1]]
        x_gen = self.model(x[0], init_hidden=True)
        gen_ims = []
        for t in range(1, self.seq_len + self.eval_len - 1):
            if t < self.seq_len:
                inputs = x[t]
            else:
                inputs = x_gen
            x_gen = self.model(inputs, init_hidden=False)
            gen_ims.append(x_gen)
            if t < self.seq_len - 1:
                self.preds.append(x[t + 1])
            else:
                self.preds.append(x_gen)
        # unpatch
        if self.patch_size > 1:
            self.preds = [
                utils.reshape_patch_back(img, self.patch_size)
                for img in self.preds
            ]
        gen_ims = torch.stack(gen_ims, 1)
        images = torch.stack(x[2:], 1)
        loss = self.criterion(gen_ims, images)
        loss /= 2.0  # for consistency with tensorflow
        return loss.detach().to("cpu").item() / (
            self.batch_size * (self.seq_len + self.eval_len - 2))

    def optimize_parameters(self, x):
        x, x_rev, mask = self.set_input(x)
        self.optimizer.zero_grad()
        self.loss_1 = self.forward(x, mask)
        self.loss_1.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.loss_2 = self.forward(x_rev, mask)
        self.loss_2.backward()
        self.optimizer.step()
        loss = (self.loss_1 + self.loss_2) / 2
        return loss.detach().to("cpu").item()

    def finish(self):
        self.preds = []
        self.writer_train.export_scalars_to_json(
            os.path.join(self.save_dir, 'runs', 'train_all_scalars.json'))
        self.writer_train.close()
        self.writer_test.export_scalars_to_json(
            os.path.join(self.save_dir, 'runs', 'test_all_scalars.json'))
        self.writer_test.close()
def main2():
    model = c3d.C3D(with_classifier=True, num_classes=101)
    start_epoch = 1
    # load 16 compressed video frames
    train_coviar = CoviarData(
        '/data2/fb/project/pytorch-coviar-master/data/ucf101/mpeg4_videos',
        'ucf101',
        '/data2/fb/project/pytorch-coviar-master/data/datalists/ucf101_split1_train.txt',
        'residual', get_augmentation(), 4, 1, True)
    val_size = 400
    train_dataset, val_dataset = random_split(
        train_coviar, (len(train_coviar) - val_size, val_size))
    print("num_workers:{:d}".format(params['num_workers']))
    print("batch_size:{:d}".format(params['batch_size']))
    train_loader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=True,
                              num_workers=params['num_workers'])
    val_loader = DataLoader(val_dataset,
                            batch_size=params['batch_size'],
                            shuffle=True,
                            num_workers=params['num_workers'])
    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=params['learning_rate'],
                          momentum=params['momentum'],
                          weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                     min_lr=1e-5,
                                                     patience=20, factor=0.1)
    model_save_dir = os.path.join(save_path, '_',
                                  time.strftime('%m-%d-%H-%M'))
    writer = SummaryWriter(model_save_dir)
    for data in train_loader:
        clip, label = data
        writer.add_video('train/clips', clip, 0, fps=8)
        writer.add_text('train/idx', str(label.tolist()), 0)
        clip = clip.cuda()
        break
    for name, param in model.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    prev_best_val_loss = float('inf')
    prev_best_loss_model_path = None
    prev_best_acc_model_path = None
    best_acc = 0
    best_epoch = 0
    for epoch in tqdm(range(start_epoch, start_epoch + params['epoch_num'])):
        # train() is assumed to be the per-epoch training loop defined
        # elsewhere in this script (calling the `train_coviar` dataset
        # variable here would fail)
        train(train_loader, model, criterion, optimizer, epoch,
              params['learning_rate'])
        val_loss, top1_avg = validation(val_loader, model, criterion,
                                        optimizer, epoch)
        if top1_avg >= best_acc:
            best_acc = top1_avg
            best_epoch = epoch
            model_path = os.path.join(
                model_save_dir, 'best_acc_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_acc_model_path = model_path
        if val_loss < prev_best_val_loss:
            model_path = os.path.join(
                model_save_dir, 'best_loss_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_val_loss = val_loss
            prev_best_loss_model_path = model_path
        scheduler.step(val_loss)
        if epoch % 20 == 0:
            checkpoints = os.path.join(model_save_dir,
                                       str(epoch) + ".pth.tar")
            torch.save(model.state_dict(), checkpoints)
            print("save_to:", checkpoints)
    print("best is :", best_acc, best_epoch)
writer.add_embedding(features, global_step=1, tag='noMetadata')

# run from the PyTorch_Tutorial/Code directory:
# dataset = datasets.MNIST(os.path.join("..", "..", "Data", "mnist"),
# run from the PyTorch_Tutorial directory:
dataset = datasets.MNIST(os.path.join(".", "Data", "mnist"),
                         train=True,
                         download=True)
images_train = dataset.train_data[:100].float()
labels_train = dataset.train_labels[:100]
features_train = images_train.view(100, 784)

all_features = torch.cat((features, features_train))
all_labels = torch.cat((label, labels_train))
all_images = torch.cat((images, images_train))
dataset_label = ['test'] * 100 + ['train'] * 100
all_labels = list(zip(all_labels, dataset_label))

writer.add_embedding(all_features,
                     metadata=all_labels,
                     label_img=all_images.unsqueeze(1),
                     metadata_header=['digit', 'dataset'],
                     global_step=2)

# VIDEO
vid_images = dataset.train_data[:16 * 48]
vid = vid_images.view(16, 1, 48, 28, 28)  # BxCxTxHxW

writer.add_video('video', vid_tensor=vid)
writer.add_video('video_1_fps', vid_tensor=vid, fps=1)
writer.close()
import torch
from tensorboardX import SummaryWriter
import torchvision

logger = SummaryWriter(comment='feature_vis')
feature = torch.load('pev_feature_bpi3d_rgb_result.pt',
                     map_location=lambda storage, loc: storage)
for k, v in feature.items():
    # normalize each feature map into [0, 1]
    vmin = v.min()
    vmax = v.max()
    v.clamp_(min=vmin, max=vmax)
    v.add_(-vmin).div_(vmax - vmin + 1e-5)
    imgs = v.permute(1, 0, 2, 3).reshape(-1, 1, 14, 14)
    logger.add_image('feature/img',
                     torchvision.utils.make_grid(imgs, nrow=8), k.item())
    v = v.view(64, 8, 1, 14, 14)
    logger.add_video('feature/video', v, k.item())
# create a video every 100 episodes
if (i_episode % 100) == 0:
    policy_net.eval()
    env.reset()
    current_screen = get_screen()
    state = current_screen
    episode_video_frames = []
    for t in count():
        action = select_action(state, update_step=False)
        _, _, done = env.step(action.item())
        obs = get_screen()
        episode_video_frames.append(obs.cpu().numpy())
        if done or t > 3000:
            break
    # frames stack to (T, C, H, W); add_video here consumes (B, C, T, H, W)
    stacked_frames = np.stack(episode_video_frames).transpose(1, 0, 2, 3)
    stacked_frames = np.expand_dims(stacked_frames, 0)
    writer.add_video('video/episode', stacked_frames, i_episode)

if total_frame_count > NUM_FRAMES:
    torch.save(policy_net, MODEL_PATH)
    break

print('Reward this episode:', total_reward)
print('Complete')
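# Shape sanity-check for the stacking above (hypothetical 5-frame episode
# of (C, H, W) screens; this mirrors the transpose/expand_dims pipeline):
import numpy as np

frames = [np.zeros((3, 84, 84), dtype=np.float32) for _ in range(5)]
v = np.expand_dims(np.stack(frames).transpose(1, 0, 2, 3), 0)
assert v.shape == (1, 3, 5, 84, 84)  # (B, C, T, H, W), as consumed here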
class TBVisualizer:
    def __init__(self, opt):
        self._opt = opt
        self._save_path = os.path.join(opt["dirs"]["exp_dir"],
                                       opt["dirs"]["events"])
        self._log_path = os.path.join(self._save_path, 'loss_log.txt')
        self._tb_path = os.path.join(self._save_path, 'summary.json')

        # create summary writers
        self._writer_full = SummaryWriter(self._save_path,
                                          filename_suffix="_full")

        # init log file with header
        self._init_log_file()

    def __del__(self):
        self._writer_full.close()

    def _init_log_file(self):
        with open(self._log_path, "a") as log_file:
            now = time.strftime("%c")
            log_file.write(
                '================ Training Loss (%s) ================\n' %
                now)

    def display_current_results(self, visuals, it, is_train,
                                save_visuals=False):
        # add visuals to events file
        for label, image_numpy in visuals.items():
            sum_name = '{}/{}'.format('Train' if is_train else 'Val', label)
            if image_numpy.ndim == 3:
                self._writer_full.add_image(sum_name, image_numpy, it)
            else:
                self._writer_full.add_video(sum_name, image_numpy, it)

            # save image to file
            if save_visuals:
                util.save_image(
                    image_numpy,
                    os.path.join(self._opt.checkpoints_dir, self._opt.name,
                                 'event_imgs', sum_name, '%08d.png' % it))

        # force write
        self._writer_full.file_writer.flush()

    def plot_scalars(self, scalars, it, is_train, is_mean=False):
        for label, scalar in scalars.items():
            # set labels
            if is_mean:
                label = f"M_{label}"
            sum_name = '{}/{}'.format('Train' if is_train else 'Val', label)

            # add scalars to events file
            self._writer_full.add_scalar(sum_name, scalar, it)

    def plot_histograms(self, histograms, it, is_train, is_mean=False):
        for label, scalar in histograms.items():
            # set labels
            if is_mean:
                label = f"M_{label}"
            sum_name = '{}/{}'.format('Train' if is_train else 'Val', label)

            # add hist to events file
            self._writer_full.add_histogram(sum_name, scalar, it)

        # force write
        self._writer_full.file_writer.flush()

    def plot_time(self, read_time, train_time, it):
        # add scalars to events file
        self._writer_full.add_scalar("read_time", read_time, it)
        self._writer_full.add_scalar("train_time", train_time, it)

    def print_current_train_errors(self, epoch, i, iters_per_epoch, errors,
                                   iter_read_time, iter_procs_time,
                                   visuals_were_stored):
        # set label
        log_time = time.strftime("[%d/%m %H:%M:%S]")
        visuals_info = "v" if visuals_were_stored else ""
        message = '%s (T%s, epoch: %d, it: %d/%d, s/smpl: %.3fr %.3fp) ' % (
            log_time, visuals_info, epoch, i, iters_per_epoch,
            iter_read_time, iter_procs_time)

        # print in terminal and store in log file
        self._print_and_store_errors(errors, message)

    def print_current_validate_errors(self, epoch, errors, t):
        # set label
        log_time = time.strftime("[%d/%m/%Y %H:%M:%S]")
        message = '%s (V, epoch: %d, time_to_val: %ds) ' % (log_time, epoch,
                                                            t)

        # print in terminal and store in log file
        self._print_and_store_errors(errors, message)

    def print_epoch_avg_errors(self, epoch, errors, is_train):
        # set label
        label = "MT" if is_train else "MV"
        log_time = time.strftime("[%d/%m/%Y %H:%M:%S]")
        message = '%s (%s, epoch: %d) ' % (log_time, label, epoch)

        # print in terminal and store in log file
        self._print_and_store_errors(errors, message)

    def _print_and_store_errors(self, errors, message):
        # set errors msg
        for k, v in errors.items():
            message += '%s:%.3f ' % (k, v)

        # print in terminal and store in log file
        print(message)
        self._save_log(message)

    def print_msg(self, message):
        # set label
        log_time = time.strftime("[%d/%m/%Y %H:%M:%S]")
        message = '%s %s' % (log_time, message)

        # print in terminal and store in log file
        print(message)
        self._save_log(message)

    def save_images(self, visuals):
        for label, image_numpy in visuals.items():
            image_name = '%s.png' % label
            save_path = os.path.join(self._save_path, "samples", image_name)
            util.save_image(image_numpy, save_path)

    def _save_log(self, msg):
        with open(self._log_path, "a") as log_file:
            log_file.write('%s\n' % msg)
import json
import os

import imageio
import torch
from tensorboardX import SummaryWriter

# `Logger`, `set_print_logger` and `mkdirs` are project-local helpers assumed
# to be importable.


class ExperimentLogger(object):
    """Combines a TensorBoard logger, a lightweight file logger, and an
    optional standard-output logger.

    The TensorBoard logger is useful for comparing experiments side by side.
    Also performs checkpointing so that interrupted experiments can resume.
    """

    def __init__(self, log_dir, checkpoint_name="latest.tar",
                 log_std_out=False, use_tensorboard=True):
        self.log_dir = log_dir
        self.logger = Logger(log_dir=log_dir)
        self.checkpoint_name = os.path.join(self.log_dir, checkpoint_name)
        self.use_tensorboard = use_tensorboard
        if use_tensorboard:
            self.tensorboard_writer = SummaryWriter(log_dir=self.log_dir)
        self.log_std_out = log_std_out
        if log_std_out:
            std_out_file = os.path.join(log_dir, "std_out.txt")
            self.std_out_logger = set_print_logger("marl", std_out_file)

    def info(self, msg):
        """Mimics logging.Logger.info."""
        if self.log_std_out:
            self.std_out_logger.info(msg)
        else:
            print(msg)

    def update_tensorboard_writer(self, epoch):
        # purge_step drops events logged after `epoch`, e.g. when resuming
        self.tensorboard_writer = SummaryWriter(log_dir=self.log_dir,
                                                purge_step=epoch)

    def checkpoint_exists(self):
        return os.path.isfile(self.checkpoint_name)

    def load_checkpoint(self):
        return torch.load(self.checkpoint_name)

    def log_hyperparams(self, hyp_dict, file_name="hyperparams.json"):
        with open(os.path.join(self.log_dir, file_name), 'w') as fp:
            json.dump(hyp_dict, fp)

    def add_scalar(self, name, val, iter):
        self.logger.log(name, val, iter)
        if self.use_tensorboard:
            self.tensorboard_writer.add_scalar(name, val, iter)

    def add_histogram(self, name, val_array, iter):
        self.tensorboard_writer.add_histogram(name, val_array, iter)

    def add_histogram_dict(self, val_dict, iter):
        """Takes a dict of named parameters (e.g. network weights)."""
        if self.use_tensorboard:
            for k, v in val_dict.items():
                self.add_histogram(k, v, iter)

    def add_images(self, name, image_tensor, iter):
        """images: (N, C, H, W)"""
        self.tensorboard_writer.add_images(name, image_tensor, iter)

    def add_video(self, name, video_tensor, iter):
        """videos: (N, T, C, H, W), as expected by SummaryWriter.add_video"""
        self.tensorboard_writer.add_video(name, video_tensor, iter, fps=4)

    def log_video(self, name, video, fps=20):
        """video: sequence of RGB frame arrays.

        Reference: https://imageio.readthedocs.io/en/stable/format_gif-pil.html
        """
        vid_kargs = {'fps': fps}  # frames per second
        vid_name = '{}/{}'.format(self.log_dir, name)
        mkdirs(os.path.dirname(vid_name))  # often "videos/"
        imageio.mimsave(vid_name, video, **vid_kargs)

    def log_epoch(self, epoch, state, epoch_stats):
        assert 'epoch' in state
        assert 'model' in state
        assert 'optimizer' in state
        torch.save(state, self.checkpoint_name)
        for k, v in epoch_stats.items():
            self.add_scalar(k, v, epoch)

    def export_scalars_to_json(self, summary_path="summary.json"):
        self.tensorboard_writer.export_scalars_to_json(
            os.path.join(self.log_dir, summary_path))

    def close(self):
        if self.use_tensorboard:
            self.tensorboard_writer.close()
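# Resume-or-start sketch for ExperimentLogger's checkpointing contract.
# `build_model` and `build_optimizer` are hypothetical factories; only the
# logger calls come from the class above.
logger = ExperimentLogger(log_dir="./runs/exp0", use_tensorboard=True)
model = build_model()
optimizer = build_optimizer(model)

start_epoch = 0
if logger.checkpoint_exists():
    state = logger.load_checkpoint()
    model.load_state_dict(state["model"])
    optimizer.load_state_dict(state["optimizer"])
    start_epoch = state["epoch"] + 1
    # drop any events written past the checkpoint before continuing
    logger.update_tensorboard_writer(start_epoch)

for epoch in range(start_epoch, 10):
    stats = {"train/loss": 0.0}  # placeholder metrics
    state = {"epoch": epoch,
             "model": model.state_dict(),
             "optimizer": optimizer.state_dict()}
    logger.log_epoch(epoch, state, stats)  # saves checkpoint + logs scalars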
# Excerpted from Ray Train; `TrainingCallback`, `_TrainCallbackLogdirManager`,
# the results preprocessors, `flatten_dict`, `log_once`, `_deprecation_msg`
# and the PID/TIMESTAMP/TIME_TOTAL_S/TRAINING_ITERATION constants are Ray
# internals assumed to be in scope.
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import numpy as np


class TBXLoggerCallback(TrainingCallback):
    """Logs Train results in TensorboardX format.

    Args:
        logdir (Optional[str]): Path to the directory where the results file
            should be written. If None, it will be set by the Trainer.
        worker_to_log: Worker index to log. By default, logs the worker with
            index 0.
    """

    VALID_SUMMARY_TYPES: Tuple[type, ...] = (
        int, float, np.float32, np.float64, np.int32, np.int64,
    )
    IGNORE_KEYS: Set[str] = {PID, TIMESTAMP, TIME_TOTAL_S}

    def __init__(self, logdir: Optional[str] = None,
                 worker_to_log: int = 0) -> None:
        warnings.warn(_deprecation_msg, DeprecationWarning)
        self._logdir_manager = _TrainCallbackLogdirManager(logdir=logdir)
        results_preprocessors = [
            IndexedResultsPreprocessor(indices=worker_to_log),
            ExcludedKeysResultsPreprocessor(excluded_keys=self.IGNORE_KEYS),
        ]
        self.results_preprocessor = SequentialResultsPreprocessor(
            results_preprocessors)

    def start_training(self, logdir: str, **info):
        self._logdir_manager.setup_logdir(default_logdir=logdir)

        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            if log_once("tbx-install"):
                warnings.warn(
                    "pip install 'tensorboardX' to see TensorBoard files.")
            raise

        self._file_writer = SummaryWriter(str(self.logdir), flush_secs=30)

    def handle_result(self, results: List[Dict], **info):
        result = results[0]
        # Use TRAINING_ITERATION for the step, but remove it so it is not logged.
        step = result.pop(TRAINING_ITERATION)
        flat_result = flatten_dict(result, delimiter="/")
        path = ["ray", "train"]

        # same logic as in ray.tune.logger.TBXLogger
        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if (isinstance(value, self.VALID_SUMMARY_TYPES)
                    and not np.isnan(value)):
                self._file_writer.add_scalar(full_attr, value,
                                             global_step=step)
            elif (isinstance(value, list) and len(value) > 0) or (
                    isinstance(value, np.ndarray) and value.size > 0):
                # 5-D arrays ([N, T, C, H, W]) are treated as video
                if isinstance(value, np.ndarray) and value.ndim == 5:
                    self._file_writer.add_video(full_attr, value,
                                                global_step=step, fps=20)
                    continue

                try:
                    self._file_writer.add_histogram(full_attr, value,
                                                    global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        warnings.warn(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value,
                                             type(self).__name__))
        self._file_writer.flush()

    def finish_training(self, error: bool = False, **info):
        self._file_writer.close()

    @property
    def logdir(self) -> Path:
        return self._logdir_manager.logdir_path
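# Minimal standalone sketch of the dispatch rule used in handle_result above,
# without the Ray machinery: scalars go to add_scalar, 5-D arrays
# ([N, T, C, H, W]) to add_video, and other non-empty arrays to add_histogram.
# The result dict values are illustrative.
import numpy as np
from tensorboardX import SummaryWriter

writer = SummaryWriter("./tbx_demo")
result = {
    "loss": 0.25,                                # scalar
    "grad_norms": np.random.rand(128),           # histogram
    "rollout": np.random.rand(1, 8, 3, 32, 32),  # video, float in [0, 1)
}
for key, value in result.items():
    if isinstance(value, (int, float)):
        writer.add_scalar(key, value, global_step=0)
    elif isinstance(value, np.ndarray) and value.ndim == 5:
        writer.add_video(key, value, global_step=0, fps=20)
    else:
        writer.add_histogram(key, value, global_step=0)
writer.close()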
# Script-level names (c3d, ssl_net, params, save_path, train, validation,
# UntrimmedVideoDataset) come from the surrounding project modules.
import os
import time

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm


def main():
    base = c3d.C3D(with_classifier=False)
    model = ssl_net.SSLNET(base, with_classifier=True, num_classes=12)
    start_epoch = 1
    # pretrain_weight = loadcontinur_weights(pretrain_path)
    # model.load_state_dict(pretrain_weight, strict=False)

    # train
    train_dataset = UntrimmedVideoDataset(params['root'], mode="train")
    if params['data'] == 'UCF-101':
        val_size = 800
    elif params['data'] == 'hmdb':
        val_size = 400
    elif params['data'] == 'Thumos14':
        val_size = 400
    train_dataset, val_dataset = random_split(
        train_dataset, (len(train_dataset) - val_size, val_size))

    print("num_workers:{:d}".format(params['num_workers']))
    print("batch_size:{:d}".format(params['batch_size']))
    train_loader = DataLoader(train_dataset,
                              batch_size=params['batch_size'],
                              shuffle=True,
                              num_workers=params['num_workers'])
    val_loader = DataLoader(val_dataset,
                            batch_size=params['batch_size'],
                            shuffle=True,
                            num_workers=params['num_workers'])

    model = nn.DataParallel(model)  # multi-gpu
    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=params['learning_rate'],
                          momentum=params['momentum'],
                          weight_decay=params['weight_decay'])
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                     min_lr=1e-5, patience=50,
                                                     factor=0.1)

    # pretrain_model = pretrain_path.split('/')[-1].split('.')[0] + 'pth'
    model_save_dir = os.path.join(save_path, '_' + time.strftime('%m-%d-%H-%M'))
    writer = SummaryWriter(model_save_dir)

    # log one sample batch (video clips + labels) at step 0
    for data in train_loader:
        clip, label = data
        writer.add_video('train/clips', clip, 0, fps=8)
        writer.add_text('train/idx', str(label.tolist()), 0)
        clip = clip.cuda()
        # writer.add_graph(model, (clip, clip))
        break

    # log initial parameters at step 0
    for name, param in model.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)

    prev_best_val_loss = float('inf')
    prev_best_loss_model_path = None
    prev_best_acc_model_path = None
    best_acc = 0
    best_epoch = 0
    for epoch in tqdm(range(start_epoch, start_epoch + params['epoch_num'])):
        train(train_loader, model, criterion, optimizer, epoch, writer)
        val_loss, top1_avg = validation(val_loader, model, criterion,
                                        optimizer, epoch)
        # track the best model by accuracy and by validation loss separately
        if top1_avg >= best_acc:
            best_acc = top1_avg
            best_epoch = epoch
            model_path = os.path.join(
                model_save_dir, 'best_acc_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_acc_model_path = model_path
        if val_loss < prev_best_val_loss:
            model_path = os.path.join(
                model_save_dir, 'best_loss_model_{}.pth.tar'.format(epoch))
            torch.save(model.state_dict(), model_path)
            prev_best_val_loss = val_loss
            prev_best_loss_model_path = model_path
        scheduler.step(val_loss)

        if epoch % 20 == 0:
            checkpoints = os.path.join(model_save_dir, str(epoch) + ".pth.tar")
            torch.save(model.state_dict(), checkpoints)
            print("save_to:", checkpoints)
    print("best acc:", best_acc, "at epoch:", best_epoch)
# Body of the per-episode DQN training loop (fragment); assumes `env`, `dqn`,
# `target_dqn`, `dqn_epsilon_agent`, `memory`, `eps`, `checkpoint`, `writer`,
# `metrics`, `param` and `step` are defined by the surrounding script.

## PLAY GAME
metrics['epsilon'] = eps.get(step)
game = utils.play_game(env, agent=dqn_epsilon_agent,
                       th=metrics['epsilon'], memory=memory)
metrics['run_reward'], metrics['run_episode_steps'] = game['cum_reward'], game['steps']
step += metrics['run_episode_steps']

## TRAIN
for _ in range(metrics['run_episode_steps'] // param['batch_size']):
    metrics['run_loss'] = train_batch(param)

# Periodically sync the target network with the online network
if metrics['episode'] % 500 == 0:
    target_dqn.load_state_dict(dqn.state_dict())

# Test agent:
if metrics['episode'] % 100 == 0:
    game = utils.play_game(env, agent=dqn_epsilon_agent, th=0.02, memory=memory)
    metrics['test_reward'], metrics['test_episode_steps'] = game['cum_reward'], game['steps']
    checkpoint.save(dqn, step=step, step_loss=-metrics['test_reward'])

# REPORTING
if metrics['episode'] % 100 == 0:
    for key, val in metrics.items():
        writer.add_scalar(key, val, global_step=step)

# Animate agent:
if metrics['episode'] % 2500 == 0:
    print("episode: {}, step: {}, reward: {}".format(
        metrics['episode'], step, metrics['run_reward']))
    game = utils.play_game(env, agent=dqn_epsilon_agent, th=0.02,
                           render=True, memory=memory)
    writer.add_video("test_game", game['frames'], global_step=step)
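# Helper sketch: `add_video` expects a [N, T, C, H, W] tensor, while rendered
# game frames typically arrive as a list of [H, W, C] arrays. A conversion
# like this is assumed to sit behind `game['frames']` above.
import numpy as np


def frames_to_video_tensor(frames):
    """Stack [H, W, C] frames into a single [1, T, C, H, W] batch."""
    video = np.stack(frames, axis=0)           # [T, H, W, C]
    video = np.transpose(video, (0, 3, 1, 2))  # [T, C, H, W]
    return video[None]                         # [1, T, C, H, W]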
# Fragment: the opening of this statement was truncated upstream; only the
# trailing arguments survive:
#     ..., clip_grad_value=10)
print(model)

for itr in tqdm(range(args.gradient_steps)):
    # cycle through the dataloader indefinitely for step-based training
    try:
        batch = next(train_loader_iterator)
    except StopIteration:
        train_loader_iterator = iter(train_loader)
        batch = next(train_loader_iterator)
    batch = batch.to(device)
    batch_size, seq_len, *_ = batch.size()
    batch = batch.view(batch_size, seq_len, -1)
    batch = batch.transpose(0, 1)
    loss = model.train({"x": batch})
    writer.add_scalar('train_loss', loss, itr)

    with torch.no_grad():
        if itr % log_interval_num == 0:
            test_pred = model.pred(test_batch)
            # `batch` is transposed (non-contiguous), so `.view` would raise
            # here; `.reshape` is the safe equivalent.
            test_loss = model.test({"x": batch.reshape(seq_len, batch_size, -1)})
            writer.add_scalar('test_loss', test_loss, itr)
            writer.add_video('test_pred', test_pred.transpose(0, 1), itr)
            writer.add_video('test_ground_truth', test_batch.transpose(0, 1), itr)

writer.close()
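# The try/except StopIteration dance above can be wrapped in a small
# generator, a common idiom for step-based (rather than epoch-based) loops.
def infinite_batches(loader):
    """Yield batches forever, restarting the loader when it is exhausted."""
    while True:
        for batch in loader:
            yield batch

# usage: batches = infinite_batches(train_loader); batch = next(batches)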
import os

import numpy as np
from tensorboardX import SummaryWriter

# `plot_graph` is a project-local helper assumed to be importable.


class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, \
            "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step,
                                    fps=fps)

    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10,
                            video_title='video'):
        print('***************logging video********************')
        # reshape the rollouts from [T, H, W, C] to [T, C, H, W]
        videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]

        # find the max rollout length among the videos to be saved
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts so they all have the same length
        for i in range(max_videos_to_save):
            if videos[i].shape[0] < max_length:
                padding = np.tile([videos[i][-1]],
                                  (max_length - videos[i].shape[0], 1, 1, 1))
                videos[i] = np.concatenate([videos[i], padding], 0)

        # log videos to the tensorboard event file
        videos = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(videos, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, \
            "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """array gets plotted with plot_graph and logged as an image"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") \
            if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
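# Usage sketch for log_paths_as_videos. The [T, H, W, C] uint8 layout of
# `image_obs` is implied by the [0, 3, 1, 2] transpose above; the sizes and
# log directory are illustrative.
import numpy as np

paths = [
    {"image_obs": np.random.randint(0, 255, size=(40, 64, 64, 3), dtype=np.uint8)},
    {"image_obs": np.random.randint(0, 255, size=(25, 64, 64, 3), dtype=np.uint8)},
]
logger = Logger("./logs/eval")
# the shorter rollout is padded with its last frame to 40 steps, then both are
# stacked into a [2, 40, 3, 64, 64] batch for add_video
logger.log_paths_as_videos(paths, step=0, max_videos_to_save=2, fps=10,
                           video_title="eval_rollouts")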
import os

import moviepy.editor as mpy
import numpy as np
from tensorboardX import SummaryWriter

# `plot_graph` is a project-local helper assumed to be importable.
# This variant of Logger writes rollout videos to .mp4 files via moviepy
# instead of the tensorboard event file.


class Logger:
    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_image(self, image, name, step):
        assert len(image.shape) == 3  # [C, H, W]
        self._summ_writer.add_image('{}'.format(name), image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, \
            "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}'.format(name), video_frames, step,
                                    fps=fps)

    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10,
                            video_title='video'):
        # keep rollout frames as [T, H, W, C] arrays for moviepy
        videos = [p['image_obs'] for p in paths]

        # find the max rollout length among the videos to be saved
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        max_length = videos[0].shape[0]
        for i in range(max_videos_to_save):
            if videos[i].shape[0] > max_length:
                max_length = videos[i].shape[0]

        # pad rollouts to the same length, then write each one as an .mp4
        for i in range(max_videos_to_save):
            if videos[i].shape[0] < max_length:
                padding = np.tile([videos[i][-1]],
                                  (max_length - videos[i].shape[0], 1, 1, 1))
                videos[i] = np.concatenate([videos[i], padding], 0)

            clip = mpy.ImageSequenceClip(list(videos[i]), fps=fps)
            # set_position takes a single position spec; the original passed
            # ('top', 'center') as two arguments, which moviepy reads as
            # (pos, relative)
            txt_clip = (mpy.TextClip(video_title, fontsize=30, color='white')
                        .set_position(("center", "top")).set_duration(10))
            video = mpy.CompositeVideoClip([clip, txt_clip])
            new_video_title = video_title + '{}_{}'.format(step, i) + '.mp4'
            filename = os.path.join(self._log_dir, new_video_title)
            video.write_videofile(filename, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, \
            "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)

    def log_graph(self, array, name, step, phase):
        """array gets plotted with plot_graph and logged as an image"""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") \
            if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
# Fragment of a dataset-dispatch block; the preceding branches handle other
# values of args.dataset, and `writer`, `model`, `device`, `args` and
# `train_transforms` are defined by the surrounding script.
elif args.dataset == 'K400':
    train_dataset = K400Dataset_train('data/K400', args.cl, args.split, True,
                                      train_transforms)
    val_dataset = K400Dataset_val('data/K400', args.cl, args.split, True,
                                  train_transforms)
    # split val for 800 videos
    # train_dataset, val_dataset = random_split(train_dataset, (len(train_dataset) - val_size, val_size))

print('TRAIN video number: {}, VAL video number: {}.'.format(
    len(train_dataset), len(val_dataset)))
train_dataloader = DataLoader(train_dataset, batch_size=args.bs, shuffle=True,
                              num_workers=args.workers, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=args.bs, shuffle=False,
                            num_workers=args.workers, pin_memory=True)

# save graph and clips_order samples
for data in train_dataloader:
    clips, idxs = data
    writer.add_video('train/clips', clips, 0, fps=8)
    writer.add_text('train/idxs', str(idxs.tolist()), 0)
    clips = clips.to(device)
    # writer.add_graph(model, clips)
    break

# save init params at step 0
for name, param in model.named_parameters():
    writer.add_histogram('params/{}'.format(name), param, 0)

### loss function, optimizer and scheduler ###
criterion = nn.CrossEntropyLoss()
# two parameter groups: the backbone keeps args.lr, while the layers being
# fine-tuned ('linear', 'conv5', 'conv4') get args.ft_lr
optimizer = optim.SGD(
    [{'params': [param for name, param in model.named_parameters()
                 if 'linear' not in name and 'conv5' not in name
                 and 'conv4' not in name]},
     {'params': [param for name, param in model.named_parameters()
                 if 'linear' in name or 'conv5' in name or 'conv4' in name],
      'lr': args.ft_lr}],
    lr=args.lr, momentum=args.momentum, weight_decay=args.wd)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5,
                                                 patience=50, factor=0.1)
import os

import numpy as np
import torch
from tensorboardX import SummaryWriter

# `plot_graph` is a project-local helper assumed to be importable.


class Logger:
    def __init__(self, log_dir, n_logged_samples=3, summary_writer=None):
        self._log_dir = log_dir
        self._n_logged_samples = n_logged_samples
        if summary_writer is not None:
            self._summ_writer = summary_writer
        else:
            self._summ_writer = SummaryWriter(log_dir)

    def log_scalar(self, scalar, name, step, phase):
        self._summ_writer.add_scalar('{}_{}'.format(name, phase), scalar, step)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_images(self, image, name, step, phase):
        image = self._format_input(image)
        self._check_size(image, 4)  # [N, C, H, W]
        self._loop_batch(self._summ_writer.add_image,
                         '{}_{}'.format(name, phase), image, step)

    def log_gif(self, gif_frames, name, step, phase):
        if isinstance(gif_frames, list):
            gif_frames = np.concatenate(gif_frames)
        gif_frames = self._format_input(gif_frames)
        assert len(gif_frames.shape) == 4, \
            "Need [T, C, H, W] input tensor for single video logging!"
        # add an extra dimension to get a grid of size 1
        gif_frames = gif_frames.unsqueeze(0)
        self._summ_writer.add_video('{}_{}'.format(name, phase), gif_frames,
                                    step, fps=10)

    def log_graph(self, array, name, step, phase):
        """array gets plotted with plt.plot"""
        im = torch.tensor(plot_graph(array).transpose(2, 0, 1))
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        log_path = os.path.join(self._log_dir, "scalar_data.json") \
            if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)

    def _loop_batch(self, fn, name, val, *argv, **kwargs):
        """Loops the logging function over the first n_logged_samples entries."""
        for log_idx in range(min(self._n_logged_samples, len(val))):
            name_i = os.path.join(name, "_%d" % log_idx)
            fn(name_i, val[log_idx], *argv, **kwargs)

    def visualize(self, *args, **kwargs):
        """Subclasses can implement this method to visualize training results."""
        pass

    @staticmethod
    def _check_size(val, size):
        if isinstance(val, (torch.Tensor, np.ndarray)):
            assert len(val.shape) == size, \
                "Size of tensor does not fit required size, {} vs {}".format(
                    len(val.shape), size)
        elif isinstance(val, list):
            assert len(val[0].shape) == size - 1, \
                "Size of list element does not fit required size, {} vs {}".format(
                    len(val[0].shape), size - 1)
        else:
            raise NotImplementedError(
                "Input type {} not supported for dimensionality check!".format(
                    type(val)))
        if (val[0].shape[1] > 10000) or (val[0].shape[2] > 10000):
            print("Logging very large image with size {}px.".format(
                max(val[0].shape[1], val[0].shape[2])))
            raise ValueError("This might be a bit too much")

    @staticmethod
    def _format_input(arr):
        if not isinstance(arr, torch.Tensor):
            arr = torch.tensor(arr)
        # channels-last input ([N, H, W, C]) gets permuted to channels-first
        if not (arr.shape[1] == 3 or arr.shape[1] == 1):
            arr = arr.permute(0, 3, 1, 2)
        arr = arr.float()
        return arr

    def __del__(self):
        self._summ_writer.close()
        print("Closed summary writer.")
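# Usage sketch: log_images loops over the first n_logged_samples entries of a
# [N, C, H, W] batch (channels-last input is auto-permuted by _format_input),
# and log_gif wraps a single [T, C, H, W] rollout into a batch of one.
# The shapes, values and log directory are illustrative.
import numpy as np

logger = Logger("./logs/debug", n_logged_samples=3)

batch = np.random.rand(8, 3, 32, 32)    # [N, C, H, W] float images in [0, 1]
logger.log_images(batch, "samples", step=0, phase="train")

frames = np.random.rand(16, 3, 32, 32)  # [T, C, H, W] single rollout
logger.log_gif(frames, "rollout", step=0, phase="train")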