def load(self, f=None):
    if f is None:
        f = self.ckpt

    if isinstance(self.net, nn.DataParallel):
        self.net.module.load_state_dict(torch.load(f, map_location=self.device))
    else:
        self.net.load_state_dict(torch.load(f, map_location=self.device))

    if os.path.exists(f + ".config"):
        with open(f + ".config", "r") as fp:
            dic = json.loads(fp.read())
        self.config = defaultdict(float, dic)
        print("Loaded,", self.config)

    if os.path.exists(f + ".optimizer"):
        self.optimizer.load_state_dict(torch.load(f + ".optimizer"))

    if os.path.exists(f + ".scheduler") and self.lr_scheduler is not None:
        self.lr_scheduler.load_state_dict(torch.load(f + ".scheduler"))

    if os.path.exists(f + ".csv"):
        self.dataframe = pd.read_csv(f + ".csv")

    if self.config["tensorboard_dir"] is not None:
        self.tensorboard = SummaryWriter(self.config["tensorboard_dir"])
    else:
        self.tensorboard = None
def test_write_scalar(self):
    summary_writer = SummaryWriter(self._log_dir)
    tag_name = "learning_rate"
    learning_rate = torch.tensor(.01)
    for i in range(10):
        summary_writer.add_scalar(tag_name, learning_rate, i)
        learning_rate -= 0.005
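The test above never flushes or closes its writer, so whether the ten scalar events actually reach disk before the process exits depends on the writer's background flush. A minimal sketch of the same loop with an explicit flush and close; the log directory and decay value here are illustrative, not taken from the test fixture:

```python
from torch.utils.tensorboard import SummaryWriter  # tensorboardX.SummaryWriter exposes the same API

writer = SummaryWriter("runs/lr_test")  # hypothetical log dir standing in for self._log_dir
learning_rate = 0.01
for step in range(10):
    writer.add_scalar("learning_rate", learning_rate, step)
    learning_rate -= 0.001  # smaller decay than the test so the value stays positive
writer.flush()  # push pending events to disk
writer.close()
```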
def on_train_begin(self, logs=None):
    self.train_writer = SummaryWriter(
        os.path.join(self.logdir, "train"), purge_step=self.purge_step
    )
    self.val_writer = SummaryWriter(
        os.path.join(self.logdir, "val"), purge_step=self.purge_step
    )
    self.steps = self.params["steps"]
    self.global_step = 0
class TensorBoard(Callback): """Callback that streams epoch results to tensorboard events folder. Supports all values that can be represented as a string, including 1D iterables such as `np.ndarray`. Example: ```python tensorboard_logger = TensorBoard('runs') model.fit(X_train, Y_train, callbacks=[tensorboard_logger]) ``` """ def __init__(self, logdir: Optional[str] = None, update_freq: Union[str, int] = "epoch", **kwargs) -> None: """ Arguments: logdir: Save directory location. Default is runs/**CURRENT_DATETIME_HOSTNAME**, which changes after each run. Use hierarchical folder structure to compare between runs easily. e.g. pass in 'runs/exp1', 'runs/exp2', etc. for each new experiment to compare across them. update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, writes the losses and metrics to TensorBoard after each batch. The same applies for `'epoch'`. **kwargs: Options to pass to `SummaryWriter` object """ self.logdir = logdir self.writer = None self.keys = None if update_freq == "batch": self.update_freq = 1 else: self.update_freq = update_freq self._open_args = kwargs if kwargs else {} super(TensorBoard, self).__init__() def on_train_begin(self, logs=None): self.writer = SummaryWriter(self.logdir, **self._open_args) def on_train_batch_end(self, batch: int, logs): if self.update_freq == "epoch": return logs = logs or {} def handle_value(k): is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0 if isinstance(k, six.string_types): return k elif isinstance(k, tp.Iterable) and not is_zero_dim_ndarray: return '"[%s]"' % (", ".join(map(str, k))) else: return k if self.update_freq != "epoch" and batch % self.update_freq == 0: if self.keys is None: self.keys = sorted(logs.keys()) row_dict = collections.OrderedDict({"batch": batch}) row_dict.update( (key + "batch", handle_value(logs[key])) for key in self.keys) for key, value in row_dict.items(): self.writer.add_scalar(key, value, batch) def on_epoch_end(self, epoch, logs=None): logs = logs or {} def handle_value(k): is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0 if isinstance(k, six.string_types): return k elif isinstance(k, tp.Iterable) and not is_zero_dim_ndarray: return '"[%s]"' % (", ".join(map(str, k))) else: return k if self.keys is None: self.keys = sorted(logs.keys()) row_dict = collections.OrderedDict({"epoch": epoch}) row_dict.update((key, handle_value(logs[key])) for key in self.keys) for key, value in row_dict.items(): self.writer.add_scalar(key, value, epoch) def on_train_end(self, logs=None): self.writer.close()
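The docstring example only shows the default per-epoch logging. Below is a sketch of batch-level logging with the same callback; the `flush_secs` keyword is an assumption that the underlying `SummaryWriter` accepts it (extra keywords are forwarded via `**kwargs`):

```python
# Write losses/metrics every 50 batches instead of once per epoch.
tensorboard_logger = TensorBoard("runs/exp1", update_freq=50, flush_secs=30)
model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
```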
def main( steps_per_epoch: int = 200, epochs: int = 50, debug: bool = False, eager: bool = False, logdir: str = "runs", ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) X_train, _1, X_test, _2 = dataget.image.mnist(global_cache=True).get() # Now binarize data X_train = (X_train > 0).astype(jnp.float32) X_test = (X_test > 0).astype(jnp.float32) print("X_train:", X_train.shape, X_train.dtype) print("X_test:", X_test.shape, X_test.dtype) vae = VAE(latent_size=LATENT_SIZE) # model = VariationalAutoEncoder(latent_size=LATENT_SIZE, optimizer=optax.adam(1e-3)) def loss(x, y_pred): logits, mean, stddev = y_pred ce_loss = elegy.losses.binary_crossentropy(x, logits, from_logits=True).mean() kl_loss = 2e-1 * kl_divergence(mean, stddev) return ce_loss + kl_loss model = elegy.Model( module=vae, loss=loss, optimizer=optax.adam(1e-3), run_eagerly=eager, ) # Fit with datasets in memory history = model.fit( x=X_train, epochs=epochs, batch_size=64, steps_per_epoch=steps_per_epoch, validation_data=(X_test, ), shuffle=True, callbacks=[TensorBoard(logdir)], ) print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", ) elegy.utils.plot_history(history) # get random samples idxs = np.random.randint(0, len(X_test), size=(5, )) x_sample = X_test[idxs] # get predictions logits, mean, stddev = model.predict(x=x_sample) y_pred = jax.nn.sigmoid(logits) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(5): plt.subplot(2, 5, i + 1) plt.imshow(x_sample[i], cmap="gray") plt.subplot(2, 5, 5 + i + 1) plt.imshow(y_pred[i], cmap="gray") # # tbwriter.add_figure("VAE Example", figure, epochs) plt.show()
class Trainer: experiment_name = None def __init__( self, net, criterion=None, metric=cal_accuracy, train_dataloader=None, val_dataloader=None, test_dataloader=None, optimizer=None, lr_scheduler=None, tensorboard_dir="./pinkblack_tb/", ckpt="./ckpt/ckpt.pth", experiment_id=None, clip_gradient_norm=False, is_data_dict=False, ): """ :param net: nn.Module Network :param criterion: loss function. __call__(prediction, *batch_y) :param metric: metric function __call__(prediction, *batch_y). *note* : bigger is better. (Early Stopping할 때 metric이 더 큰 값을 선택한다) :param train_dataloader: :param val_dataloader: :param test_dataloader: :param optimizer: torch.optim :param lr_scheduler: :param tensorboard_dir: tensorboard log :param ckpt: weight path :param experiment_id: be shown on tensorboard :param clip_gradient_norm: False or Scalar value (숫자를 입력하면 gradient clipping한다.) :param is_data_dict: whether dataloaders return dict. (dataloader에서 주는 데이터가 dict인지 - 아니라면 (x, y pair tuple로 주는 데이터이다.) """ self.net = net self.criterion = nn.CrossEntropyLoss( ) if criterion is None else criterion self.metric = metric self.dataloader = dict() if train_dataloader is not None: self.dataloader["train"] = train_dataloader if val_dataloader is not None: self.dataloader["val"] = val_dataloader if test_dataloader is not None: self.dataloader["test"] = test_dataloader if train_dataloader is None or val_dataloader is None: logging.warning("Init Trainer :: Two dataloaders are needed!") self.optimizer = (Adam( filter(lambda p: p.requires_grad, self.net.parameters())) if optimizer is None else optimizer) self.lr_scheduler = lr_scheduler self.ckpt = ckpt self.config = defaultdict(float) self.config["max_train_metric"] = -1e8 self.config["max_val_metric"] = -1e8 self.config["max_test_metric"] = -1e8 self.config["tensorboard_dir"] = tensorboard_dir self.config["timestamp"] = datetime.now().strftime("%Y%m%d_%H%M%S") self.config["clip_gradient_norm"] = clip_gradient_norm self.config["is_data_dict"] = is_data_dict if experiment_id is None: self.config["experiment_id"] = self.config["timestamp"] else: self.config["experiment_id"] = experiment_id self.dataframe = pd.DataFrame() self.device = Trainer.get_model_device(self.net) if self.device == torch.device("cpu"): logging.warning( "Init Trainer :: Do you really want to train the network on CPU instead of GPU?" 
) if self.config["tensorboard_dir"] is not None: self.tensorboard = SummaryWriter(self.config["tensorboard_dir"]) else: self.tensorboard = None self.callbacks = defaultdict(list) def register_callback(self, func, phase="val"): self.callbacks[phase].append(func) def save(self, f=None): if f is None: f = self.ckpt os.makedirs(os.path.dirname(f), exist_ok=True) if isinstance(self.net, nn.DataParallel): state_dict = self.net.module.state_dict() else: state_dict = self.net.state_dict() torch.save(state_dict, f) torch.save(self.optimizer.state_dict(), f + ".optimizer") if self.lr_scheduler is not None: torch.save(self.lr_scheduler.state_dict(), f + ".scheduler") with open(f + ".config", "w") as fp: json.dump(self.config, fp) self.dataframe.to_csv(f + ".csv", float_format="%.6f", index=False) def load(self, f=None): if f is None: f = self.ckpt if isinstance(self.net, nn.DataParallel): self.net.module.load_state_dict( torch.load(f, map_location=self.device)) else: self.net.load_state_dict(torch.load(f, map_location=self.device)) if os.path.exists(f + ".config"): with open(f + ".config", "r") as fp: dic = json.loads(fp.read()) self.config = defaultdict(float, dic) print("Loaded,", self.config) if os.path.exists(f + ".optimizer"): self.optimizer.load_state_dict(torch.load(f + ".optimizer")) if os.path.exists(f + ".scheduler") and self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(torch.load(f + ".scheduler")) if os.path.exists(f + ".csv"): self.dataframe = pd.read_csv(f + ".csv") if self.config["tensorboard_dir"] is not None: self.tensorboard = SummaryWriter(self.config["tensorboard_dir"]) else: self.tensorboard = None def train(self, epoch=None, phases=None, step=None, validation_interval=1, save_every_validation=False): """ :param epoch: train dataloader를 순회할 횟수 :param phases: ['train', 'val', 'test'] 중 필요하지 않은 phase를 뺄 수 있다. >> trainer.train(1, phases=['val']) :param step: epoch이 아닌 step을 훈련단위로 할 때의 총 step 수. :param validation_interval: validation 간격 :param save_every_validation: True이면, validation마다 checkpoint를 저장한다. 
:return: None """ if phases is None: phases = list(self.dataloader.keys()) if epoch is None and step is None: raise ValueError( "PinkBlack.trainer :: epoch or step should be specified.") train_unit = "epoch" if step is None else "step" self.config[train_unit] = int(self.config[train_unit]) num_unit = epoch if step is None else step validation_interval = 1 if validation_interval <= 0 else validation_interval kwarg_list = [train_unit] for phase in phases: kwarg_list += [f"{phase}_loss", f"{phase}_metric"] kwarg_list += ["lr", "time"] print_row(kwarg_list=[""] * len(kwarg_list), pad="-") print_row(kwarg_list=kwarg_list, pad=" ") print_row(kwarg_list=[""] * len(kwarg_list), pad="-") start = self.config[train_unit] for i in range(start, start + num_unit, validation_interval): start_time = time() if train_unit == "epoch": for phase in phases: self.config[f"{phase}_loss"], self.config[ f"{phase}_metric"] = self._train( phase, num_steps=len(self.dataloader[phase])) for func in self.callbacks[phase]: func() self.config[train_unit] += 1 elif train_unit == "step": for phase in phases: if phase == "train": # num_unit 이 validation interval로 나눠떨어지지 않는 경우 num_steps = min((start + num_unit - i), validation_interval) self.config[train_unit] += num_steps else: num_steps = len(self.dataloader[phase]) self.config[f"{phase}_loss"], self.config[ f"{phase}_metric"] = self._train(phase, num_steps=num_steps) for func in self.callbacks[phase]: func() else: raise NotImplementedError if self.lr_scheduler is not None: if isinstance(self.lr_scheduler, ReduceLROnPlateau): self.lr_scheduler.step(self.config["val_metric"]) else: self.lr_scheduler.step() i_str = str(self.config[train_unit]) is_best = self.config["max_val_metric"] < self.config["val_metric"] if is_best: for phase in phases: self.config[f"max_{phase}_metric"] = max( self.config[f"max_{phase}_metric"], self.config[f"{phase}_metric"]) i_str = (str(self.config[train_unit])) + "-best" elapsed_time = time() - start_time if self.tensorboard is not None: _loss, _metric = {}, {} for phase in phases: _loss[phase] = self.config[f"{phase}_loss"] _metric[phase] = self.config[f"{phase}_metric"] self.tensorboard.add_scalars( f"{self.config['experiment_id']}/loss", _loss, self.config[train_unit]) self.tensorboard.add_scalars( f"{self.config['experiment_id']}/metric", _metric, self.config[train_unit]) self.tensorboard.add_scalar( f"{self.config['experiment_id']}/time", elapsed_time, self.config[train_unit]) self.tensorboard.add_scalar( f"{self.config['experiment_id']}/lr", self.optimizer.param_groups[0]["lr"], self.config[train_unit], ) print_kwarg = [i_str] for phase in phases: print_kwarg += [ self.config[f"{phase}_loss"], self.config[f"{phase}_metric"] ] print_kwarg += [self.optimizer.param_groups[0]["lr"], elapsed_time] print_row(kwarg_list=print_kwarg, pad=" ") print_row(kwarg_list=[""] * len(kwarg_list), pad="-") self.dataframe = self.dataframe.append(dict( zip(kwarg_list, print_kwarg)), ignore_index=True) if is_best: self.save(self.ckpt) if Trainer.experiment_name is not None: self.update_experiment() if save_every_validation: self.save(self.ckpt + f"-{self.config[train_unit]}") def _step(self, phase, iterator, only_inference=False): if self.config["is_data_dict"]: batch_dict = next(iterator) batch_size = batch_dict[list(batch_dict.keys())[0]].size(0) for k, v in batch_dict.items(): batch_dict[k] = v.to(self.device) else: batch_x, batch_y = next(iterator) if isinstance(batch_x, list): batch_x = [x.to(self.device) for x in batch_x] else: batch_x = 
[batch_x.to(self.device)] if isinstance(batch_y, list): batch_y = [y.to(self.device) for y in batch_y] else: batch_y = [batch_y.to(self.device)] batch_size = batch_x[0].size(0) self.optimizer.zero_grad() with torch.set_grad_enabled(phase == "train"): if self.config["is_data_dict"]: outputs = self.net(batch_dict) if not only_inference: loss = self.criterion(outputs, batch_dict) else: outputs = self.net(*batch_x) if not only_inference: loss = self.criterion(outputs, *batch_y) if only_inference: return outputs if phase == "train": loss.backward() if self.config["clip_gradient_norm"]: clip_grad_norm_(self.net.parameters(), self.config["clip_gradient_norm"]) self.optimizer.step() with torch.no_grad(): if self.config["is_data_dict"]: metric = self.metric(outputs, batch_dict) else: metric = self.metric(outputs, *batch_y) return { "loss": loss.item(), "batch_size": batch_size, "metric": metric.item() } def _train(self, phase, num_steps=0): running_loss = AverageMeter() running_metric = AverageMeter() if phase == "train": self.net.train() else: self.net.eval() dataloader = self.dataloader[phase] step_iterator = iter(dataloader) tq = tqdm(range(num_steps), leave=False) for st in tq: if (st + 1) % len(dataloader) == 0: step_iterator = iter(dataloader) results = self._step(phase=phase, iterator=step_iterator) tq.set_description( f"Loss:{results['loss']:.4f}, Metric:{results['metric']:.4f}") running_loss.update(results["loss"], results["batch_size"]) running_metric.update(results["metric"], results["batch_size"]) return running_loss.avg, running_metric.avg def eval(self, dataloader=None): self.net.eval() if dataloader is None: dataloader = self.dataloader["val"] phase = "val" output_list = [] step_iterator = iter(dataloader) num_steps = len(dataloader) for st in tqdm(range(num_steps), leave=False): results = self._step(phase="val", iterator=step_iterator, only_inference=True) output_list.append(results) output_cat = torch.cat(output_list) return output_cat def add_external_config(self, args): """ args : a dict-like object which contains key-value configurations. """ new_d = defaultdict(float) for k, v in args.items(): new_d[f"config_{k}"] = v self.config.update(new_d) def update_experiment(self): """ Update experiment statistics by its name (csv file). """ assert Trainer.experiment_name is not None df_config = pd.DataFrame(pd.Series( self.config)).T.set_index("experiment_id") if os.path.exists(Trainer.experiment_name + ".csv"): df_ex = pd.read_csv(Trainer.experiment_name + ".csv", index_col=0) if self.config["experiment_id"] in df_ex.index: df_ex = df_ex.drop(self.config["experiment_id"]) df_ex = df_ex.append(df_config, sort=False) else: df_ex = df_config df_ex.to_csv(Trainer.experiment_name + ".csv") return df_ex def swa_apply(self, bn_update=True): assert hasattr(self.optimizer, "swap_swa_sgd") self.optimizer.swap_swa_sgd() if bn_update: self.swa_bn_update() def swa_bn_update(self): r"""Updates BatchNorm running_mean, running_var buffers in the model. It performs one pass over data in `loader` to estimate the activation statistics for BatchNorm layers in the model. 
original source is from : torchcontrib """ if not check_bn(self.net): return was_training = self.net.training self.net.train() momenta = {} self.net.apply(reset_bn) self.net.apply(lambda module: get_momenta(module, momenta)) n = 0 for input in self.dataloader['train']: if isinstance(input, (list, tuple)): input = input[0] b = input.size(0) input = input.to(self.device) elif self.config['is_data_dict']: b = input[list(input.keys())[0]].size(0) for k, v in input.items(): input[k] = v.to(self.device) else: b = input.size(0) input = input.to(self.device) momentum = b / float(n + b) for module in momenta.keys(): module.momentum = momentum self.net(input) n += b self.net.apply(lambda module: set_momenta(module, momenta)) self.net.train(was_training) @staticmethod def get_model_device(net): device = torch.device("cpu") for param in net.parameters(): device = param.device break return device @staticmethod def set_experiment_name(name): Trainer.experiment_name = name
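The constructor docstring lists the arguments, but the Trainer is never shown end to end. A minimal usage sketch under the (x, y)-tuple dataloader convention described above; `MyClassifier`, `train_loader`, and `val_loader` are placeholders, not names from the original code:

```python
# Any nn.Module that returns class logits works with the default
# CrossEntropyLoss criterion and cal_accuracy metric.
net = MyClassifier().to("cuda")
trainer = Trainer(
    net,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    tensorboard_dir="./pinkblack_tb/",
    ckpt="./ckpt/ckpt.pth",
    experiment_id="baseline",
)
trainer.train(epoch=30, validation_interval=1)        # epoch-based training
# trainer.train(step=10000, validation_interval=500)  # or step-based training
trainer.load()  # reload the checkpoint saved whenever val_metric improved
```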
def train(agents, params, num_processes): """Training Loop for value-based RL methods. Params ====== agent (object) --- the agent to train params (dict) --- the dictionary of parameters """ n_episodes = params['episodes'] maxlen = params['maxlen'] name = params['agent_params']['name'] brain_name = params['brain_name'] env = params['environment'] add_noise = params['agent_params']['add_noise'] pretrain = params['pretrain'] pretrain_length = params['pretrain_length'] num_agents = num_processes scores = np.zeros(num_agents) # list containing scores from each episode scores_window = deque(maxlen=maxlen) # last N scores scores_episode = [] writer = SummaryWriter(log_dir=params['log_dir'] + name) env_info = env.reset(train_mode=True)[brain_name] tic = time.time() timesteps = 0 achievement_length = 0 episode_start = 1 if params['load_agent']: episode_start, timesteps = agents.load_agent() for i_episode in range(episode_start, n_episodes + 1): tic = time.time() states = env_info.vector_observations scores = np.zeros(num_agents) env.reset() while True: states = torch.tensor(states) if pretrain and pretrain_length < len(agents.memory.memory): pretrain = False actions, noise_epsilon = agents.act(states, add_noise, pretrain=pretrain) env_info = env.step(actions)[ brain_name] # send the action to the environment next_states = env_info.vector_observations # get the next state rewards = env_info.rewards # get the reward dones = env_info.local_done # see if episode has finished adjusted_rewards = np.array(env_info.rewards) if params['hack_rewards']: if adjusted_rewards[0] != 0: adjusted_rewards[1] = adjusted_rewards[0] * params[ 'alternative_reward_scalar'] elif adjusted_rewards[1] != 0: adjusted_rewards[0] = adjusted_rewards[1] * params[ 'alternative_reward_scalar'] actor_loss, critic_loss = agents.step(states, actions, adjusted_rewards, next_states, dones, pretrain=pretrain) if actor_loss != None and critic_loss != None: if params['agent_params']['schedule_lr']: actor_lr, critic_lr = agents.get_lr() else: actor_lr, critic_lr = params['agent_params'][ 'actor_params']['lr'], params['agent_params'][ 'critic_params']['lr'] writer.add_scalar('noise_epsilon', noise_epsilon, timesteps) writer.add_scalar('actor_loss', actor_loss, timesteps) writer.add_scalar('critic_loss', critic_loss, timesteps) writer.add_scalar('actor_lr', actor_lr, timesteps) writer.add_scalar('critic_lr', critic_lr, timesteps) print('\rTimestep {}\tMax: {:.2f}'.format(timesteps, np.max(scores)), end="") scores += rewards # update the scores states = next_states # roll over the state to next time step if np.any(dones): # exit loop if episode finished break timesteps += 1 # Fills the buffer with experiences resulting from random actions # to encourage exploration if timesteps % params['random_fill_every'] == 0: pretrain = True pretrain = params['pretrain_length'] score = np.mean(scores) scores_episode.append(score) scores_window.append(score) # save most recent score print('\rEpisode {}\tMax: {:.2f} \t Time: {:.2f}'.format( i_episode, np.max(scores), time.time() - tic), end="\n") if i_episode % params['save_every'] == 0: agents.save_agent(np.mean(scores_window), i_episode, timesteps, save_history=True) else: agents.save_agent(np.mean(scores_window), i_episode, timesteps, save_history=False) writer.add_scalars('scores', { 'mean': np.mean(scores), 'min': np.min(scores), 'max': np.max(scores) }, timesteps) update_csv(name, i_episode, np.mean(scores), np.mean(scores)) agents.step_lr(np.mean(scores)) if np.mean(scores) > params['achievement']: 
achievement_length += 1 if achievement_length > params['achievement_length']: toc = time.time() print( "\n\n Congratulations! The agent has managed to solve the environment in {} episodes with {} training time\n\n" .format(i_episode, toc - tic)) writer.close() return scores else: achievement_length = 0 writer.close() return scores
def train(self, load_model=False, model_path=None): if load_model: if model_path is not None: self.load_weights(model_path) ## Training utterances all_input_ids, all_input_len, all_label_ids = convert_examples_to_features( self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) num_train_batches = all_input_ids.size(0) num_train_steps = int(num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) logger.info("***** training *****") logger.info(" Num examples = %d", len(self.train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids, all_input_len, all_label_ids = all_input_ids.to( DEVICE), all_input_len.to(DEVICE), all_label_ids.to(DEVICE) train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features( self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length) logger.info("***** validation *****") logger.info(" Num examples = %d", len(self.dev_examples)) logger.info(" Batch size = %d", args.dev_batch_size) all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \ all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE) dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev) dev_sampler = SequentialSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size) logger.info("Loaded data!") if args.fp16: self.sumbt_model.half() self.sumbt_model.to(DEVICE) ## Get domain-slot-type embeddings slot_token_ids, slot_len = \ get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE) # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot): # self.idx2slot[slot_idx] = slot_str ## Get slot-value embeddings label_token_ids, label_len = [], [] for slot_idx, labels in zip(slot_token_ids, self.label_list): # self.idx2value[slot_idx] = {} token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE) label_token_ids.append(token_ids) label_len.append(lens) # for label, token_id in zip(labels, token_ids): # self.idx2value[slot_idx][token_id] = label logger.info('embeddings prepared') if USE_CUDA and N_GPU > 1: self.sumbt_model.module.initialize_slot_value_lookup( label_token_ids, slot_token_ids) else: self.sumbt_model.initialize_slot_value_lookup( label_token_ids, slot_token_ids) def get_optimizer_grouped_parameters(model): param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01, 'lr': args.learning_rate }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0, 'lr': args.learning_rate }, ] return optimizer_grouped_parameters if not USE_CUDA or N_GPU == 1: optimizer_grouped_parameters = get_optimizer_grouped_parameters( self.sumbt_model) else: optimizer_grouped_parameters = get_optimizer_grouped_parameters( self.sumbt_model.module) t_total = num_train_steps scheduler = None if args.fp16: try: from apex.optimizers import 
FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.fp16_loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.fp16_loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_proportion * t_total, num_training_steps=t_total) logger.info(optimizer) # Training code ############################################################################### logger.info("Training...") global_step = 0 last_update = None best_loss = None model = self.sumbt_model if not args.do_not_use_tensorboard: summary_writer = None else: summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/") for epoch in trange(int(args.num_train_epochs), desc="Epoch"): # Train model.train() tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 for step, batch in enumerate(tqdm(train_dataloader)): batch = tuple(t.to(DEVICE) for t in batch) input_ids, input_len, label_ids = batch # Forward if N_GPU == 1: loss, loss_slot, acc, acc_slot, _ = model( input_ids, input_len, label_ids, N_GPU) else: loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU) # average to multi-gpus loss = loss.mean() acc = acc.mean() acc_slot = acc_slot.mean(0) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # Backward if args.fp16: optimizer.backward(loss) else: loss.backward() # tensrboard logging if summary_writer is not None: summary_writer.add_scalar("Epoch", epoch, global_step) summary_writer.add_scalar("Train/Loss", loss, global_step) summary_writer.add_scalar("Train/JointAcc", acc, global_step) if N_GPU == 1: for i, slot in enumerate(self.processor.target_slot): summary_writer.add_scalar( "Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i], global_step) summary_writer.add_scalar( "Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify lealrning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) if summary_writer is not None: summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step if scheduler is not None: torch.nn.utils.clip_grad_norm_( optimizer_grouped_parameters, 1.0) optimizer.step() if scheduler is not None: scheduler.step() optimizer.zero_grad() global_step += 1 # Perform evaluation on validation dataset model.eval() dev_loss = 0 dev_acc = 0 dev_loss_slot, dev_acc_slot = None, None nb_dev_examples, nb_dev_steps = 0, 0 for step, batch in enumerate( tqdm(dev_dataloader, desc="Validation")): batch = tuple(t.to(DEVICE) for t in batch) input_ids, input_len, label_ids = batch if input_ids.dim() == 2: input_ids = input_ids.unsqueeze(0) input_len = input_len.unsqueeze(0) label_ids = label_ids.unsuqeeze(0) with torch.no_grad(): if N_GPU == 1: loss, loss_slot, acc, acc_slot, _ = model( input_ids, input_len, label_ids, N_GPU) else: loss, _, acc, acc_slot, _ = model( input_ids, input_len, 
label_ids, N_GPU) # average to multi-gpus loss = loss.mean() acc = acc.mean() acc_slot = acc_slot.mean(0) num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item() dev_loss += loss.item() * num_valid_turn dev_acc += acc.item() * num_valid_turn if N_GPU == 1: if dev_loss_slot is None: dev_loss_slot = [l * num_valid_turn for l in loss_slot] dev_acc_slot = acc_slot * num_valid_turn else: for i, l in enumerate(loss_slot): dev_loss_slot[ i] = dev_loss_slot[i] + l * num_valid_turn dev_acc_slot += acc_slot * num_valid_turn nb_dev_examples += num_valid_turn dev_loss = dev_loss / nb_dev_examples dev_acc = dev_acc / nb_dev_examples if N_GPU == 1: dev_acc_slot = dev_acc_slot / nb_dev_examples # tensorboard logging if summary_writer is not None: summary_writer.add_scalar("Validate/Loss", dev_loss, global_step) summary_writer.add_scalar("Validate/Acc", dev_acc, global_step) if N_GPU == 1: for i, slot in enumerate(self.processor.target_slot): summary_writer.add_scalar( "Validate/Loss_%s" % slot.replace(' ', '_'), dev_loss_slot[i] / nb_dev_examples, global_step) summary_writer.add_scalar( "Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i], global_step) dev_loss = round(dev_loss, 6) output_model_file = os.path.join( os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin") if last_update is None or dev_loss < best_loss: if not USE_CUDA or N_GPU == 1: torch.save(model.state_dict(), output_model_file) else: torch.save(model.module.state_dict(), output_model_file) last_update = epoch best_loss = dev_loss best_acc = dev_acc logger.info( "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (last_update, best_loss, best_acc, global_step)) else: logger.info( "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (epoch, dev_loss, dev_acc, global_step)) if last_update + args.patience <= epoch: break
        d_iter_count += 1
        if d_iter_count == config.max_d_iters:
            d_iter_count = 0
            mode = 'G'


iter_no = 0
max_iters = 100
mode = 'G'
g_iter_count = 0
d_iter_count = 0
batch_size = config.batch_size

train_writer = SummaryWriter(log_dir='../logs/train')
val_writer = SummaryWriter(log_dir='../logs/val')

with tqdm(total=max_iters) as pbar:
    for iter_no in range(max_iters):
        train_batch = train_loader.next_batch()
        gan.train()
        train_step(iter_no, train_batch)
        if iter_no % config.validation_interval == 0:
            val_batch = val_loader.next_batch()
            gan.eval()
            validate(val_batch, iter_no)
        pbar.update(1)
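`train_step` and `validate` are defined outside this fragment; the sketch below shows one way they could report losses to the two writers created above. The `d_loss`/`g_loss` names and the tag strings are assumptions, not part of the original code:

```python
# Hypothetical helper: both train_step and validate could funnel their losses
# through this so train/val curves share the same tags and differ only by writer.
def log_gan_losses(writer, step, d_loss, g_loss):
    writer.add_scalar("loss/discriminator", d_loss, step)
    writer.add_scalar("loss/generator", g_loss, step)

# inside train_step(iter_no, batch):  log_gan_losses(train_writer, iter_no, d_loss, g_loss)
# inside validate(batch, iter_no):    log_gan_losses(val_writer, iter_no, d_loss, g_loss)
```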
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, epochs: int = 100, batch_size: int = 64, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) train_dataset = MNIST(training=True) test_dataset = MNIST(training=False) train_loader = eg.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) test_loader = eg.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True) print("X_train:", train_dataset.x.shape, train_dataset.x.dtype) print("y_train:", train_dataset.y.shape, train_dataset.y.dtype) print("X_test:", test_dataset.x.shape, test_dataset.x.dtype) print("y_test:", test_dataset.y.shape, test_dataset.y.dtype) @dataclass(unsafe_hash=True, repr=False) class MLP(eg.Module): """Standard LeNet-300-100 MLP network.""" n1: int = 300 n2: int = 100 @eg.compact def __call__(self, x: jnp.ndarray): x = x.astype(jnp.float32) / 255.0 x = eg.Flatten()(x) x = eg.Linear(self.n1)(x) x = jax.nn.relu(x) x = eg.Linear(self.n2)(x) x = jax.nn.relu(x) x = eg.Linear(10)(x) return x model = eg.Model( module=MLP(n1=300, n2=100), loss=[ eg.losses.Crossentropy(), eg.regularizers.L2(l=1e-4), ], metrics=eg.metrics.Accuracy(), optimizer=optax.adamw(1e-3), eager=eager, ) x_sample, y_sample = next(iter(train_loader)) model.summary(x_sample) history = model.fit( inputs=train_loader, epochs=epochs, steps_per_epoch=steps_per_epoch, validation_data=test_loader, shuffle=True, callbacks=[eg.callbacks.TensorBoard(logdir=logdir)], ) eg.utils.plot_history(history) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample, y_sample = next(iter(test_loader)) # get predictions y_pred = model.predict(x=x_sample) # plot and save results def make_plot(): plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: make_plot() # tbwriter.add_figure("Predictions", plt.gcf(), 100) make_plot() plt.show() print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", )
def main(debug: bool = False, eager: bool = False, logdir: str = "runs"): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) X_train, y_train, X_test, y_test = dataget.image.mnist( global_cache=True).get() print("X_train:", X_train.shape, X_train.dtype) print("y_train:", y_train.shape, y_train.dtype) print("X_test:", X_test.shape, X_test.dtype) print("y_test:", y_test.shape, y_test.dtype) class Lambda(elegy.Module): def __init__(self, f): super().__init__() self.f = f def call(self, x): return self.f(x) class MLP(elegy.Module): """Standard LeNet-300-100 MLP network.""" def __init__(self, n1: int = 300, n2: int = 100, **kwargs): super().__init__(**kwargs) self.n1 = n1 self.n2 = n2 def call(self, image: jnp.ndarray): image = image.astype(jnp.float32) / 255.0 mlp = elegy.nn.sequential( elegy.nn.Flatten(), elegy.nn.Linear(self.n1), jax.nn.relu, elegy.nn.Linear(self.n2), jax.nn.relu, elegy.nn.Linear(10), ) return mlp(image) model = elegy.Model( module=MLP(n1=300, n2=100), loss=[ elegy.losses.SparseCategoricalCrossentropy(from_logits=True), elegy.regularizers.GlobalL2(l=1e-4), ], metrics=elegy.metrics.SparseCategoricalAccuracy(), optimizer=optax.adamw(1e-3), run_eagerly=eager, ) model.summary(X_train[:64]) history = model.fit( x=X_train, y=y_train, epochs=100, steps_per_epoch=200, batch_size=64, validation_data=(X_test, y_test), shuffle=True, callbacks=[elegy.callbacks.TensorBoard(logdir=logdir)], ) print(model.module.submodules) plot_history(history) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") tbwriter.add_figure("Predictions", figure, 100) plt.show() print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", )
def main( steps_per_epoch: int = 200, batch_size: int = 64, epochs: int = 50, debug: bool = False, eager: bool = False, logdir: str = "runs", ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) dataset = load_dataset("mnist") X_train = np.array(np.stack(dataset["train"]["image"]), dtype=np.uint8) X_test = np.array(np.stack(dataset["test"]["image"]), dtype=np.uint8) # Now binarize data X_train = (X_train > 0).astype(jnp.float32) X_test = (X_test > 0).astype(jnp.float32) print("X_train:", X_train.shape, X_train.dtype) print("X_test:", X_test.shape, X_test.dtype) model = eg.Model( module=VAE(latent_size=LATENT_SIZE), loss=[ BinaryCrossEntropy(on="logits"), KL(weight=0.1), ], optimizer=optax.adam(1e-3), eager=eager, ) model.summary(X_train[:batch_size]) # Fit with datasets in memory history = model.fit( inputs=X_train, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, validation_data=(X_test, ), shuffle=True, callbacks=[eg.callbacks.TensorBoard(logdir)], ) print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", ) eg.utils.plot_history(history) # get random samples idxs = np.random.randint(0, len(X_test), size=(5, )) x_sample = X_test[idxs] # get predictions preds = model.predict(x=x_sample) y_pred = jax.nn.sigmoid(preds["logits"]) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(5): plt.subplot(2, 5, i + 1) plt.imshow(x_sample[i], cmap="gray") plt.subplot(2, 5, 5 + i + 1) plt.imshow(y_pred[i], cmap="gray") # # tbwriter.add_figure("VAE Example", figure, epochs) plt.show()
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, epochs: int = 100, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) train_dataset = MNIST(training=True) test_dataset = MNIST(training=False) train_loader = elegy.data.DataLoader(train_dataset, batch_size=64, shuffle=True) test_loader = elegy.data.DataLoader(test_dataset, batch_size=64, shuffle=True) print("X_train:", train_dataset.x.shape, train_dataset.x.dtype) print("y_train:", train_dataset.y.shape, train_dataset.y.dtype) print("X_test:", test_dataset.x.shape, test_dataset.x.dtype) print("y_test:", test_dataset.y.shape, test_dataset.y.dtype) class MLP(elegy.Module): """Standard LeNet-300-100 MLP network.""" def __init__(self, n1: int = 300, n2: int = 100, **kwargs): super().__init__(**kwargs) self.n1 = n1 self.n2 = n2 def call(self, image: jnp.ndarray): image = image.astype(jnp.float32) / 255.0 mlp = elegy.nn.sequential( elegy.nn.Flatten(), elegy.nn.Linear(self.n1), jax.nn.relu, elegy.nn.Linear(self.n2), jax.nn.relu, elegy.nn.Linear(10), ) return mlp(image) model = elegy.Model( module=MLP(n1=300, n2=100), loss=[ elegy.losses.SparseCategoricalCrossentropy(from_logits=True), elegy.regularizers.GlobalL2(l=1e-4), ], metrics=elegy.metrics.SparseCategoricalAccuracy(), optimizer=optax.adamw(1e-3), run_eagerly=eager, ) x_sample, y_sample = next(iter(train_loader)) model.summary(x_sample) history = model.fit( x=train_loader, epochs=epochs, steps_per_epoch=steps_per_epoch, validation_data=test_loader, shuffle=True, callbacks=[elegy.callbacks.TensorBoard(logdir=logdir)], ) elegy.utils.plot_history(history) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample, y_sample = next(iter(test_loader)) # get predictions y_pred = model.predict(x=x_sample) # plot and save results def make_plot(): plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: make_plot() # tbwriter.add_figure("Predictions", plt.gcf(), 100) make_plot() plt.show() print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", )
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, epochs: int = 100, batch_size: int = 64, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) dataset = load_dataset("mnist") dataset.set_format("np") X_train = np.stack(dataset["train"]["image"])[..., None] y_train = dataset["train"]["label"] X_test = np.stack(dataset["test"]["image"])[..., None] y_test = dataset["test"]["label"] print("X_train:", X_train.shape, X_train.dtype) print("y_train:", y_train.shape, y_train.dtype) print("X_test:", X_test.shape, X_test.dtype) print("y_test:", y_test.shape, y_test.dtype) model = eg.Model( module=CNN(), loss=eg.losses.Crossentropy(), metrics=eg.metrics.Accuracy(), optimizer=optax.adam(1e-3), eager=eager, ) # show summary model.summary(X_train[:64]) train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) test_dataset = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test)) test_dataloader = DataLoader(test_dataset, batch_size=batch_size) history = model.fit( train_dataloader, epochs=epochs, steps_per_epoch=steps_per_epoch, validation_data=test_dataloader, callbacks=[eg.callbacks.TensorBoard(logdir=logdir)], ) eg.utils.plot_history(history) model.save("models/conv") model = eg.load("models/conv") print(model.evaluate(x=X_test, y=y_test)) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") # tbwriter.add_figure("Conv classifier", figure, 100) plt.show()
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, epochs: int = 100, batch_size: int = 64, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) dataset = load_dataset("mnist") dataset.set_format("np") X_train = np.stack(dataset["train"]["image"]) X_test = np.stack(dataset["test"]["image"]) print("X_train:", X_train.shape, X_train.dtype) print("X_test:", X_test.shape, X_test.dtype) model = eg.Model( module=MLP(n1=256, n2=64), loss=MeanSquaredError(), optimizer=optax.rmsprop(0.001), eager=eager, ) model.summary(X_train[:64]) # Notice we are not passing `y` history = model.fit( inputs=X_train, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, validation_data=(X_test, ), shuffle=True, callbacks=[eg.callbacks.TensorBoard(logdir=logdir, update_freq=300)], ) eg.utils.plot_history(history) # get random samples idxs = np.random.randint(0, 10000, size=(5, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(5): plt.subplot(2, 5, i + 1) plt.imshow(x_sample[i], cmap="gray") plt.subplot(2, 5, 5 + i + 1) plt.imshow(y_pred[i], cmap="gray") plt.show()
def on_train_begin(self, logs=None):
    self.writer = SummaryWriter(self.logdir, **self._open_args)
def main( steps_per_epoch: int = 200, batch_size: int = 64, epochs: int = 50, debug: bool = False, eager: bool = False, logdir: str = "runs", ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) X_train, _1, X_test, _2 = dataget.image.mnist(global_cache=True).get() # Now binarize data X_train = (X_train > 0).astype(jnp.float32) X_test = (X_test > 0).astype(jnp.float32) print("X_train:", X_train.shape, X_train.dtype) print("X_test:", X_test.shape, X_test.dtype) vae = VariationalAutoEncoder(latent_size=LATENT_SIZE) model = elegy.Model( module=vae, loss=[BinaryCrossEntropy(from_logits=True, on="logits")], optimizer=optax.adam(1e-3), run_eagerly=eager, ) model.summary(X_train[:64]) # Fit with datasets in memory history = model.fit( x=X_train, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, validation_data=(X_test, ), shuffle=True, callbacks=[TensorBoard(logdir)], ) print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", ) elegy.utils.plot_history(history) # get random samples idxs = np.random.randint(0, len(X_test), size=(5, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(5): plt.subplot(2, 5, i + 1) plt.imshow(x_sample[i], cmap="gray") plt.subplot(2, 5, 5 + i + 1) plt.imshow(y_pred["det_image"][i], cmap="gray") # tbwriter.add_figure("VAE Example", figure, epochs) plt.show() # call update_modules to enable parameter transfer # for now only Elegy Modules support this model.update_modules() # sample model_decoder = elegy.Model(vae.decoder) z_samples = np.random.normal(size=(12, LATENT_SIZE)) samples = model_decoder.predict(z_samples, initialize=True) samples = jax.nn.sigmoid(samples) # plot and save results # with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(5, 12)) plt.title("Generative Samples") for i in range(5): plt.subplot(2, 5, 2 * i + 1) plt.imshow(samples[i], cmap="gray") plt.subplot(2, 5, 2 * i + 2) plt.imshow(samples[i + 1], cmap="gray") # # tbwriter.add_figure("VAE Generative Example", figure, epochs) plt.show()
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

import tf_robustify
import vgg
import carlini_wagner_attack

os.system("taskset -p 0xffffffff %d" % os.getpid())

import sh
sh.rm('-rf', 'logs')

import logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from tensorboardX.writer import SummaryWriter

swriter = SummaryWriter('logs')
add_scalar_old = swriter.add_scalar


def add_scalar_and_log(key, value, global_step=0):
    logging.info('{}:{}: {}'.format(global_step, key, value))
    add_scalar_old(key, value, global_step)


swriter.add_scalar = add_scalar_and_log


def str2bool(x):
    return x.lower() == 'true'
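With `add_scalar` monkey-patched this way, every scalar written through `swriter` is also echoed to the INFO log. A short usage sketch; the tag and values are illustrative:

```python
# Each call prints "step:adv_accuracy: value" via logging.info and also writes
# the scalar to the 'logs' event directory through the original bound method.
for step, acc in enumerate([0.91, 0.93, 0.94]):
    swriter.add_scalar("adv_accuracy", acc, global_step=step)
```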
class TensorBoard(Callback): """ Callback that streams epoch results to tensorboard events folder. Supports all values that can be represented as a string, including 1D iterables such as `np.ndarray`. ```python tensorboard_logger = TensorBoard('runs') model.fit(X_train, Y_train, callbacks=[tensorboard_logger]) ``` """ def __init__( self, logdir: Optional[str] = None, *, update_freq: Union[str, int] = "epoch", purge_step: Optional[int] = None, comment: str = "", ) -> None: """ Arguments: logdir: Save directory location. Default is runs/**CURRENT_DATETIME_HOSTNAME**/{train, val}, which changes after each run. Use hierarchical folder structure to compare between runs easily. e.g. pass in 'runs/exp1', 'runs/exp2', etc. for each new experiment to compare across them. update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`, writes the losses and metrics to TensorBoard after each batch. The same applies for `'epoch'`. If using an integer, let's say `1000`, the callback will write the metrics and losses to TensorBoard every 1000 batches. Note that writing too frequently to TensorBoard can slow down your training. purge_step (int): When logging crashes at step :math:`T+X` and restarts at step :math:`T`, any events whose global_step larger or equal to :math:`T` will be purged and hidden from TensorBoard. Note that crashed and resumed experiments should have the same ``logdir``. comment (string): Comment logdir suffix appended to the default ``logdir``. If ``logdir`` is assigned, this argument has no effect. """ if not logdir: import socket from datetime import datetime current_time = datetime.now().strftime("%b%d_%H-%M-%S") self.logdir = os.path.join( "runs", current_time + "_" + socket.gethostname() + comment ) else: self.logdir = logdir self.train_writer = None self.val_writer = None self.keys = None self.write_per_batch = True try: self.update_freq = int(update_freq) except ValueError as e: self.update_freq = 1 if update_freq == "batch": self.write_per_batch = True elif update_freq == "epoch": self.write_per_batch = False else: raise e self.purge_step = purge_step super(TensorBoard, self).__init__() def on_train_begin(self, logs=None): self.train_writer = SummaryWriter( os.path.join(self.logdir, "train"), purge_step=self.purge_step ) self.val_writer = SummaryWriter( os.path.join(self.logdir, "val"), purge_step=self.purge_step ) self.steps = self.params["steps"] self.global_step = 0 def on_train_batch_end(self, batch: int, logs=None): if not self.write_per_batch: return logs = logs or {} self.global_step = batch + self.current_epoch * (self.steps) if self.global_step % self.update_freq == 0: if self.keys is None: self.keys = logs.keys() for key in self.keys: self.train_writer.add_scalar(key, logs[key], self.global_step) def on_epoch_begin(self, epoch: int, logs=None): self.current_epoch = epoch def on_epoch_end(self, epoch, logs=None): logs = logs or {} if self.keys is None: self.keys = logs.keys() # logs on on_{train, test}_batch_end do not have val metrics if self.write_per_batch: for key in logs: if "val" in key: self.val_writer.add_scalar( key.replace("val_", ""), logs[key], self.global_step ) return elif epoch % self.update_freq == 0: for key in self.keys: if "val" in key: self.val_writer.add_scalar( key.replace("val_", ""), logs[key], epoch ) else: self.train_writer.add_scalar(key, logs[key], epoch) def on_train_end(self, logs=None): self.train_writer.close() self.val_writer.close()
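The `purge_step` argument matters mainly when a crashed run is resumed into the same logdir; a sketch, with the resume step (42) purely illustrative:

```python
# Events from the crashed run with global_step >= 42 are purged, so the resumed
# train/ and val/ curves do not overlap the stale points.
tensorboard_logger = TensorBoard("runs/exp1", update_freq="epoch", purge_step=42)
model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
```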
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, epochs: int = 100, batch_size: int = 64, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) X_train, y_train, X_test, y_test = dataget.image.mnist( global_cache=True).get() X_train = X_train[..., None] X_test = X_test[..., None] print("X_train:", X_train.shape, X_train.dtype) print("y_train:", y_train.shape, y_train.dtype) print("X_test:", X_test.shape, X_test.dtype) print("y_test:", y_test.shape, y_test.dtype) class CNN(elegy.Module): def call(self, image: jnp.ndarray, training: bool): @elegy.to_module def ConvBlock(x, units, kernel, stride=1): x = elegy.nn.Conv2D(units, kernel, stride=stride, padding="same")(x) x = elegy.nn.BatchNormalization()(x, training) x = elegy.nn.Dropout(0.2)(x, training) return jax.nn.relu(x) x: np.ndarray = image.astype(jnp.float32) / 255.0 # base x = ConvBlock()(x, 32, [3, 3]) x = ConvBlock()(x, 64, [3, 3], stride=2) x = ConvBlock()(x, 64, [3, 3], stride=2) x = ConvBlock()(x, 128, [3, 3], stride=2) # GlobalAveragePooling2D x = jnp.mean(x, axis=[1, 2]) # 1x1 Conv x = elegy.nn.Linear(10)(x) return x model = elegy.Model( module=CNN(), loss=elegy.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=elegy.metrics.SparseCategoricalAccuracy(), optimizer=optax.adam(1e-3), run_eagerly=eager, ) # show model summary model.summary(X_train[:64], depth=1) history = model.fit( x=X_train, y=y_train, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, validation_data=(X_test, y_test), shuffle=True, callbacks=[TensorBoard(logdir=logdir)], ) elegy.utils.plot_history(history) model.save("models/conv") model = elegy.load("models/conv") print(model.evaluate(x=X_test, y=y_test)) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") # tbwriter.add_figure("Conv classifier", figure, 100) plt.show()
def main( debug: bool = False, eager: bool = False, logdir: str = "runs", steps_per_epoch: int = 200, batch_size: int = 64, epochs: int = 100, size: int = 32, num_layers: int = 3, num_heads: int = 8, dropout: float = 0.0, ): if debug: import debugpy print("Waiting for debugger...") debugpy.listen(5678) debugpy.wait_for_client() current_time = datetime.now().strftime("%b%d_%H-%M-%S") logdir = os.path.join(logdir, current_time) X_train, y_train, X_test, y_test = dataget.image.mnist( global_cache=True).get() print("X_train:", X_train.shape, X_train.dtype) print("y_train:", y_train.shape, y_train.dtype) print("X_test:", X_test.shape, X_test.dtype) print("y_test:", y_test.shape, y_test.dtype) model = elegy.Model( module=ViT( size=size, num_layers=num_layers, num_heads=num_heads, dropout=dropout, ), loss=[ elegy.losses.SparseCategoricalCrossentropy(from_logits=True), # elegy.regularizers.GlobalL2(l=1e-4), ], metrics=elegy.metrics.SparseCategoricalAccuracy(), optimizer=optax.adamw(1e-3), run_eagerly=eager, ) model.init(X_train, y_train) model.summary(X_train[:64]) history = model.fit( x=X_train, y=y_train, epochs=epochs, steps_per_epoch=steps_per_epoch, batch_size=batch_size, validation_data=(X_test, y_test), shuffle=True, callbacks=[elegy.callbacks.TensorBoard(logdir=logdir)], ) elegy.utils.plot_history(history) # get random samples idxs = np.random.randint(0, 10000, size=(9, )) x_sample = X_test[idxs] # get predictions y_pred = model.predict(x=x_sample) # plot and save results with SummaryWriter(os.path.join(logdir, "val")) as tbwriter: figure = plt.figure(figsize=(12, 12)) for i in range(3): for j in range(3): k = 3 * i + j plt.subplot(3, 3, k + 1) plt.title(f"{np.argmax(y_pred[k])}") plt.imshow(x_sample[k], cmap="gray") # tbwriter.add_figure("Predictions", figure, 100) plt.show() print( "\n\n\nMetrics and images can be explored using tensorboard using:", f"\n \t\t\t tensorboard --logdir {logdir}", )
def main(debug: bool = False, eager: bool = False, logdir: str = "runs"):
    if debug:
        import debugpy

        print("Waiting for debugger...")
        debugpy.listen(5678)
        debugpy.wait_for_client()

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    logdir = os.path.join(logdir, current_time)

    X_train, _1, X_test, _2 = dataget.image.mnist(global_cache=True).get()

    print("X_train:", X_train.shape, X_train.dtype)
    print("X_test:", X_test.shape, X_test.dtype)

    class MLP(elegy.Module):
        """Standard LeNet-300-100 MLP network."""

        def __init__(self, n1: int = 300, n2: int = 100, **kwargs):
            super().__init__(**kwargs)
            self.n1 = n1
            self.n2 = n2

        def call(self, image: jnp.ndarray):
            image = image.astype(jnp.float32) / 255.0

            x = elegy.nn.Flatten()(image)
            x = elegy.nn.sequential(
                elegy.nn.Linear(self.n1),
                jax.nn.relu,
                elegy.nn.Linear(self.n2),
                jax.nn.relu,
                elegy.nn.Linear(self.n1),
                jax.nn.relu,
                elegy.nn.Linear(x.shape[-1]),
                jax.nn.sigmoid,
            )(x)

            return x.reshape(image.shape) * 255

    class MeanSquaredError(elegy.losses.MeanSquaredError):
        # we request `x` instead of `y_true` since autoencoders don't require labels
        def call(self, x, y_pred):
            return super().call(x, y_pred)

    model = elegy.Model(
        module=MLP(n1=256, n2=64),
        loss=MeanSquaredError(),
        optimizer=optax.rmsprop(0.001),
        run_eagerly=eager,
    )

    model.summary(X_train[:64])

    # Notice we are not passing `y`
    history = model.fit(
        x=X_train,
        epochs=20,
        batch_size=64,
        validation_data=(X_test,),
        shuffle=True,
        callbacks=[elegy.callbacks.TensorBoard(logdir=logdir, update_freq=300)],
    )

    plot_history(history)

    # get random samples
    idxs = np.random.randint(0, 10000, size=(5,))
    x_sample = X_test[idxs]

    # get predictions
    y_pred = model.predict(x=x_sample)

    # plot and save results
    with SummaryWriter(os.path.join(logdir, "val")) as tbwriter:
        figure = plt.figure(figsize=(12, 12))
        for i in range(5):
            plt.subplot(2, 5, i + 1)
            plt.imshow(x_sample[i], cmap="gray")
            plt.subplot(2, 5, 5 + i + 1)
            plt.imshow(y_pred[i], cmap="gray")
        # tbwriter.add_figure("AutoEncoder images", figure, 20)

    plt.show()

    print(
        "\n\n\nMetrics and images can be explored using tensorboard using:",
        f"\n \t\t\t tensorboard --logdir {logdir}",
    )
def main(
    debug: bool = False,
    eager: bool = False,
    logdir: str = "runs",
    steps_per_epoch: int = 200,
    batch_size: int = 64,
    epochs: int = 100,
):
    if debug:
        import debugpy

        print("Waiting for debugger...")
        debugpy.listen(5678)
        debugpy.wait_for_client()

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    logdir = os.path.join(logdir, current_time)

    dataset = load_dataset("mnist")
    dataset.set_format("np")
    X_train = np.stack(dataset["train"]["image"])
    y_train = dataset["train"]["label"]
    X_test = np.stack(dataset["test"]["image"])
    y_test = dataset["test"]["label"]

    print("X_train:", X_train.shape, X_train.dtype)
    print("y_train:", y_train.shape, y_train.dtype)
    print("X_test:", X_test.shape, X_test.dtype)
    print("y_test:", y_test.shape, y_test.dtype)

    model = eg.Model(
        module=MLP(n1=300, n2=100),
        loss=[
            eg.losses.Crossentropy(),
            eg.regularizers.L2(l=1e-4),
        ],
        metrics=eg.metrics.Accuracy(),
        optimizer=optax.adamw(1e-3),
        eager=eager,
    )

    model.summary(X_train[:64])

    history = model.fit(
        inputs=X_train,
        labels=y_train,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        shuffle=True,
        callbacks=[eg.callbacks.TensorBoard(logdir=logdir)],
    )

    eg.utils.plot_history(history)

    # get random samples
    idxs = np.random.randint(0, 10000, size=(9,))
    x_sample = X_test[idxs]

    # get predictions
    y_pred = model.predict(x=x_sample)

    # plot and save results
    with SummaryWriter(os.path.join(logdir, "val")) as tbwriter:
        figure = plt.figure(figsize=(12, 12))
        for i in range(3):
            for j in range(3):
                k = 3 * i + j
                plt.subplot(3, 3, k + 1)
                plt.title(f"{np.argmax(y_pred[k])}")
                plt.imshow(x_sample[k], cmap="gray")
        # tbwriter.add_figure("Predictions", figure, 100)

    plt.show()

    print(
        "\n\n\nMetrics and images can be explored using tensorboard using:",
        f"\n \t\t\t tensorboard --logdir {logdir}",
    )
def main(
    steps_per_epoch: tp.Optional[int] = None,
    batch_size: int = 32,
    epochs: int = 50,
    debug: bool = False,
    eager: bool = False,
    logdir: str = "runs",
):
    if debug:
        import debugpy

        print("Waiting for debugger...")
        debugpy.listen(5678)
        debugpy.wait_for_client()

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    logdir = os.path.join(logdir, current_time)

    dataset = load_dataset("mnist")
    dataset.set_format("np")
    X_train = np.array(np.stack(dataset["train"]["image"]), dtype=np.uint8)
    X_test = np.array(np.stack(dataset["test"]["image"]), dtype=np.uint8)

    # Normalize data to [0, 1]
    X_train = (X_train / 255.0).astype(jnp.float32)
    X_test = (X_test / 255.0).astype(jnp.float32)

    print("X_train:", X_train.shape, X_train.dtype)
    print("X_test:", X_test.shape, X_test.dtype)

    model = eg.Model(
        module=VariationalAutoEncoder(latent_size=LATENT_SIZE),
        loss=[BinaryCrossEntropy(from_logits=True, on="logits")],
        optimizer=optax.adam(1e-3),
        eager=eager,
    )

    assert model.module is not None

    model.summary(X_train[:64])

    # Fit with datasets in memory
    history = model.fit(
        inputs=X_train,
        epochs=epochs,
        batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        validation_data=(X_test,),
        shuffle=True,
        callbacks=[eg.callbacks.TensorBoard(logdir)],
    )

    print(
        "\n\n\nMetrics and images can be explored using tensorboard using:",
        f"\n \t\t\t tensorboard --logdir {logdir}",
    )

    eg.utils.plot_history(history)

    # get random samples
    idxs = np.random.randint(0, len(X_test), size=(5,))
    x_sample = X_test[idxs]

    # get predictions
    y_pred = model.predict(x=x_sample)

    # plot and save results
    with SummaryWriter(os.path.join(logdir, "val")) as tbwriter:
        figure = plt.figure(figsize=(12, 12))
        for i in range(5):
            plt.subplot(2, 5, i + 1)
            plt.imshow(x_sample[i], cmap="gray")
            plt.subplot(2, 5, 5 + i + 1)
            plt.imshow(y_pred["det_image"][i], cmap="gray")
        # tbwriter.add_figure("VAE Example", figure, epochs)

    # sample
    model_decoder = eg.Model(model.module.decoder)

    z_samples = np.random.normal(size=(12, LATENT_SIZE))
    samples = model_decoder.predict(z_samples)
    samples = jax.nn.sigmoid(samples)

    # plot and save results
    # with SummaryWriter(os.path.join(logdir, "val")) as tbwriter:
    figure = plt.figure(figsize=(5, 12))
    plt.title("Generative Samples")
    for i in range(5):
        plt.subplot(2, 5, 2 * i + 1)
        plt.imshow(samples[i], cmap="gray")
        plt.subplot(2, 5, 2 * i + 2)
        plt.imshow(samples[i + 1], cmap="gray")
    # # tbwriter.add_figure("VAE Generative Example", figure, epochs)

    plt.show()
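# Editor's note (sketch): these example scripts expose `main` with typed keyword
# arguments (debug, eager, logdir, ...). The original entry-point code is not
# shown in this excerpt; a common way to turn such a function into a CLI is
# typer, which is only an assumption here.
if __name__ == "__main__":
    import typer

    # e.g. `python example.py --epochs 50 --batch-size 32 --eager`
    typer.run(main)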
import torch
import torch.backends.cudnn as cudnn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import numpy as np
from progress.bar import Bar
from tensorboardX.writer import SummaryWriter
from termcolor import cprint

from model import shape_net
from datasets import SIK1M
from losses import shape_loss
from utils import misc
from utils.eval.evalutils import AverageMeter

writer = SummaryWriter('log')

# select proper device to run
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cudnn.benchmark = True

steps = 0


def print_args(args):
    opts = vars(args)
    cprint("{:>30} Options {}".format("=" * 15, "=" * 15), 'yellow')
    for k, v in sorted(opts.items()):
        print("{:>30} : {}".format(k, v))
    cprint("{:>30} Options {}".format("=" * 15, "=" * 15), 'yellow')


def main(args):
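# Editor's note (sketch): the body of main(args) is not included in this excerpt.
# The function below is a hypothetical illustration, not the original code, of how
# the module-level `writer`, `steps`, and `device` are commonly combined with the
# imported AverageMeter/Bar helpers in a training loop. The `model`,
# `train_loader`, `optimizer`, and `criterion` names are assumptions introduced
# only for this example.
def _example_train_epoch(model, train_loader, optimizer, criterion, epoch):
    global steps

    losses = AverageMeter()
    bar = Bar("Train", max=len(train_loader))

    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # track the running loss and log it to TensorBoard via the global writer
        losses.update(loss.item(), inputs.size(0))
        writer.add_scalar("train/loss", losses.avg, steps)
        steps += 1

        bar.suffix = "({batch}/{size}) epoch: {epoch} | loss: {loss:.4f}".format(
            batch=i + 1, size=len(train_loader), epoch=epoch, loss=losses.avg
        )
        bar.next()
    bar.finish()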