def visualize(logging_file_path):
    """Replay a hyper-parameter-search text log into tensorboard.

    Each line starting with '[INFO]' opens a new trial; following
    '<key>, Value: <v>' lines are collected as that trial's configuration,
    and the ', result is: <x>' line closes the trial, logging hparams plus
    '_perf' scalars under the trial index.

    :param logging_file_path: path to the log file to parse.
    """
    # Use a context manager so the handle is always closed
    # (the original opened the file and never closed it).
    with open(logging_file_path) as log_file:
        log_entrys = log_file.readlines()
    writer = SummaryWriter()
    n_iter = 0
    config_dict = dict()
    score_dict = dict()
    for log_entry in log_entrys:
        # Raw string avoids the invalid-escape warning of '\[INFO\]'.
        if re.match(r'\[INFO\]', log_entry) is not None:
            # start parsing a new trial
            n_iter += 1
            config_dict = dict()
            score_dict = dict()
        elif re.match(', result is:', log_entry) is None:
            # continue parsing configuration lines
            search_obj = re.search(r'(.*), Value: (.*)', log_entry)
            config_dict[str(search_obj.group(1))] = float(search_obj.group(2))
        else:
            # parsing performance: end of the current trial
            search_obj = re.search(', result is: (.*)', log_entry)
            score_dict['_perf'] = float(search_obj.group(1))
            # tensorboardX add_hparams: one hparams run per trial
            writer.add_hparams(config_dict, score_dict, name="trial" + str(n_iter))
            writer.add_scalar('_perf', score_dict['_perf'], n_iter)
            writer.add_scalars('data/timeline', score_dict, n_iter)
    # writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def main():
    """Entry point for DDQN training/testing on LunarLander-v2.

    Parses the command line, builds the agent and tensorboard writer,
    optionally trains (saving the model and hparams), then loads the
    model and runs the test loop.
    """
    _current_datetime = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    ## arguments ##
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # train
    parser.add_argument('--warmup', default=10000, type=int, help='number of warmup steps')
    parser.add_argument('--episode', default=1200, type=int, help='upper limit of training episodes')
    parser.add_argument('--capacity', default=10000, type=int, help='capacity of replay buffer')
    parser.add_argument('--batch_size', default=128, type=int, help='mini batch size extract from replay buffer')
    parser.add_argument('--lr', default=.0005, type=float, help='learning rate')
    parser.add_argument('--eps_decay', default=.995, type=float, help='epsilon decay rate')
    parser.add_argument('--eps_min', default=.01, type=float, help='lower bound of epsilon')
    parser.add_argument('--gamma', default=.99, type=float, help='gamma for update Q value')
    parser.add_argument('--freq', default=4, type=int, help='interval to update behavior network')
    parser.add_argument('--target_freq', default=1000, type=int, help='interval to update target network')
    # test
    parser.add_argument('--test_only', action='store_true', help='conduct test only runs')
    # 'store_true' already defaults to False; the redundant default= was dropped.
    parser.add_argument('--render', action='store_true', help='render display')
    parser.add_argument('--test_epsilon', default=.001, type=float, help='test epsilon')
    # utilities
    parser.add_argument('-d', '--device', default='cuda', help='device used for training / testing')
    parser.add_argument('-m', '--model', default='models/ddqn-{}.pth'.format(_current_datetime),
                        help='path to pretrained model / model save path')
    parser.add_argument('--logdir', default='log/ddqn/{}'.format(_current_datetime),
                        help='path to tensorboard log')
    parser.add_argument('--seed', default=2021111, type=int, help='random seed')
    args = parser.parse_args()
    ## main ##
    env_name = 'LunarLander-v2'
    agent = DDQN(args)
    writer = SummaryWriter(args.logdir)
    if not args.test_only:
        os.makedirs('checkpoints', exist_ok=True)
        os.makedirs('models', exist_ok=True)
        ewma_reward = train(args, env_name, agent, writer)
        # Record the full arg namespace plus the final EWMA reward as hparams.
        writer.add_hparams(args.__dict__, {'hparams/Ewma Reward': ewma_reward})
        agent.save(args.model)
    # Always reload from disk before testing, even right after training.
    agent.load(args.model)
    test(args, env_name, agent, writer)
class TensorboardX:
    """Thin convenience wrapper around a tensorboardX SummaryWriter."""

    def __init__(self, logdir, run_name):
        """Create the underlying writer logging into `logdir`."""
        self._logdir = logdir
        self._writer = SummaryWriter(logdir=logdir)
        self.run_name = run_name

    def args(self, arg_text):
        """Record the command-line/arguments text under the "args" tag."""
        self._writer.add_text("args", arg_text)

    def meta(self, params):
        """Record a dict of hyper-parameters (no metrics attached)."""
        self._writer.add_hparams(hparam_dict=params, metric_dict={})

    def log(self, name, value, step):
        """Record one scalar `value` under `name` at the given `step`."""
        self._writer.add_scalar(name, value, step)
def init_tensorboard(self):
    """Create a tensorboardX SummaryWriter and log both config dicts as hparams.

    :return: the created SummaryWriter (caller owns closing it).
    """
    from tensorboardX import SummaryWriter
    writer = SummaryWriter()
    # NOTE(review): the original also built a {"agent_config", "exp_config"}
    # dict that was never used; removed as dead code.
    writer.add_hparams(self._exp_config, {})
    writer.add_hparams(self._agent_config, {})
    return writer
def visualize(logging_file_path):
    """Replay a hyper-parameter-search text log into tensorboard.

    Like the basic parser, but also accepts '<key>, Constant: <v>' config
    lines and tracks the running minimum of '_perf' as a lower bound.

    :param logging_file_path: path to the log file to parse.
    """
    print('---------entering visualize-----------')
    # Context manager so the handle is always closed (original leaked it).
    with open(logging_file_path) as log_file:
        log_entrys = log_file.readlines()
    # writer = SummaryWriter(write_to_disk=False) # create the SummaryWriter Object
    writer = SummaryWriter(write_to_disk=True)  # create the SummaryWriter Object
    n_iter = 0
    config_dict = dict()
    score_dict = dict()
    score_bound_dict = dict()
    for log_entry in log_entrys:
        # Raw string avoids the invalid-escape warning of '\[INFO\]'.
        if re.match(r'\[INFO\]', log_entry) is not None:
            # start parsing a new trial
            n_iter += 1
            config_dict = dict()
            score_dict = dict()
        elif re.match(', result is:', log_entry) is None:
            # continue parsing configuration; fall back to "Constant" lines
            search_obj = re.search(r'(.*), Value: (.*)', log_entry)
            if search_obj is None:
                search_obj = re.search(r'(.*), Constant: (.*)', log_entry)
            # NOTE(review): if neither pattern matches, group() raises
            # AttributeError — assumes the log only contains these two forms.
            config_dict[str(search_obj.group(1))] = float(search_obj.group(2))
            print('key is: ', str(search_obj.group(1)), ' value is : ',
                  config_dict[str(search_obj.group(1))])
        else:
            # parsing performance: end of the current trial
            search_obj = re.search(', result is: (.*)', log_entry)
            score_dict['_perf'] = float(search_obj.group(1))
            # Track the best (lowest) performance seen so far.
            if n_iter == 1:
                lower_bound = score_dict['_perf']
                score_bound_dict = {'lower_bound': lower_bound}
            else:
                lower_bound = min(lower_bound, score_dict['_perf'])
                score_bound_dict = {'lower_bound': lower_bound}
            writer.add_hparams(config_dict, score_dict, name="trial" + str(n_iter))
            writer.add_scalar('_perf', score_dict['_perf'], n_iter)
            writer.add_scalars('data/score_bound', score_bound_dict, n_iter)
    # writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def main(cfg: omegaconf.DictConfig):
    """Train a DQN agent on the configured Atari environment.

    Plays episodes until ``cfg.train.max_episodes``, optimizing after every
    step, and checkpoints the model whenever the mean reward over the last
    40 episodes improves.

    :param cfg: hydra/omegaconf configuration (exp, train sections used).
    """
    # create the environment
    env = atari_wrappers.make_env(cfg.exp.env)
    env = gym.wrappers.Monitor(env, "recording/", force=True)
    obs = env.reset()
    # TensorBoard
    writer = SummaryWriter()
    writer.add_hparams(flatten_dict(cfg), {})
    # NOTE(review): with stdlib logging this call mis-formats ('Hyperparams:'
    # has no placeholder for cfg) — confirm which logger is in use.
    logger.info('Hyperparams:', cfg)
    # create the agent
    agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer, cfg=cfg)
    n_games = 0
    max_mean_40_reward = -sys.maxsize
    # Play MAX_N_GAMES games
    while n_games < cfg.train.max_episodes:
        # act greedly
        action = agent.act_eps_greedy(obs)
        # one step on the environment
        new_obs, reward, done, _ = env.step(action)
        # add the environment feedback to the agent
        agent.add_env_feedback(obs, action, new_obs, reward, done)
        # sample and optimize NB: the agent could wait to have enough memories
        agent.sample_and_optimize(cfg.train.batch_size)
        obs = new_obs
        if done:
            n_games += 1
            agent.print_info()
            agent.reset_stats()
            obs = env.reset()
            if agent.rewards:
                current_mean_40_reward = np.mean(agent.rewards[-40:])
                if current_mean_40_reward > max_mean_40_reward:
                    # BUGFIX: record the new best; the original never updated
                    # max_mean_40_reward, so it re-saved after every episode.
                    max_mean_40_reward = current_mean_40_reward
                    agent.save_model(cfg.train.best_checkpoint)
    writer.close()
class TensorboardSession(Session):
    """Session backend that records sources, hparams and metrics to tensorboard."""

    writer: SummaryWriter

    def __init__(self, source_paths: Union[List[str], str], **kwargs) -> None:
        """Snapshot the given source files as a markdown text entry.

        :param source_paths: paths of source files to embed in the run.
        """
        self.writer = SummaryWriter()
        source_md = []
        for path in source_paths:
            if isfile(path):
                # with-block closes the handle even if read() raises
                # (the original could leak it).
                with open(path, mode="r") as fs:
                    source_md.append(f"* {basename(path)}\n\n\"\"\"python")
                    source_md.append(fs.read())
                    source_md.append("\"\"\"\n\n")
            else:
                # BUGFIX: message previously said "CometSession".
                print(f"TensorboardSession: Warning, No such file - {path}")
        self.writer.add_text("Source codes", "\n".join(source_md))

    def log_parameters(self, params: Dict[str, Any]) -> None:
        """Record hyper-parameters.

        BUGFIX: tensorboardX add_hparams requires a metric_dict; the
        original single-argument call raised TypeError at runtime.
        """
        self.writer.add_hparams(params, {})

    def log_metric(self, val_name: str, value: Any) -> None:
        """Record a single scalar metric (no explicit global step)."""
        self.writer.add_scalar(val_name, value)
class Logger(object):
    """
    Logger class to use tensorboard to visualize ANNarchy simulations. Requires the `tensorboardX` package (pip install tensorboardX).

    The Logger class is a thin wrapper around tensorboardX.SummaryWriter, which you could also use directly. The doc is available at <https://tensorboardx.readthedocs.io/>. Tensorboard can read any logging data, as long as they are saved in the right format (tfevents), so it is not limited to tensorflow. TensorboardX has been developed to allow the use of tensorboard with pytorch.

    The extension has to be imported explicitly:

    ```python
    from ANNarchy.extensions.tensorboard import Logger
    ```

    The ``Logger`` class has to be closed properly at the end of the script, so it is advised to use a context:

    ```python
    with Logger() as logger:
        logger.add_scalar("Accuracy", acc, trial)
    ```

    You can also make sure to close it:

    ```python
    logger = Logger()
    logger.add_scalar("Accuracy", acc, trial)
    logger.close()
    ```

    By default, the logs will be written in a subfolder of ``./runs/`` (which will be created in the current directory). The subfolder is a combination of the current datetime and of the hostname, e.g. ``./runs/Apr22_12-11-22_machine``. You can control these two elements by passing arguments to ``Logger()``:

    ```python
    with Logger(logdir="/tmp/annarchy", experiment="trial1"): # logs in /tmp/annarchy/trial1
    ```

    The ``add_*`` methods allow you to log various structures, such as scalars, images, histograms, figures, etc. A tag should be given to each plot. In the example above, the figure with the accuracy will be labelled "Accuracy" in tensorboard. You can also group plots together with tags such as "Global performance/Accuracy", "Global performance/Error rate", "Neural activity/Population 1", etc.

    After (or while) logging data within your simulation, run `tensorboard` in the terminal by specifying the log directory:

    ```bash
    tensorboard --logdir runs
    ```

    TensorboardX enqueues the data in memory before writing to disk. You can force flushing with:

    ```python
    logger.flush()
    ```
    """

    def __init__(self, logdir="runs/", experiment=None):
        """
        :param logdir: path (absolute or relative) to the logging directory. Subfolders will be created for each individual run. The default is "runs/"
        :param experiment: name of the subfolder for the current run. By default, it is a combination of the current time and the hostname (e.g. Apr22_12-11-22_machine). If you reuse an experiment name, the data will be appended.
        """
        self.logdir = logdir
        self.experiment = experiment

        # Create the logdir if it does not exist
        if not os.path.exists(self.logdir):
            os.makedirs(self.logdir)

        if not experiment:
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            self.currentlogdir = os.path.join(
                self.logdir, current_time + '_' + socket.gethostname())
        else:
            self.currentlogdir = self.logdir + "/" + self.experiment

        print("Logging in", self.currentlogdir)

        self._create_summary_writer()

    def _create_summary_writer(self):
        # One writer per Logger instance; flushes every 10 s.
        self._summary = SummaryWriter(self.currentlogdir,
                                      comment="",
                                      purge_step=None,
                                      max_queue=10,
                                      flush_secs=10,
                                      filename_suffix='',
                                      write_to_disk=True)

    # Logging methods
    def add_scalar(self, tag, value, step=None):
        """
        Logs a single scalar value, e.g. a success rate at various stages of learning.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                accuracy = ...
                logger.add_scalar("Accuracy", accuracy, trial)
        ```

        :param tag: name of the figure in tensorboard.
        :param value: value.
        :param step: time index.
        """
        self._summary.add_scalar(tag=tag, scalar_value=value,
                                 global_step=step, walltime=None)

    def add_scalars(self, tag, value, step=None):
        """
        Logs multiple scalar values to be displayed in the same figure, e.g. several metrics or neural activities.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                act1 = pop.r[0]
                act2 = pop.r[1]
                logger.add_scalars(
                    "Accuracy",
                    {'First neuron': act1, 'Second neuron': act2}, trial)
        ```

        :param tag: name of the figure in tensorboard.
        :param value: dictionary of values.
        :param step: time index.
        """
        self._summary.add_scalars(main_tag=tag, tag_scalar_dict=value,
                                  global_step=step, walltime=None)

    def add_image(self, tag, img, step=None, equalize=False):
        """
        Logs an image.

        The image must be a numpy array of size (height, width) for monochrome images or (height, width, 3) for colored images. The values should either be integers between 0 and 255 or floats between 0 and 1. The parameter ``equalize`` forces the values to be between 0 and 1 by equalizing using the min/max values.

        Example::

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                img = pop.r.reshape((10, 10))
                logger.add_image("Population / Firing rate", img, trial, equalize=True)
        ```

        :param tag: name of the figure in tensorboard.
        :param img: array for the image.
        :param step: time index.
        :param equalize: rescales the pixels between 0 and 1 using the min and max values of the array.
        """
        if img.ndim == 2:
            if equalize:
                # BUGFIX: np.float was removed from NumPy; the builtin float
                # is what the old alias resolved to (float64).
                img = img.astype(float)
                img = (img - img.min()) / (img.max() - img.min())
            self._summary.add_image(tag=tag, img_tensor=img, global_step=step,
                                    walltime=None, dataformats='HW')
        elif img.ndim == 3:
            if not img.shape[2] == 3:
                Global._error(
                    "Logger.add_image: color images must be of shape (H, W, 3)."
                )
            if equalize:
                img = np.array(img).astype(float)
                img = (img - img.min()) / (img.max() - img.min())
            self._summary.add_image(tag=tag, img_tensor=img, global_step=step,
                                    walltime=None, dataformats='HWC')
        else:
            Global._error(
                "Logger.add_image: images must be of shape (H, W) or (H, W, 3)."
            )

    def add_images(self, tag, img, step=None, equalize=False,
                   equalize_per_image=False):
        """
        Logs a set of images (e.g. receptive fields).

        The numpy array must be of size (number, height, width) for monochrome images or (number, height, width, 3) for colored images. The values should either be integers between 0 and 255 or floats between 0 and 1. The parameter ``equalize`` forces the values to be between 0 and 1 by equalizing using the min/max values.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                weights= proj.w.reshape(100, 10, 10) # 100 post neurons, 10*10 pre neurons
                logger.add_images("Projection/Receptive fields", weights, trial, equalize=True)
        ```

        :param tag: name of the figure in tensorboard.
        :param img: array for the images.
        :param step: time index.
        :param equalize: rescales the pixels between 0 and 1 using the min and max values of the array.
        :param equalize_per_image: whether the rescaling should be using the global min/max values of the array, or per image. Has no effect if equalize of False.
        """
        if img.ndim == 3:
            # Promote monochrome (N, H, W) to (N, H, W, 1).
            img = np.expand_dims(img, axis=3)
        if equalize:
            # BUGFIX: np.float removed from NumPy; use the builtin float.
            img = np.array(img).astype(float)
            if not equalize_per_image:
                img = (img - img.min()) / (img.max() - img.min())
            else:
                for i in range(img.shape[0]):
                    img[i, ...] = (img[i, ...] - img[i, ...].min()) / (
                        img[i, ...].max() - img[i, ...].min())
        self._summary.add_images(tag=tag, img_tensor=img, global_step=step,
                                 walltime=None, dataformats='NHWC')

    def add_parameters(self, params, metrics):
        """
        Logs parameters of a simulation.

        This should be run only once per simulation, generally at the end. This allows to compare different runs of the same network using different parameter values and study how they influence the global output metrics, such as accuracy, error rate, reaction speed, etc.

        Example:

        ```python
        with Logger() as logger:
            # ...
            logger.add_parameters({'learning_rate': lr, 'tau': tau}, {'accuracy': accuracy})
        ```

        :param params: dictionary of parameters.
        :param metrics: dictionary of metrics.
        """
        self._summary.add_hparams(params, metrics)

    def add_histogram(self, tag, hist, step=None):
        """
        Logs an histogram.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                weights= proj.w.flatten()
                logger.add_histogram("Weight distribution", weights, trial)
        ```

        :param tag: name of the figure in tensorboard.
        :param hist: a list or 1D numpy array of values.
        :param step: time index.
        """
        self._summary.add_histogram(tag, hist, step)

    def add_figure(self, tag, figure, step=None, close=True):
        """
        Logs a Matplotlib figure.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                fig = plt.figure()
                plt.plot(pop.r)
                logger.add_figure("Activity", fig, trial)
        ```

        :param tag: name of the image in tensorboard.
        :param figure: a list or 1D numpy array of values.
        :param step: time index.
        :param close: whether the logger will close the figure when done (default: True).
        """
        import matplotlib.pyplot as plt
        import matplotlib.backends.backend_agg as plt_backend_agg
        # Render the figure to an RGBA buffer, then drop the alpha channel
        # and move it to CHW layout for add_image.
        canvas = plt_backend_agg.FigureCanvasAgg(figure)
        canvas.draw()
        data = np.frombuffer(canvas.buffer_rgba(), dtype=np.uint8)
        w, h = figure.canvas.get_width_height()
        image_hwc = data.reshape([h, w, 4])[:, :, 0:3]
        image_chw = np.moveaxis(image_hwc, source=2, destination=0)
        if close:
            plt.close(figure)
        self._summary.add_image(tag, image_chw, step)

    # Resource management
    def flush(self):
        "Forces the logged data to be flushed to disk."
        self._summary.flush()

    def close(self):
        "Closes the logger."
        self._summary.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
class TensorboardXLogger():
    def __init__(self, logs_dir="", writer=None):
        '''
        Initialize a tensorboard logger.

        Note that this logger relies on tensorboardX and only provides the
        tensorboard hparams log. An ImportError will be raised if
        tensorboardX is missing.

        :param logs_dir: root directory for the log, default to the current working dir
        :param writer: shared tensorboardX SummaryWriter, default to None.
        '''
        self.logs_dir = logs_dir
        self._file_writer = None
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            print("pip install tensorboardx to see TensorBoard log files.")
            raise
        # Reuse a caller-supplied writer when given; otherwise open our own.
        self._file_writer = writer if writer else SummaryWriter(logdir=self.logs_dir)

    def run(self, config, metric):
        '''
        Write log files (event files).

        The log files are arranged as follows:
        self.logs_dir
        |--eventfile_all
        |--Trial_1
        |  |--eventfile_1
        |--Trial_2
        |  |--eventfile_2
        ...

        :param config: dict mapping trial name -> dict of trial config values
        :param metric: dict mapping trial name -> dict of trial metric results

        Example:
        Config = {"run1": {"lr": 0.001, "hidden_units": 32},
                  "run2": {"lr": 0.01, "hidden_units": 64}}
        Metric = {"run1": {"acc": 0.91, "time": 32.13},
                  "run2": {"acc": 0.93, "time": 61.33}}

        Note that the keys of config and metric should be exactly the same.
        '''
        # keys check
        assert config.keys() == metric.keys(),\
            "The keys of config and metric should be exactly the same"
        # validation check: drop unsupported or NaN metric values,
        # and drop None-valued config entries.
        new_metric = {
            trial: {
                name: val
                for name, val in metric[trial].items()
                if type(val) in VALID_SUMMARY_TYPES and not np.isnan(val)
            }
            for trial in metric.keys()
        }
        new_config = {
            trial: {
                name: val
                for name, val in config[trial].items()
                if val is not None
            }
            for trial in config.keys()
        }
        # hparams log write
        for trial in new_metric.keys():
            # new_config[trial]["address"] = trial
            self._file_writer.add_hparams(new_config[trial], new_metric[trial])

    def close(self):
        '''
        Close the logger.
        '''
        self._file_writer.close()
# Accumulate per-batch test metrics over the whole test set.
# NOTE(review): sum_log_prob is incremented below but not initialized here —
# presumably set to 0. earlier in the script; verify.
sum_bpd = 0.
sum_elbo_gap = 0.
with torch.no_grad():
    for (x, _) in tqdm.tqdm(test_loader):
        try:
            # metrics() here is a function; it is shadowed by the dict below.
            all_metrics = metrics(density, x, num_elbo_samples)
            sum_log_prob += all_metrics["log-prob"].sum().item()
            sum_bpd += all_metrics["bpd"].sum().item()
            sum_elbo_gap += all_metrics["elbo-gap"].sum().item()
        except Exception as e:
            # NOTE(review): drops into an interactive debugger on any failure —
            # fine for research scripts, remove before unattended runs.
            import ipdb; ipdb.set_trace()
            print("Error {0} for path {1}".format(e, path))

# Average the sums over the number of test points.
points_in_test = test_loader.dataset.x.shape[0]
# NOTE(review): this rebinding shadows the metrics() function used above.
metrics = {
    "bpd": sum_bpd / points_in_test,
    "log-prob": sum_log_prob / points_in_test,
    "elbo-gap": sum_elbo_gap / points_in_test,
    "epoch": checkpoint["epoch"],
    "num-params": num_params(density),
    "test-elbo-samples": num_elbo_samples
}

# Persist raw metrics to disk, then re-key them for the tensorboard hparams UI.
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)

metrics = {f"hparams/{k}": v for k, v in metrics.items()}
# NOTE(review): `vals` and `path` are defined earlier in the script (not
# visible here); presumably the run's hyper-parameter values and run dir.
writer = SummaryWriter(logdir=path)
writer.add_hparams(hparam_dict=vals, metric_dict=metrics)
class D3RLPyLogger:
    """Experiment logger writing CSV metrics, params.json and (optionally)
    tensorboard events for d3rlpy-style training runs."""

    def __init__(self, experiment_name, root_dir='logs', verbose=True,
                 tensorboard=True, with_timestamp=True):
        self.verbose = verbose
        # add timestamp to prevent unintentional overwrites
        # Retry loop: with a timestamp, a name collision means another run
        # started in the same second — sleep 1 s and regenerate the name.
        # Without a timestamp a collision is fatal.
        while True:
            if with_timestamp:
                date = datetime.now().strftime('%Y%m%d%H%M%S')
                self.experiment_name = experiment_name + '_' + date
            else:
                self.experiment_name = experiment_name
            self.logdir = os.path.join(root_dir, self.experiment_name)
            if not os.path.exists(self.logdir):
                os.makedirs(self.logdir)
                break
            else:
                if with_timestamp:
                    time.sleep(1.0)
                else:
                    raise ValueError('%s already exists.' % self.logdir)
        # name -> list of raw values accumulated since the last commit()
        self.metrics_buffer = {}
        if tensorboard:
            from tensorboardX import SummaryWriter
            # NOTE(review): tensorboard events go under 'runs/', not logdir.
            tfboard_path = os.path.join('runs', self.experiment_name)
            self.writer = SummaryWriter(logdir=tfboard_path)
        else:
            self.writer = None
        self.params = None

    def add_params(self, params):
        """Record experiment parameters once; saved as JSON and kept
        (scalars only) for the hparams plugin."""
        assert self.params is None, 'add_params can be called only once.'
        # save dictionary as json file
        with open(os.path.join(self.logdir, 'params.json'), 'w') as f:
            f.write(json.dumps(params, default=default_json_encoder))
        if self.verbose:
            for key, val in params.items():
                print('{}={}'.format(key, val))
        # remove non-scaler values for HParams
        self.params = {k: v for k, v in params.items() if np.isscalar(v)}

    def add_metric(self, name, value):
        """Buffer one metric sample; averaged and flushed by commit()."""
        if name not in self.metrics_buffer:
            self.metrics_buffer[name] = []
        self.metrics_buffer[name].append(value)

    def commit(self, epoch, step):
        """Average buffered metrics, append them to per-metric CSV files,
        push them to tensorboard, then clear the buffers."""
        metrics = {}
        for name, buffer in self.metrics_buffer.items():
            metric = sum(buffer) / len(buffer)
            with open(os.path.join(self.logdir, name + '.csv'), 'a') as f:
                print('%d,%d,%f' % (epoch, step, metric), file=f)
            if self.verbose:
                print('epoch=%d step=%d %s=%f' % (epoch, step, name, metric))
            if self.writer:
                self.writer.add_scalar('metrics/' + name, metric, epoch)
            metrics[name] = metric
            self.metrics_buffer[name] = []
        if self.params and self.writer:
            # tensorboardX-specific keywords (name / global_step).
            self.writer.add_hparams(self.params,
                                    metrics,
                                    name=self.experiment_name,
                                    global_step=epoch)

    def save_model(self, epoch, algo):
        # save entire model
        model_path = os.path.join(self.logdir, 'model_%d.pt' % epoch)
        algo.save_model(model_path)
class BaseTrainer:
    """Base class for all trainers."""

    def __init__(self, model, loss, metrics, optimizer, lr_scheduler, config):
        # config is a project config object: supports both dict-style access
        # (config['trainer']) and attributes (config.save_dir, config.resume).
        self.config = config
        self.hparams = get_hparams_from_config(self.config)
        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)
        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.exp_dir = config.save_dir
        self.checkpoint_dir = config.save_dir
        self.perf_log_path = os.path.join(config.save_dir, 'perf_log.txt')
        self.info_checkpoint_path = os.path.join(config.save_dir,
                                                 'info_checkpoint.txt')
        self.monitoring_path = os.path.join(config.save_dir, 'monitoring.json')
        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        # monitor format: 'off', 'given_epoch <N>' or '<min|max> <metric>'
        self.monitor = cfg_trainer.get('monitor', 'off')
        self.timer = AverageMeter()
        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        elif self.monitor.startswith('given_epoch'):
            self.mnt_mode, self.given_epoch = self.monitor.split()
            assert self.mnt_mode in ['given_epoch']
            self.mnt_best = 0
            self.given_epoch = int(self.given_epoch)
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = inf if self.mnt_mode == 'min' else -inf
            self.early_stop = cfg_trainer.get('early_stop', inf)
        self.start_epoch = 0
        self.epoch = 0
        self.n_samples = 0
        self.n_steps = 0
        self.writer = SummaryWriter(config.log_dir)
        self.include_optim_in_ckpts = config['trainer'].get(
            'include_optim_in_ckpts', False)
        # NOTE(review): self.skip_first_n_saves and self.num_keep_ckpts are
        # read in train()/purge_stale_checkpoints() but never set here —
        # presumably assigned by a subclass; verify.
        if config.resume is not None:
            self._resume_checkpoint(config.resume)

    @abc.abstractmethod
    def _train_epoch(self, epoch):
        """Training logic for an epoch."""
        raise NotImplementedError

    @abc.abstractmethod
    def _valid_epoch(self, epoch, sets):
        """Validation logic for an epoch."""
        raise NotImplementedError

    def train(self):
        """Full training logic."""
        not_improved_count = 0
        for epoch in range(self.start_epoch, self.epochs + 1):
            self.epoch = epoch
            epoch_start = time.time()
            logger.debug('Starting training epoch %s ...', str(epoch))
            train_start = time.time()
            result = self._train_epoch(epoch)
            for key, val in result.items():
                self.writer.add_scalar(f'{key}', val, epoch)
            self.timer.update('epoch.train', time.time() - train_start)

            logger.debug('Starting evaluating epoch %s ...', str(epoch))
            valid_start = time.time()
            val_log = self._valid_epoch(epoch, sets='continuous_eval')
            logger.debug('Updating val log with results ...')
            result.update(val_log)
            self.timer.update('epoch.valid', time.time() - valid_start)

            checkpoint_start = time.time()
            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                # Metrics recorded during the continuous eval
                if key == 'metrics':
                    # Flatten nested dataset -> metric-type -> metric dicts
                    # into 'dataset/type/name' keys.
                    for dataset_name, dataset_metrics in value.items():
                        for metric_type, metric_dict in dataset_metrics.items():
                            for metric_name, metric_value in metric_dict.items():
                                log[f'{dataset_name}/{metric_type}'
                                    f'/{metric_name}'] = metric_value
                else:
                    log[key] = value

            # eval model according to configured metric, save best ckpt as
            # trained_model.
            best = False
            if self.mnt_mode in ['min', 'max']:
                try:
                    # check whether specified metric improved or not, according to
                    # specified metric(mnt_metric)
                    lower = log[self.mnt_metric] <= self.mnt_best
                    higher = log[self.mnt_metric] >= self.mnt_best
                    improved = (self.mnt_mode == 'min' and lower) or \
                        (self.mnt_mode == 'max' and higher)
                except KeyError:
                    logger.warning(
                        'Warning: Metric %s not found, '
                        'perf monitoring is disabled.', self.mnt_metric)
                    self.mnt_mode = 'off'
                    improved = False
                    not_improved_count = 0
                if improved:
                    self.mnt_best = log[self.mnt_metric]
                    not_improved_count = 0
                    best = True
                else:
                    not_improved_count += 1
                if not_improved_count > self.early_stop:
                    logger.info(
                        'Val performance didn\'t improve for %s epochs. '
                        'Training stops.', self.early_stop)
                    break

            # If checkpointing is done intermittently, still save models that
            # outperform the best metric.
            save_best = best and self.mnt_metric != 'epoch'
            if self.mnt_mode in ['given_epoch'] and epoch == self.given_epoch:
                save_best = True

            # Due to the fast runtime/slow HDD combination, checkpointing can dominate
            # the total training time, so we optionally skip checkpoints for some of
            # the first epochs
            if epoch < self.skip_first_n_saves:
                msg = f'Skipping ckpt save at epoch {epoch} < {self.skip_first_n_saves}'
                logger.info(msg)
            elif epoch % self.save_period == 0 or save_best:
                self._save_checkpoint(epoch, save_best=best)
            if epoch > self.num_keep_ckpts:
                self.purge_stale_checkpoints()
            self.timer.update('epoch.checkpoint', time.time() - checkpoint_start)

            self.timer.update('epoch.total', time.time() - epoch_start)
            # Fold per-phase timings into the log and tensorboard.
            for key, val in self.timer.dic.items():
                for metric in ['avg', 'sum']:
                    log[f'timer.{key}.{metric}'] = self.timer.dic[key][metric]
                self.writer.add_scalar(f'timer_epoch/{key}',
                                       self.timer.dic[key]['sum'], epoch)
            self.writer.add_text('exp_dir', str(self.exp_dir), epoch)
            self.timer.reset()
            log['mnt_best'] = self.mnt_best
            log['not_improved_count'] = not_improved_count
            self.writer.add_scalar('mnt_best', self.mnt_best, epoch)

            # print results
            for metric_name, metric_value in log.items():
                if '/cols' in metric_name:
                    continue
                if 'timer.' in metric_name:
                    logger.debug(' {:15s}: {}'.format(str(metric_name),
                                                      metric_value))
                else:
                    logger.info(' {:15s}: {}'.format(str(metric_name),
                                                     metric_value))

            # Save main results in the perf log
            log_light = {}
            for key, value in log.items():
                if not key.endswith('cols'):
                    log_light[key] = value
            update_perf_log(log_light, self.perf_log_path)

            # Log results to Tensorboard
            self.writer.add_hparams(self.hparams, {
                'hparam/accuracy': log[self.mnt_metric],
                'hparam/mnt_best': self.mnt_best,
                'hparam/epoch': epoch
            }, name='hparams')

            # # Ray-tune recording
            # try:
            #     from ray.tune import track
            #     acc = log[self.mnt_metric]
            #     track.log(mean_accuracy=acc, exp_dir=self.exp_dir, **log_light)
            # except Exception as e:
            #     print(e)

    def evaluate(self):
        """Final evaluation."""
        sets = 'final_eval'
        ckpt_path = self.config.save_dir / 'trained_model.pth'
        if os.path.exists(ckpt_path):
            self._resume_checkpoint(ckpt_path)
        else:
            msg = (
                f'The checkpoint {ckpt_path} does not exist and cannot be loaded. '
                f'The model will not be resumed to that checkpoint.')
            logger.info(msg)
        final_result = self._valid_epoch(epoch=self.epoch, sets=sets)
        nested_metrics = final_result['metrics']
        # Flatten per-dataset metrics to 'type/name/final_eval' keys.
        log = {}
        for dataset_name, dataset_metrics in nested_metrics.items():
            log[dataset_name] = {}
            for metric_type, metric_dict in dataset_metrics.items():
                for metric_name, metric_value in metric_dict.items():
                    log[dataset_name][
                        f'{metric_type}/{metric_name}/{sets}'] = metric_value

        # Print results
        for dataset_name, metric_dict in log.items():
            logger.info('%s:', dataset_name)
            for metric_name, metric_value in metric_dict.items():
                if '/cols' in metric_name:
                    continue
                if 'timer.' in metric_name:
                    logger.debug(' {:15s}: {}'.format(str(metric_name),
                                                      metric_value))
                else:
                    logger.info(' {:15s}: {}'.format(str(metric_name),
                                                     metric_value))

        # Logging dataset perfs
        save_dir = self.config.save_dir
        results_on_datasets_log_path = os.path.join(save_dir,
                                                    'exp_results.json')
        if os.path.exists(results_on_datasets_log_path):
            with open(results_on_datasets_log_path) as json_file:
                res = json.load(json_file)
        else:
            res = collections.OrderedDict({})
        if 'perfs' not in res.keys():
            res['perfs'] = {}
        # NOTE(review): this overwrites any previously stored perfs rather
        # than merging — the 'perfs' key init above is therefore redundant.
        res['perfs'] = log
        res['checkpoint_epoch'] = self.loaded_epoch
        logger.info('Best epoch for the monitored metric: %s',
                    self.loaded_epoch)
        with open(results_on_datasets_log_path, 'w') as fp:
            json.dump(res, fp, indent=4)
        exp_completed_flag_path = os.path.join(save_dir,
                                               'exp_completed_flag.txt')
        # Touch the exp_completed_flag_path to mark that the experiment is completed
        with open(exp_completed_flag_path, 'a'):
            os.utime(exp_completed_flag_path, None)

    def test(self, sentence):
        """Final evaluation."""
        sets = 'test'
        ckpt_path = self.config.save_dir / 'trained_model.pth'
        if os.path.exists(ckpt_path):
            self._resume_checkpoint(ckpt_path)
        else:
            msg = (
                f'The checkpoint {ckpt_path} does not exist and cannot be loaded. '
                f'The model will not be resumed to that checkpoint.')
            logger.info(msg)
        self.reading_from = "mult_h5"
        self.cache_dir = os.path.join(os.path.dirname(self.config.demo_dir),
                                      "vid_feat_files", self.reading_from)
        vid_list_path = "train_list_jsfusion.txt"
        vid_list_path = os.path.join(self.config.demo_dir, vid_list_path)
        self.sentence = sentence
        with open(vid_list_path) as f:
            vid_list = f.readlines()
        for i in range(len(vid_list)):
            vid = vid_list[i]
            # h5 cache layout shards by the first three characters of the
            # video id: <c0>/<c1>/<c2>/<vid>.h5 (vid still has its newline,
            # stripped by the [:-1] below).
            output_basename = f"{vid[0]}/{vid[1]}/{vid[2]}/{vid}"
            output_basename = output_basename[:-1] + '.h5'
            dataset_file_path = os.path.join(self.cache_dir, output_basename)
            with h5py.File(dataset_file_path, "r+") as dataset_file:
                nb_captions = len([
                    k for k in dataset_file.keys()
                    if k.startswith("raw_captions.")
                ])
                # Replace every stored caption with the query sentence.
                for j in range(nb_captions):
                    try:
                        del dataset_file[f"raw_captions.{j}"]
                    except:
                        print(f"raw_captions.{j}" + "already deleted")
                    dt = h5py.special_dtype(vlen=str)
                    dataset_file.create_dataset(f"raw_captions.{j}",
                                                data=self.sentence,
                                                dtype=dt)
        final_result = self._valid_epoch(epoch=self.epoch, sets=sets)
        return final_result

    def purge_stale_checkpoints(self):
        """Remove checkpoints that are no longer neededself.

        NOTE: This function assumes that the `best` checkpoint has already
        been renamed to have a format that differs from
        `checkpoint-epoch<num>.pth`
        """
        found_epoch_ckpts = list(
            self.checkpoint_dir.glob('checkpoint-epoch*.pth'))
        if len(found_epoch_ckpts) <= self.num_keep_ckpts:
            return

        # purge the oldest checkpoints
        regex = r'.*checkpoint-epoch(\d+)[.]pth$'
        epochs = [
            int(re.search(regex, str(x)).groups()[0])
            for x in found_epoch_ckpts
        ]
        sorted_ckpts = sorted(list(zip(epochs, found_epoch_ckpts)),
                              key=lambda x: -x[0])
        for epoch, stale_ckpt in sorted_ckpts[self.num_keep_ckpts:]:
            tic = time.time()
            stale_ckpt.unlink()
            msg = (f'removing stale ckpt [epoch {epoch}] '
                   f'[took {time.time() - tic:.2f}s]')
            logger.info(msg)

    def _prepare_device(self, n_gpu_use):
        """Setup GPU device if available, move model into configured device."""
        n_gpu = torch.cuda.device_count()
        msg = f'n_gpu = torch.cuda.device_count(): {n_gpu} (nb of gpus available)'
        logger.debug(msg)
        if n_gpu_use > 0 and n_gpu == 0:
            logger.warning(
                'Warning: There\'s no GPU available on this machine,'
                'training will be performed on CPU.')
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = ('Warning: The number of GPU\'s configured to use is {}'
                   ', but only {} are available '
                   'on this machine.'.format(n_gpu_use, n_gpu))
            logger.warning(msg)
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        logger.debug('device: %s', device)
        list_ids = list(range(n_gpu_use))
        logger.debug('list_ids: %s', list_ids)
        return device, list_ids

    def _save_checkpoint(self, epoch, save_best=False):
        """Saving checkpoints."""
        arch = type(self.model).__name__
        # To accomodate the DataParallel model that adds the prefix "module"
        # to the parameters
        try:
            state_dict = self.model.module.state_dict()
        except AttributeError:
            state_dict = self.model.state_dict()
        state = {
            'arch': arch,
            'epoch': epoch,
            'state_dict': state_dict,
            'monitor_best': self.mnt_best,
            'config': self.config,
            'n_samples': self.n_samples,
            'n_steps': self.n_steps,
        }
        if self.include_optim_in_ckpts:
            state['optimizer'] = self.optimizer.state_dict()
            state['lr_scheduler'] = self.lr_scheduler.state_dict()
        filename = str(self.checkpoint_dir /
                       'checkpoint-epoch{}.pth'.format(epoch))
        # Write to a temp name then rename so a crash mid-save never leaves
        # a truncated checkpoint at the final path.
        filename_tmp = filename + '_'
        tic = time.time()
        logger.info('Saving checkpoint: %s ...', filename)
        torch.save(state, filename_tmp)
        os.rename(filename_tmp, filename)
        msg = f'Done in {time.time() - tic:.3f}s'
        logger.info(msg)
        if save_best:
            logger.info('Updating \'best\' checkpoint: %s ...', filename)
            best_path = str(self.checkpoint_dir / 'trained_model.pth')
            best_path_tmp = best_path + '_'
            torch.save(state, best_path_tmp)
            os.rename(best_path_tmp, best_path)
            msg = f'Done in {time.time() - tic:.3f}s'
            logger.info(msg)

    def _resume_last_checkpoint(self):
        # Resolve the most recent checkpoint in the experiment dir and load it.
        checkpoint_path = get_last_checkpoint_path(self.exp_dir)
        self._resume_checkpoint(checkpoint_path)

    def match_checkpoint_to_model(self, checkpoint, model):
        """Adapt the loaded checkpoint so that is fits the current architecture."""
        modules = ['vid_bert.embeddings.position_embeddings.weight']
        for module in modules:
            if module in model and checkpoint[module].shape != model[
                    module].shape:
                # Zero-pad along dim 0 so the stored embedding table matches
                # the (larger) table of the current model.
                padding = model[module].shape[0] - checkpoint[module].shape[0]
                padding_shape = list(model[module].shape)
                padding_shape[0] = padding
                device = checkpoint[module].device
                checkpoint[module] = torch.cat([
                    checkpoint[module],
                    torch.zeros(padding_shape, device=device)
                ], 0)
                logger.warning(
                    'Size mismatch for module %s fixed by zero padding',
                    module)

    def _resume_checkpoint(self, resume_path):
        """Resume from saved checkpoints."""
        self.resume_path = str(resume_path)
        logger.info('Loading checkpoint from: %s ...', self.resume_path)
        checkpoint = torch.load(self.resume_path, map_location=self.device)
        self.loaded_epoch = checkpoint['epoch']
        self.epoch = checkpoint['epoch']
        self.start_epoch = checkpoint['epoch'] + 1
        self.n_samples = checkpoint['n_samples']
        self.n_steps = checkpoint['n_steps']
        exp_dir_src = os.path.dirname(self.resume_path)
        # restart == True only when resuming a checkpoint from this very
        # experiment dir; otherwise treat it as a warm start (counters reset).
        restart = exp_dir_src == str(self.exp_dir)
        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            msg = (
                'Warning: Architecture configuration given in config file is'
                'different from that of checkpoint. This may yield an exception'
                ' while state_dict is being loaded.')
            logger.warning(msg)
            logger.warning('Created model conf: %s', self.config['arch'])
            logger.warning('Loaded model conf: %s',
                           checkpoint['config']['arch'])
        self.match_checkpoint_to_model(checkpoint['state_dict'],
                                       self.model.state_dict())
        self.model.load_state_dict(checkpoint['state_dict'], strict=restart)
        if restart:
            # load optimizer state from ckpt only when optimizer type is not changed.
            optim_args = checkpoint['config']['optimizer']
            if optim_args['type'] != self.config['optimizer']['type']:
                msg = (
                    'Warning: Optimizer type given in config file differs from that'
                    ' of checkpoint. Optimizer parameters not being resumed.')
                logger.warning(msg)
            else:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler_args = checkpoint['config']['lr_scheduler']
            if lr_scheduler_args['type'] != self.config['lr_scheduler']['type']:
                msg = (
                    'Warning: Lr_scheduler type given in config file differs from that'
                    ' of checkpoint. Lr_scheduler parameters not being resumed.'
                )
                logger.warning(msg)
            else:
                self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            self.mnt_best = checkpoint['monitor_best']
        else:
            self.loaded_epoch = 0
            self.epoch = 0
            self.start_epoch = 0
            self.n_samples = 0
            self.n_steps = 0
        # Log the path of the checkpoint that was loaded
        with open(self.info_checkpoint_path, 'a') as f:
            f.write(
                f"This experiment is based on the checkpoint {self.resume_path}"
                f"loaded at epoch {checkpoint['epoch']}")
        logger.info('Ckpt loaded at epoch %s.', str(checkpoint['epoch']))
def train_model(args):
    """Train a TextAttack classification/regression model.

    Loads the dataset named in ``args``, optionally filters to
    ``args.allowed_labels``, tokenizes, builds an AdamW optimizer with a
    linear warmup schedule, trains for ``args.num_train_epochs`` epochs with
    gradient accumulation and early stopping, and writes logs, periodic
    checkpoints, the best model, a README and the training args under
    ``args.output_dir``.

    Args:
        args: parsed argparse Namespace of training hyper-parameters; mutated
            in place (``num_labels``, ``do_regression``, best-score fields).

    Raises:
        ValueError: if the number of texts and labels disagree.
    """
    # logger.warn is a deprecated alias of logger.warning.
    logger.warning(
        "WARNING: TextAttack's model training feature is in beta. Please report any issues on our Github page, https://github.com/QData/TextAttack/issues."
    )
    start_time = time.time()
    make_directories(args.output_dir)
    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        wandb.init(sync_tensorboard=True)

    # Get list of text and list of label (integers) from disk.
    train_text, train_labels, eval_text, eval_labels = dataset_from_args(args)

    # Filter labels
    if args.allowed_labels:
        logger.info(
            f"Filtering samples with labels outside of {args.allowed_labels}.")
        final_train_text, final_train_labels = [], []
        for text, label in zip(train_text, train_labels):
            if label in args.allowed_labels:
                final_train_text.append(text)
                final_train_labels.append(label)
        logger.info(
            f"Filtered {len(train_text)} train samples to {len(final_train_text)} points."
        )
        train_text, train_labels = final_train_text, final_train_labels
        final_eval_text, final_eval_labels = [], []
        for text, label in zip(eval_text, eval_labels):
            if label in args.allowed_labels:
                final_eval_text.append(text)
                final_eval_labels.append(label)
        logger.info(
            f"Filtered {len(eval_text)} dev samples to {len(final_eval_text)} points."
        )
        eval_text, eval_labels = final_eval_text, final_eval_labels

    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: ({sorted(label_set)})"
    )

    if isinstance(train_labels[0], float):
        # TODO come up with a more sophisticated scheme for when to do regression
        logger.warning("Detected float labels. Doing regression.")
        args.num_labels = 1
        args.do_regression = True
    else:
        args.do_regression = False

    train_examples_len = len(train_text)
    if len(train_labels) != train_examples_len:
        raise ValueError(
            f"Number of train examples ({train_examples_len}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        # BUGFIX: message previously read "teste xamples".
        raise ValueError(
            f"Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    model = model_from_args(args, args.num_labels)
    tokenizer = model.tokenizer
    logger.info(f"Tokenizing training data. (len: {train_examples_len})")
    train_text_ids = batch_encode(tokenizer, train_text)
    logger.info(f"Tokenizing eval data (len: {len(eval_labels)})")
    eval_text_ids = batch_encode(tokenizer, eval_text)
    load_time = time.time()
    logger.info(f"Loaded data and tokenized in {load_time-start_time}s")

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)
    # Weight decay on everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = transformers.optimization.AdamW(optimizer_grouped_parameters,
                                                lr=args.learning_rate)
    # NOTE(review): args.warmup_proportion is passed straight through as
    # num_warmup_steps; if it really is a proportion it should presumably be
    # scaled by num_train_optimization_steps — confirm intended semantics.
    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_proportion,
        num_training_steps=num_train_optimization_steps,
    )

    # BUGFIX: global_step was initialized twice; a single init suffices.
    global_step = 0

    # Start Tensorboard and log hyperparams.
    from tensorboardX import SummaryWriter
    tb_writer = SummaryWriter(args.output_dir)

    def is_writable_type(obj):
        # Only plain scalars can be logged as hparams / dumped to JSON.
        for ok_type in [bool, int, str, float]:
            if isinstance(obj, ok_type):
                return True
        return False

    args_dict = {k: v for k, v in vars(args).items() if is_writable_type(v)}
    tb_writer.add_hparams(args_dict, {})

    # Start training
    logger.info("***** Running training *****")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    train_input_ids = np.array(train_text_ids)
    train_labels = np.array(train_labels)
    train_data = list(
        (ids, label) for ids, label in zip(train_input_ids, train_labels))
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)
    eval_input_ids = np.array(eval_text_ids)
    eval_labels = np.array(eval_labels)
    eval_data = list(
        (ids, label) for ids, label in zip(eval_input_ids, eval_labels))
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    # NOTE(review): `device` is not defined in this function; it presumably
    # comes from module scope — confirm.
    def get_eval_score():
        """Evaluate on the eval set: accuracy, or Pearson r for regression."""
        model.eval()
        logits = []
        labels = []
        for input_ids, batch_labels in eval_dataloader:
            if isinstance(input_ids, dict):
                ## HACK: dataloader collates dict backwards. This is a temporary
                # workaround to get ids in the right shape
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
            batch_labels = batch_labels.to(device)
            with torch.no_grad():
                batch_logits = textattack.shared.utils.model_predict(
                    model, input_ids)
            logits.extend(batch_logits.cpu().squeeze().tolist())
            labels.extend(batch_labels)
        model.train()
        logits = torch.tensor(logits)
        labels = torch.tensor(labels)
        if args.do_regression:
            pearson_correlation, pearson_p_value = scipy.stats.pearsonr(
                logits, labels)
            return pearson_correlation
        else:
            preds = logits.argmax(dim=1)
            correct = (preds == labels).sum()
            return float(correct) / len(labels)

    def save_model():
        """Persist the weights (and config, if any) of the best model."""
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        torch.save(model_to_save.state_dict(), output_model_file)
        try:
            model_to_save.config.to_json_file(output_config_file)
        except AttributeError:
            # model has no config to serialize
            pass

    def save_model_checkpoint():
        """Save a mid-training checkpoint under checkpoint-<global_step>/."""
        output_dir = os.path.join(args.output_dir,
                                  "checkpoint-{}".format(global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info(f"Checkpoint saved to {output_dir}.")

    def loss_backward(loss):
        """Average (DataParallel) and scale (grad accum) the loss, backprop."""
        if num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=False):
        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=1,
                             leave=False)
        for step, batch in enumerate(prog_bar):
            input_ids, labels = batch
            labels = labels.to(device)
            if isinstance(input_ids, dict):
                ## HACK: dataloader collates dict backwards. This is a temporary
                # workaround to get ids in the right shape
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
            logits = textattack.shared.utils.model_predict(model, input_ids)
            if args.do_regression:
                # TODO integrate with textattack `metrics` package
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits, labels)
            loss = loss_backward(loss)

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                tb_writer.add_scalar("lr", scheduler.get_last_lr()[0],
                                     global_step)
            prog_bar.set_description(f"Loss {loss.item()}")
            # Only step the optimizer once every grad_accum_steps batches.
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                save_model_checkpoint()
            model.zero_grad()
            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        eval_score = get_eval_score()
        tb_writer.add_scalar("epoch_eval_score", eval_score, global_step)
        if args.checkpoint_every_epoch:
            save_model_checkpoint()
        logger.info(
            f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
        )
        if eval_score > args.best_eval_score:
            args.best_eval_score = eval_score
            args.best_eval_score_epoch = epoch
            args.epochs_since_best_eval_score = 0
            save_model()
            logger.info(f"Best acc found. Saved model to {args.output_dir}.")
        else:
            args.epochs_since_best_eval_score += 1
            if (args.early_stopping_epochs > 0) and (
                    args.epochs_since_best_eval_score >
                    args.early_stopping_epochs):
                logger.info(
                    f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased"
                )
                break

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    # Save args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    final_args_dict = {
        k: v
        for k, v in vars(args).items() if is_writable_type(v)
    }
    with open(args_save_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(final_args_dict, indent=2) + "\n")
    logger.info(f"Wrote training args to {args_save_path}.")
"translation max ratio": TRANSLATION_MAX_RATIO, "scale range": SCALE_RANGE, # training "num epochs": NUM_EPOCHS, "trainable stem": TRAINABLE_STEM, "train batch size": TRAIN_BATCH_SIZE, "batchnorm momentum": BATCHNORM_MOMENTUM, **OPT_INIT_PARAMS, "train HW": TRAIN_HW, "minival_gt_stddevs": MINIVAL_GT_STDDEVS, "val_gt_stddevs": VAL_GT_STDDEVS, "train_gt_stddevs": TRAIN_GT_STDDEVS, "distillation_alpha": DISTILLATION_ALPHA, **SCHEDULER_HYPERPARS} HPARS_DICT = {str(k): str(v) for k, v in HPARS_DICT.items()} tb_logger.add_hparams(HPARS_DICT, {}) txt_logger.info("HYPERPARAMETERS:\n{}".format(HPARS_DICT)) # INSTANTIATE OPTIMIZER DET_POS_WEIGHT = 100 # 100 means that black happens 100 more times than white det_loss_fn = DistillationBceLossKeypointMining(DET_POS_WEIGHT, DET_POS_WEIGHT, DEVICE) # att_loss_fn = torch.nn.BCELoss(pos_weight=torch.ones(1)*7).to(DEVICE) THIS SHOULD BE THE LOSS TO USE BUT DOESNT HAVE POS_WEIGHT AND THE OTHER WORKS AMD THE GPU IS BLOATED, SO WE KEEP WITH LOGITS ATM ALTHOUGH WE PROVIDE SIGMOID. att_loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=torch.ones(1)*7).to(DEVICE) # If stem is not trainable it already has torch.no_grad so opt won't train it params = (# list(student.mid_stem.parameters()) + list(student.att_lo.parameters()) + list(student.att_mid.parameters()) + list(student.att_hi.parameters()) + list(student.att_top.parameters())) att_opt = get_sgd_optimizer(params, half_precision=HALF_PRECISION,
def train_classifier(layer, batch_size, n_epochs, bottleneck, data_str,
                     save_str):
    """Train a FeatureClassifier on grayscale images for one feature layer.

    Args:
        layer: index of the feature layer; selects the data sub-folder and
            the classifier architecture.
        batch_size: mini-batch size for the train and validation loaders.
        n_epochs: number of training epochs.
        bottleneck: bottleneck size forwarded to FeatureClassifier.
        data_str: root folder with one image sub-folder per layer.
        save_str: root folder where checkpoints are written.

    Side effects: writes TensorBoard scalars, a checkpoint every 10 epochs
    (``classifier<layer>_<epoch>.pt``), a final ``classifier<layer>_final.pt``
    and an ``all_scalars.json`` export under ``save_str/<layer>/``.
    """
    transform = transforms.Compose(
        [transforms.Grayscale(), transforms.ToTensor()])
    dataset = datasets.ImageFolder(data_str + '/' + str(layer),
                                   transform=transform)
    # BUGFIX: 'cuda' was hard-coded; fall back to CPU when CUDA is absent.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    writer = SummaryWriter()
    validation_split = 0.1
    dataset_len = len(dataset)
    indices = list(range(dataset_len))
    data_save_root = save_str + '/' + str(layer) + "/"
    if not os.path.exists(data_save_root):
        os.makedirs(data_save_root)

    # Randomly split indices into train / validation subsets.
    val_len = int(np.floor(validation_split * dataset_len))
    validation_idx = np.random.choice(indices, size=val_len, replace=False)
    train_idx = list(set(indices) - set(validation_idx))

    # Samplers for each phase based on the random indices.
    train_sampler = SubsetRandomSampler(train_idx)
    validation_sampler = SubsetRandomSampler(validation_idx)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=batch_size)
    validation_loader = torch.utils.data.DataLoader(
        dataset, sampler=validation_sampler, batch_size=batch_size)
    data_loaders = {"train": train_loader, "valid": validation_loader}
    data_lengths = {"train": len(train_idx), "valid": val_len}

    classifier = FeatureClassifier(layer, bottleneck).to(device)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    lr = 0.0001
    # BUGFIX: the learning rate was hard-coded a second time here; use `lr`
    # so the logged hyper-parameter cannot drift from the real one.
    optimizer = optim.Adam(classifier.parameters(), lr=lr)
    hparam_dict = {
        "Layer": layer,
        "batch size": batch_size,
        "Learning rate": lr
    }
    optimizer.zero_grad()
    writer.add_hparams(hparam_dict, {})

    total_it = 0
    for epoch in range(n_epochs):
        # Each epoch has a training and a validation phase.
        for phase in ['train', 'valid']:
            if phase == 'train':
                classifier.train(True)  # Set model to training mode
            else:
                # BUGFIX: validation previously ran with the model left in
                # training mode; switch to eval so dropout/batch-norm layers
                # behave deterministically.
                classifier.train(False)
                optimizer.zero_grad()
            running_loss = 0.0
            epoch_it = 0
            for image, label in data_loaders[phase]:
                classifier.zero_grad()
                optimizer.zero_grad()
                image = image.to(device)
                # Rescale pixel values from [0, 1] to [-1, 1].
                norm_image = (image - 0.5) * 2
                label = label.to(device)
                # Gradients are only needed during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    vec, x_prob = classifier(norm_image)
                    loss = criterion(x_prob, label)
                loss = loss.to(device)
                running_loss += loss.detach()
                if phase == 'train':
                    print("layer: " + str(layer) + ", epoch: " + str(epoch) +
                          ", step: " + str(epoch_it).zfill(6) +
                          ", training loss: " + str(float(loss)))
                    writer.add_scalar('data/train_loss_continous', loss,
                                      total_it)
                    loss.backward()
                    optimizer.step()
                    total_it += 1
                epoch_it += 1
            epoch_loss = running_loss / data_lengths[phase]
            if phase == 'train':
                print("Epoch: " + str(epoch).zfill(6) + ", train loss: " +
                      str(epoch_loss))
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
            if phase == 'valid':
                print("Epoch: " + str(epoch).zfill(6) + ", valid loss: " +
                      str(epoch_loss))
                writer.add_scalar('data/valid_loss_epoch', epoch_loss, epoch)
        # Periodic checkpoint every 10 epochs.
        if epoch % 10 == 0:
            torch.save(
                classifier.state_dict(), data_save_root + '/classifier' +
                str(layer) + '_' + str(epoch) + '.pt')
    torch.save(classifier.state_dict(),
               data_save_root + '/classifier' + str(layer) + '_final.pt')
    writer.export_scalars_to_json(data_save_root + "all_scalars.json")
    writer.close()
def train_model(args):
    """Train a TextAttack model, optionally with adversarial training.

    Loads and optionally filters/subsamples/augments the dataset, builds the
    optimizer (Adam for lstm/cnn, AdamW + linear warmup otherwise), trains
    with gradient accumulation and early stopping, periodically regenerating
    an adversarial training set when an attack is configured, then re-loads
    the best weights from disk, reports their eval score, and saves the
    tokenizer, README and final args under ``args.output_dir``.

    Args:
        args: parsed argparse Namespace of training hyper-parameters; mutated
            in place (``num_labels``, ``do_regression``, best-score fields).

    Raises:
        ValueError: if the number of texts and labels disagree.
    """
    # logger.warn is a deprecated alias of logger.warning.
    logger.warning(
        "WARNING: TextAttack's model training feature is in beta. Please report any issues on our Github page, https://github.com/QData/TextAttack/issues."
    )
    _make_directories(args.output_dir)
    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        import wandb
        wandb.init(sync_tensorboard=True)

    # Get list of text and list of label (integers) from disk.
    train_text, train_labels, eval_text, eval_labels = dataset_from_args(args)

    # Filter labels
    if args.allowed_labels:
        train_text, train_labels = _filter_labels(train_text, train_labels,
                                                  args.allowed_labels)
        eval_text, eval_labels = _filter_labels(eval_text, eval_labels,
                                                args.allowed_labels)

    if args.pct_dataset < 1.0:
        logger.info(f"Using {args.pct_dataset*100}% of the training set")
        (train_text, train_labels), _ = _train_val_split(
            train_text, train_labels, split_val=1.0 - args.pct_dataset)
    train_examples_len = len(train_text)

    # data augmentation
    augmenter = augmenter_from_args(args)
    if augmenter:
        logger.info(f"Augmenting {len(train_text)} samples with {augmenter}")
        train_text, train_labels = _data_augmentation(train_text,
                                                      train_labels, augmenter)

    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: ({sorted(label_set)})"
    )

    if isinstance(train_labels[0], float):
        # TODO come up with a more sophisticated scheme for knowing when to do regression
        logger.warning("Detected float labels. Doing regression.")
        args.num_labels = 1
        args.do_regression = True
    else:
        args.do_regression = False

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        # BUGFIX: message previously read "teste xamples".
        raise ValueError(
            f"Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    tokenizer = model_wrapper.tokenizer

    # We are adversarial training if the user specified an attack along with
    # the training args.
    attack_cls = attack_from_args(args)
    adversarial_training = attack_cls is not None

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        model.tokenizer = model.module.tokenizer
        logger.info("Using torch.nn.DataParallel.")
        logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn":

        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        # Weight decay on everything except biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)
        # NOTE(review): args.warmup_proportion is passed straight through as
        # num_warmup_steps; if it really is a proportion it should presumably
        # be scaled by num_train_optimization_steps — confirm semantics.
        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from tensorboardX import SummaryWriter
    tb_writer = SummaryWriter(args.output_dir)

    # Save original args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    if augmenter:
        logger.info(f"\tNum original examples = {train_examples_len}")
        logger.info(f"\tNum examples after augmentation = {len(train_text)}")
    else:
        logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        """Average (DataParallel) and scale (grad accum) the loss, backprop."""
        if num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    if args.do_regression:
        # TODO integrate with textattack `metrics` package
        loss_fct = torch.nn.MSELoss()
    else:
        loss_fct = torch.nn.CrossEntropyLoss()

    # NOTE(review): `device` is not defined in this function; it presumably
    # comes from module scope — confirm.
    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=False):
        if adversarial_training:
            if epoch >= args.num_clean_epochs:
                if (epoch - args.num_clean_epochs) % args.attack_period == 0:
                    # only generate a new adversarial training set every args.attack_period epochs
                    # after the clean epochs
                    logger.info(
                        "Attacking model to generate new training set...")
                    adv_train_text = _generate_adversarial_examples(
                        model, attack_cls, list(zip(train_text,
                                                    train_labels)))
                    train_dataloader = _make_dataloader(
                        tokenizer, adv_train_text, train_labels,
                        args.batch_size)
            else:
                logger.info(
                    f"Running clean epoch {epoch+1}/{args.num_clean_epochs}")

        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=1,
                             leave=False)
        for step, batch in enumerate(prog_bar):
            input_ids, labels = batch
            labels = labels.to(device)
            if isinstance(input_ids, dict):
                ## dataloader collates dict backwards. This is a workaround to get
                # ids in the right shape for HuggingFace models
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
                logits = model(**input_ids)[0]
            else:
                input_ids = input_ids.to(device)
                logits = model(input_ids)

            if args.do_regression:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate,
                                         global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            # Only step the optimizer once every grad_accum_steps batches.
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)
            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        if not adversarial_training or epoch >= args.num_clean_epochs:
            eval_score = _get_eval_score(model, eval_dataloader,
                                         args.do_regression)
            tb_writer.add_scalar("epoch_eval_score", eval_score, global_step)
            if args.checkpoint_every_epoch:
                # BUGFIX: previously passed args.global_step, which does not
                # exist on the argparse Namespace and raised AttributeError.
                _save_model_checkpoint(model, args.output_dir, global_step)
            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(
                    f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs > 0) and (
                        args.epochs_since_best_eval_score >
                        args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} steps since validation acc increased"
                    )
                    break

    # read the saved model and report its eval performance
    logger.info(
        "Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    logger.info(f"Wrote final training args to {args_save_path}.")
class CustomTensorBoardCallback(transformers.trainer_callback.TrainerCallback):
    """A :class:`~transformers.TrainerCallback` that mirrors training logs to
    `TensorBoard <https://www.tensorflow.org/tensorboard>`__.

    Args:
        tb_writer (:obj:`SummaryWriter`, `optional`):
            The writer to use. Will instantiate one if not set.
    """

    def __init__(self, tb_writer=None):
        self.tb_writer = tb_writer

    def _init_summary_writer(self, args, log_dir=None):
        # Default to the trainer's configured logging directory.
        self.tb_writer = SummaryWriter(log_dir=log_dir or args.logging_dir)

    def on_train_begin(self, args, state, control, **kwargs):
        # Only the main process writes TensorBoard events.
        if not state.is_world_process_zero:
            return
        tb_dir = None
        if state.is_hyper_param_search and state.trial_name is not None:
            # One sub-directory per hyper-parameter-search trial.
            tb_dir = os.path.join(args.logging_dir, state.trial_name)
        self._init_summary_writer(args, tb_dir)
        if self.tb_writer is None:
            return
        self.tb_writer.add_text("args", args.to_json_string())
        model = kwargs.get("model")
        if model is not None and getattr(model, "config", None) is not None:
            self.tb_writer.add_text("model_config",
                                    model.config.to_json_string())
        # Version of TensorBoard coming from tensorboardX does not have this method.
        if hasattr(self.tb_writer, "add_hparams"):
            self.tb_writer.add_hparams(args.to_sanitized_dict(),
                                       metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):
        logs = rewrite_logs(logs)
        logs.update(get_system_info())
        if not state.is_world_process_zero:
            return
        if self.tb_writer is None:
            self._init_summary_writer(args)
        if not self.tb_writer:
            return
        for key, value in logs.items():
            if isinstance(value, (int, float)):
                self.tb_writer.add_scalar(key, value, state.global_step)
            else:
                # Non-scalar values cannot go through add_scalar; drop them
                # with a warning instead of crashing the run.
                logger.warning(
                    "Trainer is attempting to log a value of "
                    '"%s" of type %s for key "%s" as a scalar. '
                    "This invocation of Tensorboard's writer.add_scalar() "
                    "is incorrect so we dropped this attribute.",
                    value,
                    type(value),
                    key,
                )
        self.tb_writer.flush()

    def on_train_end(self, args, state, control, **kwargs):
        # Release the writer's file handles at the end of training.
        if self.tb_writer:
            self.tb_writer.close()
class Logger(object):
    """Class that implements a logger of statistics.

    Parameters
    ----------
    name: str
        Name of logger. This create a folder at runs/`name'.
    comment: str, optional.
        This is useful to separate equivalent runs.
        The folder is runs/`name'/`comment_date'.
    tensorboard: bool, optional.
        Flag that indicates whether or not to save the results in the tensorboard.
    """

    def __init__(self, name, comment="", tensorboard=False):
        self.statistics = list()   # one dict per finished episode
        self.current = dict()      # key -> (count, running mean) for the open episode
        self.all = defaultdict(list)  # key -> every raw value ever recorded

        now = datetime.now()
        current_time = now.strftime("%b%d_%H-%M-%S")
        comment = comment + "_" + current_time if len(comment) else current_time
        log_dir = f"runs/{name}/{comment}"
        if tensorboard:
            self.writer = SummaryWriter(log_dir=log_dir)
            self.log_dir = self.writer.logdir
        else:
            self.writer = None
            self.log_dir = safe_make_dir(log_dir)
        self.episode = 0
        self.keys = set()

    def __len__(self):
        """Return the number of episodes."""
        return len(self.statistics)

    def __iter__(self):
        """Iterate over the episode statistics."""
        # Fix: __iter__ must return an iterator; returning the list itself
        # made `iter(logger)` raise "iter() returned non-iterator" TypeError.
        return iter(self.statistics)

    def __getitem__(self, index):
        """Return a specific episode."""
        return self.statistics[index]

    def __str__(self):
        """Return parameter string of logger."""
        str_ = ""
        for key in sorted(self.keys):
            values = self.get(key)
            str_ += " ".join(key.split("_")).title().ljust(17)
            str_ += f"Last: {values[-1]:.2g}".ljust(15)
            str_ += f"Avg: {np.mean(values):.2g}".ljust(15)
            str_ += f"MAvg: {np.mean(values[-10:]):.2g}".ljust(15)
            str_ += f"Range: ({np.min(values):.2g},{np.max(values):.2g})\n"
        return str_

    def get(self, key):
        """Return the statistics of a specific key.

        It collects all end-of-episode data stored in statistic and returns a list
        with such values.
        """
        return [statistic[key] for statistic in self.statistics if key in statistic]

    def update(self, **kwargs):
        """Update the statistics for the current episode.

        Parameters
        ----------
        kwargs: dict
            Any kwargs passed to update is converted to numpy and averaged
            over the course of an episode.
        """
        for key, value in kwargs.items():
            self.keys.add(key)
            if isinstance(value, torch.Tensor):
                value = value.detach().numpy()
            value = np.nan_to_num(value)
            # Collapse arrays / numpy scalars to plain Python numbers so the
            # stored statistics stay JSON-serializable.
            if isinstance(value, np.ndarray):
                value = float(np.mean(value))
            if isinstance(value, np.float32):
                value = float(value)
            if isinstance(value, np.int64):
                value = int(value)

            if key not in self.current:
                self.current[key] = (1, value)
            else:
                # Incremental (Welford-style) running mean within the episode.
                count, old_value = self.current[key]
                new_count = count + 1
                new_value = old_value + (value - old_value) * (1 / new_count)
                self.current[key] = (new_count, new_value)
            self.all[key].append(value)
            if self.writer is not None:
                self.writer.add_scalar(
                    f"episode_{self.episode}/{key}",
                    self.current[key][1],
                    global_step=self.current[key][0],
                )

    def end_episode(self, **kwargs):
        """Finalize collected data and add final fixed values.

        Parameters
        ----------
        kwargs : dict
            Any kwargs passed to end_episode overwrites tracked data if present.
            This can be used to store fixed values that are tracked per episode
            and do not need to be averaged.
        """
        data = {key: value[1] for key, value in self.current.items()}
        # kwargs is already a fresh dict; overriding values win over averages.
        data.update(kwargs)
        for key, value in data.items():
            self.keys.add(key)
            if isinstance(value, (float, int)):
                self.all[key].append(value)
                if self.writer is not None:
                    self.writer.add_scalar(
                        f"average/{key}", value, global_step=self.episode
                    )
        self.statistics.append(data)
        self.current = dict()
        self.episode += 1

    def save_hparams(self, hparams):
        """Save hparams to a json file."""
        with open(f"{self.log_dir}/hparams.json", "w") as f:
            json.dump(hparams, f)

    def export_to_json(self):
        """Save the statistics to a json file."""
        with open(f"{self.log_dir}/statistics.json", "w") as f:
            json.dump(self.statistics, f)
        with open(f"{self.log_dir}/all.json", "w") as f:
            json.dump(self.all, f)

    def load_from_json(self, log_dir=None):
        """Load the statistics from a json file."""
        log_dir = log_dir if log_dir is not None else self.log_dir
        with open(f"{log_dir}/statistics.json", "r") as f:
            self.statistics = json.load(f)
        with open(f"{log_dir}/all.json", "r") as f:
            self.all = json.load(f)
        for key in self.all.keys():
            self.keys.add(key)

    def log_hparams(self, hparams, metrics=None):
        """Log hyper parameters together with a metric dictionary."""
        if self.writer is None:
            # Do not save.
            return
        # Fix: tensorboardX's add_hparams iterates metric_dict; passing None
        # crashed. Default to an empty dict instead.
        if metrics is None:
            metrics = {}
        for k, v in hparams.items():
            if v is None:
                hparams[k] = 0
        self.writer.add_hparams(
            hparam_dict=hparams, metric_dict=metrics, name="hparams", global_step=1
        )

    def delete_directory(self):
        """Delete writer directory.

        Notes
        -----
        Use with caution. This will erase the directory, not the object.
        """
        shutil.rmtree(self.log_dir)

    def change_log_dir(self, new_log_dir):
        """Change log directory."""
        log_dir = f"runs/{new_log_dir}"
        try:
            self.delete_directory()
        except FileNotFoundError:
            pass
        if self.writer is not None:
            self.writer = SummaryWriter(log_dir=log_dir)
            self.log_dir = self.writer.logdir
        else:
            self.writer = None
            self.log_dir = safe_make_dir(log_dir)
        try:
            self.load_from_json()  # If json files in log_dir, then load them.
        except FileNotFoundError:
            pass
class Logger(BaseLogger):
    """Logger class that writes to tensorboardX."""

    def __init__(self, config: ConfigType):
        """Initialise the tensorboardX Logger.

        Args:
            config (ConfigType): config to initialise the tensorboardX logger.
                The config can have any parameters that
                tensorboardX.SummaryWriter() method accepts
                (https://tensorboardx.readthedocs.io/en/latest/tensorboard.html#tensorboardX.SummaryWriter).
                Note that the config is passed as keyword arguments to the
                tensorboardX.SummaryWriter() method. This provides a lot of
                flexibility to the users to configure tensorboard. This also
                means that config should not have any parameters that
                tensorboardX.SummaryWriter() would not accept.
        """
        super().__init__(config=config)
        key = "logdir"
        # Make sure the log directory exists before the writer opens it.
        if key in config and config[key] is not None:
            make_dir(config[key])
        self.summary_writer = SummaryWriter(**config)
        # Bookkeeping keys that should never be logged as scalars.
        self.keys_to_skip = ["logbook_id", "logbook_type", "logbook_timestamp"]

    def write(self, log: LogType) -> None:
        """Write the log to tensorboard.

        Args:
            log (LogType): Log to write
        """
        logbook_type = log["logbook_type"]
        if logbook_type == "metric":
            log = self._prepare_metric_log_to_write(log=log)
            self.write_metric(metric=log)
        else:
            if logbook_type == "config":
                self.write_config(config=log)
            # Only metric logs and message logs are supported right now

    def write_metric(self, metric: MetricType) -> None:
        """Write metric to tensorboard.

        Args:
            metric (MetricType): Metric to write
        """
        # Pop the bookkeeping fields so only scalar metrics remain.
        global_step = None
        if "global_step" in metric:
            global_step = metric.pop("global_step")
        walltime = None
        if "walltime" in metric:
            walltime = metric.pop("walltime")
        main_tag = ""
        if "tag" in metric:
            main_tag = str(metric.pop("tag")) + "/"
        elif "main_tag" in metric:
            main_tag = str(metric.pop("main_tag")) + "/"
        # NOTE(review): self.key_prefix is presumably set by BaseLogger — confirm.
        if self.key_prefix:
            # Fix: pop the prefix value directly. It was previously wrapped in
            # a set literal ({...}), so every tag rendered as "{'x'}_key".
            prefix = metric.pop(self.key_prefix)
            metric = {
                f"{prefix}_{key}": value for key, value in metric.items()
            }
        for key, value in metric.items():
            self.summary_writer.add_scalar(
                tag=f"{main_tag}{key}",
                scalar_value=value,
                global_step=global_step,
                walltime=walltime,
            )

    def write_config(self, config: ConfigType) -> None:
        """Write the config to tensorboard.

        Args:
            config (ConfigType): Config to write
        """
        name = None
        if "name" in config:
            name = config.pop("name")
        metric_dict: Dict[str, NumType] = {}
        if "metric_dict" in config:
            metric_dict = config.pop("metric_dict")
            metric_dict = self._prepare_metric_log_to_write(log=metric_dict)
        global_step = None
        if "global_step" in config:
            global_step = config.pop("global_step")
        config = self._prepare_log_to_write(log=config)
        # hparams cannot hold None; replace with the string "None".
        for key in config:
            if config[key] is None:
                config[key] = "None"
        self.summary_writer.add_hparams(
            hparam_dict=flatten_dict(config),
            metric_dict=metric_dict,
            name=name,
            global_step=global_step,
        )
# validation_logger = SummaryWriter(log_dir = os.path.join(args.save, 'validation'), comment = 'validation') log_subpath = f'log/' unified_logger = SummaryWriter( log_dir=os.path.join(args.save, log_subpath), comment=f'{args.model}_{args.optimizer}_{args.loss}') hyper_param_dict = { 'lr': args.optimizer_lr, 'bsize': args.batch_size, 'epochs': args.total_epochs, 'sched_frac': args.schedule_lr_fraction, 'sched_freq': args.schedule_lr_frequency, # 'eps': args.optimizer_eps } metric_param_dict = {} unified_logger.add_hparams(hyper_param_dict, metric_param_dict) # unified_logger.add_text('train/summary', args.model) # unified_logger.add_text('train/summary', args.optimizer) # unified_logger.add_text('train/summary', args.loss) # Dynamically load the optimizer with parameters passed in via "--optimizer_[param]=[value]" arguments with tools.TimerBlock("Initializing {} Optimizer".format( args.optimizer)) as block: kwargs = tools.kwargs_from_args(args, 'optimizer') if args.fp16: optimizer = args.optimizer_class( [p for p in param_copy if p.requires_grad], **kwargs) else: optimizer = args.optimizer_class( [p for p in model_and_loss.parameters() if p.requires_grad], **kwargs) for param, default in list(kwargs.items()):
def model_train_dev(self,
                    train_dataset: Dataset,
                    dev_dataset: Dataset,
                    model_dir: str,
                    epoch_dev_eval: bool = False,
                    **kwargs):
    """Two-stage training loop (source loader then target loader) with optional dev evaluation.

    NOTE: this is a *generator* method: after each dev evaluation it yields the
    concatenated dev predictions (``yield results_dev``) and expects the caller
    to ``send()`` back a dict of computed metrics (``metrics_dev``).

    Parameters
    ----------
    train_dataset:
        Pair ``(source_dataset, target_dataset)``; epochs <= ``epoch_s1`` train
        on the source loader, the remaining epochs on the target loader.
    dev_dataset:
        Dataset evaluated every ``eval_epoch_freq`` epochs when
        ``epoch_dev_eval`` is True.
    model_dir:
        Directory for checkpoints; TensorBoard events go to ``model_dir/log``.
    epoch_dev_eval:
        Enable periodic dev-set evaluation.
    kwargs:
        ``num_workers`` (default 8) for the training DataLoaders.
    """
    log_dir = os.path.join(model_dir, 'log')
    # Fresh run (epoch 0): wipe any stale TensorBoard logs from earlier runs.
    if self.cur_epoch == 0:
        shutil.rmtree(log_dir, ignore_errors=True)
    writer = SummaryWriter(log_dir, flush_secs=60)
    num_workers = kwargs.get('num_workers', 8)
    dt_train_src, dt_train_tgt = train_dataset
    dl_train_src = DataLoader(dt_train_src,
                              batch_size=self.batch_size,
                              shuffle=True,
                              num_workers=num_workers)
    dl_train_tgt = DataLoader(dt_train_tgt,
                              batch_size=self.batch_size,
                              shuffle=True,
                              num_workers=num_workers)
    dev_eval_dl = DataLoader(dev_dataset,
                             batch_size=self.batch_size,
                             shuffle=False,
                             num_workers=4)
    self.core_model.train()
    # Resume from cur_epoch + 1; total schedule is epoch_s1 + epoch_s2 epochs.
    for epoch in range(self.cur_epoch + 1, self.epoch_s1 + self.epoch_s2 + 1):
        epoch_start_time = time.time()
        self.cur_epoch = epoch
        running_losses = []
        # Stage 1: source-domain batches; Stage 2: target-domain batches.
        if epoch <= self.epoch_s1:
            for batch_data in dl_train_src:
                losses = self.core_model.batch_fit(batch=batch_data,
                                                   epoch=epoch)
                running_losses.append(losses)
        else:
            for batch_data in dl_train_tgt:
                losses = self.core_model.batch_fit(batch=batch_data,
                                                   epoch=epoch)
                running_losses.append(losses)
        # Average each named loss over the epoch's batches.
        show_loss = pd.DataFrame(running_losses).mean().to_dict()
        # NOTE(review): mixes an f-string with %-formatting; works, but assumes
        # self.epochs is the total epoch count — confirm.
        print(f"[epoch: {epoch:03d}/{self.epochs}, %s" % (', '.join(
            [f'{loss}: {value:.5f}' for loss, value in show_loss.items()])))
        for loss_name, loss in show_loss.items():
            writer.add_scalar(tag=f'loss_train/{loss_name}',
                              scalar_value=loss,
                              global_step=epoch)
        running_losses = pd.DataFrame(running_losses).mean().to_dict()
        if epoch_dev_eval and self.cur_epoch % self.eval_epoch_freq == 0:
            # Dev evaluation pass in eval mode / no-grad.
            results_dev = []
            self.core_model.eval()
            with torch.no_grad():
                for batch_data in dev_eval_dl:
                    batch_result = self.core_model.batch_predict(
                        batch_data)
                    results_dev.append(batch_result)
            self.core_model.train()
            results_dev = pd.DataFrame(results_dev).to_dict(orient='list')
            results_dev = {
                k: np.concatenate(v)
                for k, v in results_dev.items()
            }
            # Hand predictions to the caller; receive the metric dict back.
            metrics_dev = yield results_dev
            print(
                f"\t dev performance of epoch {epoch}: {[f'{k}:{v:.3f}' for k, v in metrics_dev.items()]}"
            )
            writer.add_hparams(hparam_dict={'set': 'val'},
                               metric_dict={
                                   'metric/' + k: v
                                   for k, v in metrics_dev.items()
                               },
                               name='metric_val',
                               global_step=epoch)
        else:
            metrics_dev = {}
        self.core_model.after_one_epoch(writer=writer,
                                        epoch=epoch,
                                        losses=running_losses,
                                        metrics=metrics_dev)
        if self.cur_epoch % self.save_epoch_freq == 0:
            self.save_model(model_dir=model_dir, epoch=self.cur_epoch)
        print('\t Epoch: %03d Time Taken: %d sec' %
              (self.cur_epoch, time.time() - epoch_start_time))
    writer.close()
def train_model(
        name="",
        resume="",
        base_dir=utils.BASE_DIR,
        model_name="v0",
        chosen_diseases=None,
        n_epochs=10,
        batch_size=4,
        oversample=False,
        max_os=None,
        shuffle=False,
        opt="sgd",
        opt_params=None,
        loss_name="wbce",
        loss_params=None,
        train_resnet=False,
        log_metrics=None,
        flush_secs=120,
        train_max_images=None,
        val_max_images=None,
        test_max_images=None,
        experiment_mode="debug",
        save=True,
        save_cms=True,  # Note that in this case, save_cms (to disk) includes write_cms (to TB)
        write_graph=False,
        write_emb=False,
        write_emb_img=False,
        write_img=False,
        image_format="RGB",
        multiple_gpu=False,
):
    """Train (or resume) a disease-classification model and log everything to TensorBoard.

    Builds train/val data, creates or resumes a model+optimizer, runs ignite
    trainer/validator engines with per-disease metrics, then optionally writes
    hparams, graph, embeddings, confusion matrices and images before running
    a post-train test evaluation.

    Returns
    -------
    ModelRun
        Trained model plus run metadata (debug data attached in "debug" mode).
    """
    # Fix: opt_params / loss_params used mutable default arguments ({}),
    # shared across calls. Normalize None to a fresh dict instead.
    opt_params = {} if opt_params is None else opt_params
    loss_params = {} if loss_params is None else loss_params

    # Choose GPU
    device = utilsT.get_torch_device()
    print("Using device: ", device)

    # Common folders
    dataset_dir = os.path.join(base_dir, "dataset")

    # Dataset handling
    print("Loading train dataset...")
    train_dataset, train_dataloader = utilsT.prepare_data(
        dataset_dir,
        "train",
        chosen_diseases,
        batch_size,
        oversample=oversample,
        max_os=max_os,
        shuffle=shuffle,
        max_images=train_max_images,
        image_format=image_format,
    )
    train_samples, _ = train_dataset.size()

    print("Loading val dataset...")
    val_dataset, val_dataloader = utilsT.prepare_data(
        dataset_dir,
        "val",
        chosen_diseases,
        batch_size,
        max_images=val_max_images,
        image_format=image_format,
    )
    val_samples, _ = val_dataset.size()

    # Should be the same than chosen_diseases
    chosen_diseases = list(train_dataset.classes)
    print("Chosen diseases: ", chosen_diseases)

    if resume:
        # Load model and optimizer
        model, model_name, optimizer, opt, loss_name, loss_params, chosen_diseases = models.load_model(
            base_dir, resume, experiment_mode="", device=device)
        model.train(True)
    else:
        # Create model
        model = models.init_empty_model(model_name,
                                        chosen_diseases,
                                        train_resnet=train_resnet).to(device)
        # Create optimizer
        OptClass = optimizers.get_optimizer_class(opt)
        optimizer = OptClass(model.parameters(), **opt_params)
        # print("OPT: ", opt_params)

    # Allow multiple GPUs
    if multiple_gpu:
        model = DataParallel(model)

    # Tensorboard log options: run name encodes timestamp, user name and diseases.
    run_name = utils.get_timestamp()
    if name:
        run_name += "_{}".format(name)
    if len(chosen_diseases) == 1:
        run_name += "_{}".format(chosen_diseases[0])
    elif len(chosen_diseases) == 14:
        run_name += "_all"

    log_dir = get_log_dir(base_dir, run_name, experiment_mode=experiment_mode)

    print("Run name: ", run_name)
    print("Saved TB in: ", log_dir)

    writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    # Create validator engine
    validator = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           False))

    val_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    val_loss.attach(validator, loss_name)

    utilsT.attach_metrics(validator, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(validator, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(validator, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(validator, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    # Create trainer engine
    trainer = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           True))

    train_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    train_loss.attach(trainer, loss_name)

    utilsT.attach_metrics(trainer, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 step=Events.EPOCH_COMPLETED)

    # TODO: Early stopping
    # def score_function(engine):
    #     val_loss = engine.state.metrics[loss_name]
    #     return -val_loss
    # handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    # validator.add_event_handler(Events.COMPLETED, handler)

    # Metrics callbacks
    if log_metrics is None:
        log_metrics = list(ALL_METRICS)

    def _write_metrics(run_type, metrics, epoch, wall_time):
        # Write the loss plus every (metric, disease) pair for one engine run.
        loss = metrics.get(loss_name, 0)
        writer.add_scalar("Loss/" + run_type, loss, epoch, wall_time)
        for metric_base_name in log_metrics:
            for disease in chosen_diseases:
                metric_value = metrics.get(
                    "{}_{}".format(metric_base_name, disease), -1)
                writer.add_scalar(
                    "{}_{}/{}".format(metric_base_name, disease, run_type),
                    metric_value, epoch, wall_time)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_write_metrics(trainer):
        # End-of-epoch hook: run validation, then log train + val metrics.
        epoch = trainer.state.epoch
        max_epochs = trainer.state.max_epochs

        # Run on evaluation
        validator.run(val_dataloader, 1)

        # Common time
        wall_time = time.time()

        # Log all metrics to TB
        _write_metrics("train", trainer.state.metrics, epoch, wall_time)
        _write_metrics("val", validator.state.metrics, epoch, wall_time)

        train_loss = trainer.state.metrics.get(loss_name, 0)
        val_loss = validator.state.metrics.get(loss_name, 0)

        tb_write_histogram(writer, model, epoch, wall_time)

        print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})".
              format(epoch, max_epochs, train_loss, val_loss,
                     utils.duration_to_str(int(timer._elapsed()))))

    # Hparam dict
    hparam_dict = {
        "resume": resume,
        "n_diseases": len(chosen_diseases),
        "diseases": ",".join(chosen_diseases),
        "n_epochs": n_epochs,
        "batch_size": batch_size,
        "shuffle": shuffle,
        "model_name": model_name,
        "opt": opt,
        "loss": loss_name,
        "samples (train, val)": "{},{}".format(train_samples, val_samples),
        "train_resnet": train_resnet,
        "multiple_gpu": multiple_gpu,
    }

    def copy_params(params_dict, base_name):
        # Flatten a params dict into hparam_dict with a "<base>_<key>" prefix.
        for name, value in params_dict.items():
            hparam_dict["{}_{}".format(base_name, name)] = value

    copy_params(loss_params, "loss")
    copy_params(opt_params, "opt")
    print("HPARAM: ", hparam_dict)

    # Train
    print("-" * 50)
    print("Training...")
    trainer.run(train_dataloader, n_epochs)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Average time per epoch: ", duration_per_epoch)
    print("-" * 50)

    ## Write all hparams
    hparam_dict["duration_per_epoch"] = duration_per_epoch

    # FIXME: this is commented to avoid having too many hparams in TB frontend
    # metrics
    # def copy_metrics(engine, engine_name):
    #     for metric_name, metric_value in engine.state.metrics.items():
    #         hparam_dict["{}_{}".format(engine_name, metric_name)] = metric_value
    # copy_metrics(trainer, "train")
    # copy_metrics(validator, "val")

    print("Writing TB hparams")
    writer.add_hparams(hparam_dict, {})

    # Save model to disk
    if save:
        print("Saving model...")
        models.save_model(base_dir, run_name, model_name, experiment_mode,
                          hparam_dict, trainer, model, optimizer)

    # Write graph to TB
    if write_graph:
        print("Writing TB graph...")
        tb_write_graph(writer, model, train_dataloader, device)

    # Write embeddings to TB
    if write_emb:
        print("Writing TB embeddings...")
        image_size = 256 if write_emb_img else 0

        # FIXME: be able to select images (balanced, train vs val, etc)
        image_list = list(train_dataset.label_index["FileName"])[:1000]

        # disease = chosen_diseases[0]
        # positive = train_dataset.label_index[train_dataset.label_index[disease] == 1]
        # negative = train_dataset.label_index[train_dataset.label_index[disease] == 0]
        # positive_images = list(positive["FileName"])[:25]
        # negative_images = list(negative["FileName"])[:25]
        # image_list = positive_images + negative_images

        all_images, all_embeddings, all_predictions, all_ground_truths = gen_embeddings(
            model,
            train_dataset,
            device,
            image_list=image_list,
            image_size=image_size)
        tb_write_embeddings(
            writer,
            chosen_diseases,
            all_images,
            all_embeddings,
            all_predictions,
            all_ground_truths,
            global_step=n_epochs,
            use_images=write_emb_img,
            tag="1000_{}".format("img" if write_emb_img else "no_img"),
        )

    # Save confusion matrices (is expensive to calculate them afterwards)
    if save_cms:
        print("Saving confusion matrices...")
        # Assure folder
        cms_dir = os.path.join(base_dir, "cms", experiment_mode)
        os.makedirs(cms_dir, exist_ok=True)
        base_fname = os.path.join(cms_dir, run_name)

        n_diseases = len(chosen_diseases)

        def extract_cms(metrics):
            """Extract confusion matrices from a metrics dict."""
            cms = []
            for disease in chosen_diseases:
                key = "cm_" + disease
                if key not in metrics:
                    # Missing metric: store a sentinel CM of -1s.
                    cm = np.array([[-1, -1], [-1, -1]])
                else:
                    cm = metrics[key].numpy()
                cms.append(cm)
            return np.array(cms)

        # Train confusion matrix
        train_cms = extract_cms(trainer.state.metrics)
        np.save(base_fname + "_train", train_cms)
        tb_write_cms(writer, "train", chosen_diseases, train_cms)

        # Validation confusion matrix
        val_cms = extract_cms(validator.state.metrics)
        np.save(base_fname + "_val", val_cms)
        tb_write_cms(writer, "val", chosen_diseases, val_cms)

        # All confusion matrix (train + val)
        all_cms = train_cms + val_cms
        np.save(base_fname + "_all", all_cms)

        # Print to console
        if len(chosen_diseases) == 1:
            print("Train CM: ")
            print(train_cms[0])
            print("Val CM: ")
            print(val_cms[0])
            # print("Train CM 2: ")
            # print(trainer.state.metrics["cm_" + chosen_diseases[0]])
            # print("Val CM 2: ")
            # print(validator.state.metrics["cm_" + chosen_diseases[0]])

    if write_img:
        # NOTE: this option is not recommended, use Testing notebook to plot and analyze images
        print("Writing images to TB...")

        test_dataset, test_dataloader = utilsT.prepare_data(
            dataset_dir,
            "test",
            chosen_diseases,
            batch_size,
            max_images=test_max_images,
        )

        # TODO: add a way to select images?
        # image_list = list(test_dataset.label_index["FileName"])[:3]

        # Examples in test_dataset (with bboxes available):
        image_list = [
            # "00010277_000.png", # (Effusion, Infiltrate, Mass, Pneumonia)
            # "00018427_004.png", # (Atelectasis, Effusion, Mass)
            # "00021703_001.png", # (Atelectasis, Effusion, Infiltrate)
            # "00028640_008.png", # (Effusion, Infiltrate)
            # "00019124_104.png", # (Pneumothorax)
            # "00019124_090.png", # (Nodule)
            # "00020318_007.png", # (Pneumothorax)
            "00000003_000.png",  # (0)
            # "00000003_001.png", # (0)
            # "00000003_002.png", # (0)
            "00000732_005.png",  # (Cardiomegaly, Pneumothorax)
            # "00012261_001.png", # (Cardiomegaly, Pneumonia)
            # "00013249_033.png", # (Cardiomegaly, Pneumonia)
            # "00029808_003.png", # (Cardiomegaly, Pneumonia)
            # "00022215_012.png", # (Cardiomegaly, Pneumonia)
            # "00011402_007.png", # (Cardiomegaly, Pneumonia)
            # "00019018_007.png", # (Cardiomegaly, Infiltrate)
            # "00021009_001.png", # (Cardiomegaly, Infiltrate)
            # "00013670_151.png", # (Cardiomegaly, Infiltrate)
            # "00005066_030.png", # (Cardiomegaly, Infiltrate, Effusion)
            "00012288_000.png",  # (Cardiomegaly)
            "00008399_007.png",  # (Cardiomegaly)
            "00005532_000.png",  # (Cardiomegaly)
            "00005532_014.png",  # (Cardiomegaly)
            "00005532_016.png",  # (Cardiomegaly)
            "00005827_000.png",  # (Cardiomegaly)
            # "00006912_007.png", # (Cardiomegaly)
            # "00007037_000.png", # (Cardiomegaly)
            # "00007043_000.png", # (Cardiomegaly)
            # "00012741_004.png", # (Cardiomegaly)
            # "00007551_020.png", # (Cardiomegaly)
            # "00007735_040.png", # (Cardiomegaly)
            # "00008339_010.png", # (Cardiomegaly)
            # "00008365_000.png", # (Cardiomegaly)
            # "00012686_003.png", # (Cardiomegaly)
        ]

        tb_write_images(writer, model, test_dataset, chosen_diseases,
                        n_epochs, device, image_list)

    # Close TB writer
    if experiment_mode != "debug":
        writer.close()

    # Run post_train
    print("-" * 50)
    print("Running post_train...")

    print("Loading test dataset...")
    test_dataset, test_dataloader = utilsT.prepare_data(
        dataset_dir,
        "test",
        chosen_diseases,
        batch_size,
        max_images=test_max_images)

    save_cms_with_names(run_name, experiment_mode, model, test_dataset,
                        test_dataloader, chosen_diseases)
    evaluate_model(run_name,
                   model,
                   optimizer,
                   device,
                   loss_name,
                   loss_params,
                   chosen_diseases,
                   test_dataloader,
                   experiment_mode=experiment_mode,
                   base_dir=base_dir)

    # Return values for debugging
    model_run = ModelRun(model, run_name, model_name, chosen_diseases)
    if experiment_mode == "debug":
        model_run.save_debug_data(writer, trainer, validator, train_dataset,
                                  train_dataloader, val_dataset,
                                  val_dataloader)

    return model_run
# ----------------------------------------Written by Luc Hayward------------------------------------------------------ # # Load the best saved model. model_load(args.save) print('Loaded best saved model') # Added final evaluation on the validation and test sets using best saved model (regardless of the effect of # over-fitting the training data, will reload the last best validation model). # Logs the validation score to the hparams tensorboard log to allow for easy comparisons of the different parameter # tuning experiments. Test values specifically not logged to prevent tuning on test results. if args.log_hparams_only: stored_loss = evaluate(val_data, eval_batch_size) writer.add_hparams( args.__dict__, { 'hparam/val_loss': stored_loss, 'hparam/val_bpc': stored_loss / math.log(2) / corpus.dictionary.avg_characters_per_token.get('valid') }) print("Evaluating on test data...") # Run on test data. test_loss = evaluate(test_data, test_batch_size) print('=' * 89) print( '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}' .format( test_loss, math.exp(test_loss), test_loss / math.log(2) / corpus.dictionary.avg_characters_per_token.get('test')))
class Trainer:
    """Training harness for either a transformer or an LSTM-style classifier.

    Wraps model construction, per-phase epoch loops (train/val/test),
    TensorBoard scalar + hparam logging, best-F1 checkpointing and
    prediction export.
    """

    def __init__(self, exp_name, model_name, dls, hp, bs, sched=False):
        # hp: hyper-parameter dict with keys "model", "loss", "epochs",
        # "opt", "lr" (and "wd" for non-ADAM optimizers).
        self.device = torch.device("cuda" if IS_CUDA else "cpu")
        self.model = all_models[hp["model"]](bs).to(self.device)
        self.loss = all_loss[hp["loss"]]
        self.epochs = hp["epochs"]
        self.writer = SummaryWriter(
            os.path.join(LOG_DIR, exp_name, model_name))
        self.exp_name = exp_name
        self.hp = hp
        # metrics[phase] = [losses, accuracies, f1-scores] per epoch.
        self.metrics = {}
        for p in ["train", "val", "test"]:
            self.metrics[p] = [[], [], []]
        self.dls = dls  # dataloaders indexed by phase: 0=train, 1=val, 2=test
        self.steps = [0] * 3  # global step counter per phase
        self.batch_size = dls[0].batch_size
        self.model_name = model_name
        opt = all_opt[hp["opt"]]
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        if hp["opt"] == "ADAM":
            self.opt = opt(params=parameters, lr=hp["lr"])
        else:
            self.opt = opt(
                params=parameters,
                lr=hp["lr"],
                momentum=0.9,
                weight_decay=hp["wd"],
            )
        # gamma=1 disables decay when sched is False.
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.opt, step_size=2, gamma=0.1 if sched else 1
        )
        # Transformer datasets expose a tokenizer; used to pick the input path.
        self.isTransformer = dls[0].dataset.tokenizer is not None

    def anEpoch(self, phaseIndex, toLog=True):
        """Run one epoch over dls[phaseIndex]; trains only when phaseIndex == 0.

        Returns (allPreds, allLabels) as concatenated CPU tensors and appends
        epoch loss/acc/f1 to self.metrics.
        """
        phaseName = PHASES[phaseIndex]
        losses = []
        acc_count = 0
        allPreds, allLabels = [], []
        # we use tqdm to provide visual feedback on training stage
        for xb, yb in tqdm(self.dls[phaseIndex],
                           total=len(self.dls[phaseIndex])):
            if self.isTransformer:
                inputIds, mask = xb
                yb = yb.to(self.device)
                # HF-style model returns (loss, logits) when labels are given.
                outputs = self.model(
                    inputIds.to(self.device),
                    attention_mask=mask.to(self.device),
                    labels=yb,
                )
                loss = outputs[0]
                output = outputs[1]
                # NOTE(review): these detach().cpu() calls discard their
                # results — they do not move the original tensors; confirm intent.
                inputIds.detach().cpu()
                mask.detach().cpu()
                yb.detach().cpu()
            else:
                # BATCH_SIZE, 3, 224, 224
                xb = (xb[0].to(self.device), xb[1].cpu())
                yb = yb.to(self.device)  # BATCH_SIZE, 1
                output = self.model(xb)  # BATCH_SIZE, 3
                loss = self.loss(output, yb)
                xb[0].detach().cpu()
                yb.detach().cpu()
            allPreds.append(torch.argmax(output, dim=1).cpu())
            allLabels.append(yb.cpu())
            acc_count += accuracy(output, yb)
            # NOTE(review): loss is appended un-detached, so every batch's
            # autograd graph stays alive until torch.stack below — memory heavy.
            losses.append(loss)
            self.steps[phaseIndex] += 1
            if toLog:
                self._log("{}_loss".format(phaseName), loss,
                          self.steps[phaseIndex])
            if phaseIndex == 0:
                self.opt.zero_grad()
                loss.backward()  # calculates gradient descent
                self.opt.step()  # updates model parameters
        allPreds = torch.cat(allPreds)
        allLabels = torch.cat(allLabels)
        f1Score = skMetrics.f1_score(
            allLabels.cpu(), allPreds.cpu(), average="macro")
        losses = torch.stack(losses)
        epoch_loss = losses.mean().item()
        # NOTE(review): assumes every batch is full (len * batch_size samples).
        epoch_acc = acc_count / len(self.dls[phaseIndex]) / self.batch_size
        self.metrics[phaseName][0].append(epoch_loss)
        self.metrics[phaseName][1].append(epoch_acc)
        self.metrics[phaseName][2].append(f1Score)
        print(
            "\nepoch {} info: loss:{}, acc:{}, f1Score:{}".format(
                phaseName, epoch_loss, epoch_acc, f1Score
            )
        )
        return allPreds, allLabels

    def topKLoss(self, phaseIndex, k):
        """Return torch.topk of per-sample cross-entropy over one phase (no grad)."""
        lossValues = []
        # we use tqdm to provide visual feedback on training stage
        with torch.no_grad():
            for xb, yb in tqdm(self.dls[phaseIndex],
                               total=len(self.dls[phaseIndex])):
                if self.isTransformer:
                    inputIds, mask = xb
                    yb = yb.to(self.device)
                    outputs = self.model(
                        inputIds.to(self.device),
                        attention_mask=mask.to(self.device),
                        labels=yb,
                    )
                    output = outputs[1]
                    inputIds.detach().cpu()
                    mask.detach().cpu()
                    yb.detach().cpu()
                else:
                    xb = xb.to(self.device)  # BATCH_SIZE, 3, 224, 224
                    yb = yb.to(self.device)  # BATCH_SIZE, 1
                    output = self.model(xb)  # BATCH_SIZE, 3
                    xb.detach().cpu()
                    yb.detach().cpu()
                # reduction='none' keeps one loss value per sample.
                lossValues.append(cross_entropy(
                    output, yb, reduction='none').cpu())
        lossValues = torch.cat(lossValues)
        return torch.topk(lossValues, k=k)

    def one_cycle(self):
        """Full training run: train+val each epoch, reload best weights, test, log hparams."""
        # self.freeze()
        for i in range(self.epochs):
            print("epoch number: {}".format(i))
            self.model.train()
            self.anEpoch(0)
            with torch.no_grad():
                self.model.eval()
                self.anEpoch(1)
            self.scheduler.step()
            self._save_weights()  # checkpoints only on best val F1
        self.load_weights(self.model_name + ".pkl")
        if len(self.dls) > 2 and len(self.dls[2]) > 0:
            with torch.no_grad():
                self.model.eval()
                self.anEpoch(2)
        metrics = {}
        for i in range(3):
            metrics.update(self.getMetrics(i))
        self._write_hp(metrics)  # for comparing between experiments

    def freeze(self, toTrain=False):
        """Freeze (or unfreeze, toTrain=True) the backbone layers."""
        if self.isTransformer:
            for param in self.model.base_model.parameters():
                param.requires_grad = toTrain
            return
        for p in self.model.embedding.parameters():
            p.requires_grad = toTrain
        for p in self.model.lstm.parameters():
            p.requires_grad = toTrain

    def getMetrics(self, type):
        """Return {phase_loss/acc/f1score: best value} for one phase index."""
        phases = ["train", "val", "test"]
        phase = phases[type]
        phaseMetrics = self.metrics[phases[type]]
        # Best-so-far: min loss, max accuracy, max F1.
        metricValues = [min(phaseMetrics[0]), max(
            phaseMetrics[1]), max(phaseMetrics[2])]
        metrics = {}
        for i, metricName in enumerate(["loss", "acc", "f1score"]):
            metricName = f"{phase}_{metricName}"
            metrics[metricName] = metricValues[i]
        return metrics

    def _log(self, phase, value, i):
        # Scalar TensorBoard logging helper.
        self.writer.add_scalar(tag=phase, scalar_value=value, global_step=i)

    def _write_hp(self, metrics):
        # Record the hyper-parameters and final metrics for run comparison.
        self.writer.add_hparams(self.hp, metrics)

    def setLR(self, lr):
        # Override learning rate on the first parameter group.
        self.opt.param_groups[0]['lr'] = lr

    def load_weights(self, pkl_name, num_classes=None, family=None):
        # NOTE(review): num_classes and family are unused here — confirm API intent.
        weights_path = os.path.join(WEIGHTS_DIR, self.exp_name, pkl_name)
        sd = torch.load(weights_path)
        self.model.load_state_dict(sd, strict=False)
        self.model.to(self.device)

    def _save_weights(self):
        """Persist model weights when the latest val F1 is the best so far."""
        bestF1Score = max(self.metrics["val"][-1])
        if self.metrics["val"][-1][-1] == bestF1Score:
            weights_path = os.path.join(
                WEIGHTS_DIR, self.exp_name, self.model_name + ".pkl"
            )
            os.makedirs(os.path.join(
                WEIGHTS_DIR, self.exp_name), exist_ok=True)
            # Move to CPU so the checkpoint loads on CPU-only machines.
            self.model.cpu()
            state = self.model.state_dict()
            torch.save(state, weights_path)  # open(pkl), compress
            self.model.to(self.device)

    def getPreds(self, phaseIdx, toSave=False):
        """Predict over one phase; optionally write a CSV with predictions."""
        with torch.no_grad():
            preds, _ = self.anEpoch(phaseIdx, toLog=False)
        if not toSave:
            return preds
        dfCopy = self.dls[phaseIdx].dataset.getDF()
        # Pad with -1 when the loader dropped a trailing partial batch.
        if len(preds) < len(dfCopy):
            extra = len(dfCopy) - len(preds)
            preds = torch.cat([preds, torch.tensor([-1] * extra)])
        predCategories = list(map(lambda l: CATEGORY_SUBSET[l], preds.numpy()))
        dfCopy[PRED_COL] = predCategories
        dfCopy["correct"] = dfCopy[PRED_COL] == dfCopy[Y_COL]
        csvPath = os.path.join(
            PREDS_DIR, f"{self.model_name}_{PHASES[phaseIdx]}_preds.csv")
        dfCopy.to_csv(csvPath, index=False)
        return preds
class D3RLPyLogger:
    """Experiment logger that mirrors metrics to CSV files, stdout and
    (optionally) TensorBoard, d3rlpy-style.

    Metrics are accumulated per epoch via :meth:`add_metric` and flushed by
    :meth:`commit`, which averages each buffer, writes one CSV row per metric,
    and emits TensorBoard scalars/hparams.
    """

    # Declared attribute types (populated in __init__).
    _experiment_name: str
    _logdir: str
    _save_metrics: bool
    _verbose: bool
    _metrics_buffer: Dict[str, List[float]]
    _params: Optional[Dict[str, float]]
    _writer: Optional[SummaryWriter]

    def __init__(
        self,
        experiment_name: str,
        save_metrics: bool = True,
        root_dir: str = "logs",
        verbose: bool = True,
        tensorboard: bool = True,
        with_timestamp: bool = True,
    ):
        """Create the log directory (and TensorBoard run dir) for one experiment.

        :param experiment_name: base name; a timestamp suffix is appended when
            ``with_timestamp`` is True.
        :param save_metrics: when False, nothing is written to disk.
        :param root_dir: parent directory for per-experiment log folders.
        :param verbose: echo params/metrics to stdout.
        :param tensorboard: also create a ``runs/<experiment>`` SummaryWriter.
        :param with_timestamp: retry with a fresh timestamp if the directory
            already exists; otherwise an existing directory is an error.
        :raises ValueError: if the directory exists and ``with_timestamp`` is False.
        """
        self._save_metrics = save_metrics
        self._verbose = verbose

        # add timestamp to prevent unintentional overwrites
        while True:
            if with_timestamp:
                date = datetime.now().strftime("%Y%m%d%H%M%S")
                self._experiment_name = experiment_name + "_" + date
            else:
                self._experiment_name = experiment_name

            if self._save_metrics:
                self._logdir = os.path.join(root_dir, self._experiment_name)
                if not os.path.exists(self._logdir):
                    os.makedirs(self._logdir)
                    break
                # Directory collision: wait a second so the next timestamp
                # differs, or fail hard when timestamps are disabled.
                if with_timestamp:
                    time.sleep(1.0)
                else:
                    raise ValueError("%s already exists." % self._logdir)
            else:
                break

        self._metrics_buffer = {}

        if tensorboard:
            # NOTE: `logdir=` keyword suggests the tensorboardX SummaryWriter
            # (torch's writer uses `log_dir=`) — confirm against imports.
            tfboard_path = os.path.join("runs", self._experiment_name)
            self._writer = SummaryWriter(logdir=tfboard_path)
        else:
            self._writer = None

        self._params = None

    def add_params(self, params: Dict[str, Any]) -> None:
        """Record hyperparameters once: dump them to params.json, optionally
        echo them, and keep the scalar subset for TensorBoard HParams."""
        assert self._params is None, "add_params can be called only once."
        if self._save_metrics:
            # save dictionary as json file
            with open(os.path.join(self._logdir, "params.json"), "w") as f:
                json_str = json.dumps(params, default=default_json_encoder, indent=2)
                f.write(json_str)
        if self._verbose:
            for key, val in params.items():
                print("{}={}".format(key, val))
        # remove non-scaler values for HParams
        self._params = {k: v for k, v in params.items() if np.isscalar(v)}

    def add_metric(self, name: str, value: float) -> None:
        """Append one observation of *name* to the current epoch's buffer."""
        if name not in self._metrics_buffer:
            self._metrics_buffer[name] = []
        self._metrics_buffer[name].append(value)

    def commit(self, epoch: int, step: int) -> None:
        """Flush the metric buffers: average each metric, append a CSV row,
        emit TensorBoard scalars (and hparams if params were registered),
        then reset the buffers for the next epoch."""
        metrics = {}
        for name, buffer in self._metrics_buffer.items():
            metric = sum(buffer) / len(buffer)

            if self._save_metrics:
                # One CSV file per metric; rows are "epoch,step,value".
                with open(os.path.join(self._logdir, name + ".csv"), "a") as f:
                    print("%d,%d,%f" % (epoch, step, metric), file=f)

            if self._verbose:
                print("epoch=%d step=%d %s=%f" % (epoch, step, name, metric))

            if self._writer:
                self._writer.add_scalar("metrics/" + name, metric, epoch)

            metrics[name] = metric

        if self._params and self._writer:
            self._writer.add_hparams(
                self._params,
                metrics,
                name=self._experiment_name,
                global_step=epoch,
            )

        # initialize metrics buffer
        self._metrics_buffer = {}

    def save_model(self, epoch: int, algo: _SaveProtocol) -> None:
        """Ask *algo* to serialize itself as model_<epoch>.pt (no-op when
        metric saving is disabled)."""
        if self._save_metrics:
            # save entire model
            model_path = os.path.join(self._logdir, "model_%d.pt" % epoch)
            algo.save_model(model_path)

    @contextmanager
    def measure_time(self, name: str) -> Iterator[None]:
        """Context manager that records the elapsed wall time of its body as
        a ``time_<name>`` metric, even if the body raises."""
        name = "time_" + name
        start = time.time()
        try:
            yield
        finally:
            self.add_metric(name, time.time() - start)

    @property
    def logdir(self) -> str:
        """Directory where CSV/JSON artifacts are written."""
        return self._logdir

    @property
    def experiment_name(self) -> str:
        """Experiment name, possibly timestamp-suffixed."""
        return self._experiment_name
class Writer:
    """Thin wrapper around a TensorBoard SummaryWriter that prefixes every tag
    with a group name, tees stdout/stderr into the log directory, and adds
    JSON/text/checkpoint helpers.

    NOTE(review): the constructor globally redirects ``sys.stdout`` and
    ``sys.stderr`` and never restores them; the secondary log files are opened
    and kept open for the process lifetime.
    """

    # Original process streams, captured at class-definition time so Tee can
    # keep echoing to the real console after redirection.
    _STDOUT = sys.stdout
    _STDERR = sys.stderr

    def __init__(self, logdir, make_subdir, tag_group):
        """Open a SummaryWriter in *logdir* (or a timestamped subdir of it)
        and redirect stdout/stderr through Tee into files there.

        :param logdir: base log directory.
        :param make_subdir: create a "<logdir>/<MonDD_HH-MM-SS>" subdirectory.
        :param tag_group: prefix applied to every tag ("<group>/<tag>").
        """
        if make_subdir:
            os.makedirs(logdir, exist_ok=True)
            timestamp = f"{datetime.datetime.now().strftime('%b%d_%H-%M-%S')}"
            logdir = os.path.join(logdir, timestamp)
        self._writer = SummaryWriter(logdir=logdir)
        # Sanity check: the writer must not have silently picked another dir.
        assert logdir == self._writer.logdir
        self._logdir = logdir
        self._tag_group = tag_group

        # Mirror all console output into the run directory (append mode).
        sys.stdout = Tee(primary_file=self._STDOUT,
                         secondary_file=open(os.path.join(logdir, "stdout"), "a"))
        sys.stderr = Tee(primary_file=self._STDERR,
                         secondary_file=open(os.path.join(logdir, "stderr"), "a"))

    def write_scalar(self, tag, scalar_value, global_step=None):
        """Log a scalar under the group-prefixed tag."""
        self._writer.add_scalar(self._tag(tag), scalar_value, global_step=global_step)

    def write_image(self, tag, img_tensor, global_step=None):
        """Log an image tensor under the group-prefixed tag."""
        self._writer.add_image(self._tag(tag), img_tensor, global_step=global_step)

    def write_figure(self, tag, figure, global_step=None):
        """Log a matplotlib figure under the group-prefixed tag."""
        self._writer.add_figure(self._tag(tag), figure, global_step=global_step)

    def write_hparams(self, hparam_dict=None, metric_dict=None):
        """Log hyperparameters and their associated metrics (no tag prefix)."""
        self._writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)

    def write_json(self, tag, data):
        """Serialize *data* as JSON: into TensorBoard as an indented text
        entry (code-block formatting) and to "<logdir>/<tag>.json"."""
        text = json.dumps(data, indent=4)
        self._writer.add_text(
            self._tag(tag),
            4 * " " + text.replace("\n", "\n" + 4 * " ")  # Indent by 4 to ensure codeblock formatting
        )
        json_path = os.path.join(self._logdir, f"{tag}.json")
        with open(json_path, "w") as f:
            f.write(text)

    def write_textfile(self, tag, text):
        """Write raw *text* to "<logdir>/<tag>.txt" (not sent to TensorBoard)."""
        path = os.path.join(self._logdir, f"{tag}.txt")
        with open(path, "w") as f:
            f.write(text)

    def write_checkpoint(self, tag, data):
        """Atomically save *data* as "<logdir>/checkpoints/<tag>.pt".

        Writes to a ``.tmp`` file first, then renames into place.
        """
        os.makedirs(self._checkpoints_dir, exist_ok=True)
        checkpoint_path = self._checkpoint_path(tag)
        tmp_checkpoint_path = os.path.join(
            os.path.dirname(checkpoint_path),
            f"{os.path.basename(checkpoint_path)}.tmp")

        torch.save(data, tmp_checkpoint_path)
        # replace is atomic, so we guarantee our checkpoints are always good
        os.replace(tmp_checkpoint_path, checkpoint_path)

    def load_checkpoint(self, tag, device):
        """Load the checkpoint saved under *tag*, mapped onto *device*."""
        return torch.load(self._checkpoint_path(tag), map_location=device)

    def _checkpoint_path(self, tag):
        # Checkpoint file location for a given tag.
        return os.path.join(self._checkpoints_dir, f"{tag}.pt")

    @property
    def _checkpoints_dir(self):
        # All checkpoints live in a "checkpoints" subfolder of the run dir.
        return os.path.join(self._logdir, "checkpoints")

    def _tag(self, tag):
        # Group-qualified tag used by all write_* methods.
        return f"{self._tag_group}/{tag}"
class Logger():
    """Collects run hyper-parameters and named plot series for one experiment,
    persisting them as CSV files (and, optionally, TensorBoard summaries)
    under ``<logdir>/<logname>``.

    The run directory and its "tensorboard" subdirectory are created eagerly;
    creating the same run twice therefore raises ``FileExistsError``.
    """

    def __init__(self, logdir, logname):
        self.logdir = logdir
        assert (os.path.isdir(logdir))
        self.dir = os.path.join(logdir, logname)
        os.mkdir(self.dir)
        self.tensorboard_dir = os.path.join(self.dir, "tensorboard")
        os.mkdir(self.tensorboard_dir)
        self.tensorboard_writer = SummaryWriter(self.tensorboard_dir)
        self.params = {}
        self.plots = {}
        self.plots_columns = {}

    def update_params(self, params):
        """Merge *params* into the stored hyper-parameter dict."""
        self.params.update(params)

    def add_plot(self, name, columns):
        """Register a new, empty plot series with the given column names."""
        assert name not in self.plots
        self.plots[name], self.plots_columns[name] = [], columns

    def add_plot_point(self, name, point):
        """Append a single data point to an already-registered series."""
        self.plots[name].append(point)

    def get_plot(self, name):
        """Return the raw list of points recorded for *name*."""
        return self.plots[name]

    def save_logs(self):
        """Persist everything to disk (CSV only for now)."""
        self.save_csv()
        # Yet not use it
        # self.save_tensorboard()

    def save_model(self, model, name):
        """Serialize *model* under "<run dir>/models/<name>"."""
        models_path = os.path.join(self.dir, "models")
        os.makedirs(models_path, exist_ok=True)
        torch.save(model, os.path.join(models_path, name))

    def save_csv(self):
        """Dump every plot series and the parameter table as CSV files."""
        plot_path = os.path.join(self.dir, "plots")
        os.makedirs(plot_path, exist_ok=True)
        for series, points in self.plots.items():
            target = os.path.join(plot_path, series + ".csv")
            frame = pd.DataFrame(points, columns=self.plots_columns[series])
            frame.to_csv(target, index=False)
        params_path = os.path.join(self.dir, "params.csv")
        params_frame = pd.DataFrame(self.params.items(), columns=("name", "value"))
        params_frame.to_csv(params_path, index=False)

    def save_tensorboard(self):
        """Mirror params and plot series into TensorBoard (currently unused)."""
        self.tensorboard_writer.add_hparams(self.params, {})
        for series, points in self.plots.items():
            for step, value in enumerate(points):
                # skip barriers
                # if value in Barrier.values():
                #     continue
                # TODO fix ugly ifs
                # Tuple points carry the scalar in slot 2; plain points are
                # logged as-is.
                scalar = value[2] if isinstance(value, tuple) else value
                self.tensorboard_writer.add_scalar(series, scalar, step)
def main():
    """DSPN training/evaluation entry point.

    Parses CLI arguments, builds the network via ``model.build_net``, loads
    the chosen dataset, then trains (and/or evaluates) with ``run``, saving
    the best and final checkpoints under "logs/" and logging metrics and
    hyper-parameters to TensorBoard under "runs/<name>".
    """
    global net
    global test_loader
    global scatter

    parser = argparse.ArgumentParser()
    # generic params
    parser.add_argument(
        "--name",
        default=datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
        help="Name to store the log file as",
    )
    parser.add_argument("--resume", help="Path to log file to resume from")

    parser.add_argument("--encoder", default="FSEncoder", help="Encoder")
    parser.add_argument("--decoder", default="DSPN", help="Decoder")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs to train with")
    parser.add_argument("--latent", type=int, default=32, help="Dimensionality of latent space")
    parser.add_argument("--dim", type=int, default=64, help="Dimensionality of hidden layers")
    parser.add_argument("--lr", type=float, default=1e-2, help="Outer learning rate of model")
    parser.add_argument("--batch-size", type=int, default=12, help="Batch size to train with")
    parser.add_argument("--num-workers", type=int, default=0, help="Number of threads for data loader")
    # NOTE(review): "faces" is handled in the dataset dispatch below but is
    # missing from these choices, so that branch is unreachable from the CLI.
    parser.add_argument(
        "--dataset",
        choices=[
            "mnist", "clevr-box", "clevr-state", "cats", "merged", "wflw"
        ],
        help="Which dataset to use",
    )
    parser.add_argument(
        "--no-cuda",
        action="store_true",
        help="Run on CPU instead of GPU (not recommended)",
    )
    parser.add_argument("--train-only", action="store_true", help="Only run training, no evaluation")
    parser.add_argument("--eval-only", action="store_true", help="Only run evaluation, no training")
    parser.add_argument("--multi-gpu", action="store_true", help="Use multiple GPUs")
    parser.add_argument("--show", action="store_true", help="Plot generated samples in Tensorboard")
    parser.add_argument(
        "--show-skip",
        type=int,
        default=1,
        help="Number of epochs to skip before exporting to Tensorboard")
    parser.add_argument(
        "--infer-name",
        action="store_true",
        help="Automatically name run based on dataset/run number")
    parser.add_argument("--supervised", action="store_true", help="")
    parser.add_argument("--baseline", action="store_true", help="Use baseline model")
    parser.add_argument("--export-dir", type=str, help="Directory to output samples to")
    parser.add_argument("--export-n", type=int, default=10**9, help="How many samples to output")
    parser.add_argument(
        "--export-progress",
        action="store_true",
        help="Output intermediate set predictions for DSPN?",
    )
    parser.add_argument(
        "--full-eval",
        action="store_true",
        help="Use full evaluation set (default: 1/10 of evaluation data)",
        # don't need full evaluation when training to save some time
    )
    parser.add_argument(
        "--mask-feature",
        action="store_true",
        help="Treat mask as a feature to compute loss with",
    )
    parser.add_argument(
        "--inner-lr",
        type=float,
        default=800,
        help="Learning rate of DSPN inner optimisation",
    )
    parser.add_argument(
        "--iters",
        type=int,
        default=10,
        help="How many DSPN inner optimisation iteration to take",
    )
    parser.add_argument(
        "--huber-repr",
        type=float,
        default=1,
        help="Scaling of repr loss term for DSPN supervised learning",
    )
    parser.add_argument(
        "--loss",
        choices=["hungarian", "chamfer", "emd"],
        default="emd",
        help="Type of loss used",
    )
    parser.add_argument(
        "--export-csv",
        action="store_true",
        help="Only perform predictions, don't evaluate in any way")
    parser.add_argument("--eval-split", help="Overwrite split on test set")
    args = parser.parse_args()

    # Derive a run name: either auto-numbered per dataset ("dspn-cats-3") or
    # the user-supplied --name.
    if args.infer_name:
        if args.baseline:
            prefix = "base"
        else:
            prefix = "dspn"
        used_nums = []
        if not os.path.exists("runs"):
            os.makedirs("runs")
        runs = os.listdir("runs")
        for run in runs:
            if args.dataset in run:
                used_nums.append(int(run.split("-")[-1]))
        num = 1
        while num in used_nums:
            num += 1
        name = f"{prefix}-{args.dataset}-{num}"
    else:
        name = args.name
    print(f"Saving run to runs/{name}")
    train_writer = SummaryWriter(f"runs/{name}", purge_step=0)

    net = model.build_net(args)

    if not args.no_cuda:
        net = net.cuda()
    if args.multi_gpu:
        net = torch.nn.DataParallel(net)

    # Only optimize parameters that require gradients.
    optimizer = torch.optim.Adam(
        [p for p in net.parameters() if p.requires_grad], lr=args.lr)

    print("Building dataloader")
    if args.dataset == "mnist":
        dataset_train = data.MNISTSet(train=True, full=args.full_eval)
        dataset_test = data.MNISTSet(train=False, full=args.full_eval)
    elif args.dataset in ["clevr-box", "clevr-state"]:
        dataset_train = data.CLEVR("clevr", "train", box=args.dataset == "clevr-box", full=args.full_eval)
        dataset_test = data.CLEVR("clevr", "val", box=args.dataset == "clevr-box", full=args.full_eval)
    elif args.dataset == "cats":
        dataset_train = data.Cats("cats", "train", 9, full=args.full_eval)
        dataset_test = data.Cats("cats", "val", 9, full=args.full_eval)
    elif args.dataset == "faces":
        # NOTE(review): unreachable via the CLI — "faces" is not in --dataset
        # choices above.
        dataset_train = data.Faces("faces", "train", 4, full=args.full_eval)
        dataset_test = data.Faces("faces", "val", 4, full=args.full_eval)
    elif args.dataset == "wflw":
        if args.eval_split:
            eval_split = f"test_{args.eval_split}"
        else:
            eval_split = "test"
        dataset_train = data.WFLW("wflw", "train", 7, full=args.full_eval)
        dataset_test = data.WFLW("wflw", eval_split, 7, full=args.full_eval)
    elif args.dataset == "merged":
        # merged cats and human faces
        dataset_train_cats = data.Cats("cats", "train", 9, full=args.full_eval)
        dataset_train_wflw = data.WFLW("wflw", "train", 9, full=args.full_eval)
        dataset_test_cats = data.Cats("cats", "val", 9, full=args.full_eval)
        dataset_test_wflw = data.WFLW("wflw", "test", 9, full=args.full_eval)
        dataset_train = data.MergedDataset(dataset_train_cats, dataset_train_wflw)
        dataset_test = data.MergedDataset(dataset_test_cats, dataset_test_wflw)

    if not args.eval_only:
        train_loader = data.get_loader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)
    if not args.train_only:
        test_loader = data.get_loader(dataset_test, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False)

    # Running means: exponential for train metrics, plain for test metrics.
    tracker = track.Tracker(
        train_mae=track.ExpMean(),
        train_last=track.ExpMean(),
        train_loss=track.ExpMean(),
        test_mae=track.Mean(),
        test_last=track.Mean(),
        test_loss=track.Mean(),
    )

    if args.resume:
        log = torch.load(args.resume)
        weights = log["weights"]
        n = net
        if args.multi_gpu:
            # Unwrap DataParallel so keys match the saved state dict.
            n = n.module
        n.load_state_dict(weights, strict=True)

    if args.export_csv:
        # Accumulators shared with run() via closure.
        names = []
        predictions = []
        export_targets = []

    def run(net, loader, optimizer, train=False, epoch=0, pool=None):
        """One pass over *loader*: forward, loss, (optionally) backward, plus
        metric logging and the various export/visualisation paths."""
        writer = train_writer
        if train:
            net.train()
            prefix = "train"
            torch.set_grad_enabled(True)
        else:
            net.eval()
            prefix = "test"
            torch.set_grad_enabled(False)

        if args.export_dir:
            true_export = []
            pred_export = []

        iters_per_epoch = len(loader)
        loader = tqdm(
            loader,
            ncols=0,
            desc="{1} E{0:02d}".format(epoch, "train" if train else "test "),
        )

        # Global step keeps increasing across epochs.
        for i, sample in enumerate(loader, start=epoch * iters_per_epoch):
            # input is either a set or an image
            input, target_set, target_mask = map(lambda x: x.cuda(), sample)

            # forward evaluation through the network
            (progress, masks, evals, gradn), (y_enc, y_label) = net(input, target_set, target_mask)

            progress_only = progress

            # if using mask as feature, concat mask feature into progress
            if args.mask_feature:
                target_set = torch.cat(
                    [target_set, target_mask.unsqueeze(dim=1)], dim=1)
                progress = [
                    torch.cat([p, m.unsqueeze(dim=1)], dim=1)
                    for p, m in zip(progress, masks)
                ]

            if args.loss == "chamfer":
                # dim 0 is over the inner iteration steps
                # target set is broadcasted over dim 0
                set_loss = utils.chamfer_loss(torch.stack(progress), target_set.unsqueeze(0))
            elif args.loss == "hungarian":
                set_loss = utils.hungarian_loss(progress[-1], target_set, thread_pool=pool).unsqueeze(0)
            elif args.loss == "emd":
                set_loss = utils.emd(progress[-1], target_set).unsqueeze(0)

            # Only use representation loss with DSPN and when doing general
            # supervised prediction, not when auto-encoding
            if args.supervised and not args.baseline:
                repr_loss = args.huber_repr * F.smooth_l1_loss(y_enc, y_label)
                loss = set_loss.mean() + repr_loss.mean()
            else:
                loss = set_loss.mean()

            # restore progress variable to not contain masks for correct
            # exporting
            progress = progress_only

            # Outer optim step
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Tensorboard tracking of metrics for debugging
            tracked_last = tracker.update(f"{prefix}_last", set_loss[-1].item())
            tracked_loss = tracker.update(f"{prefix}_loss", loss.item())
            if train:
                writer.add_scalar("metric/set-loss", loss.item(), global_step=i)
                writer.add_scalar("metric/set-last", set_loss[-1].mean().item(), global_step=i)
                if not args.baseline:
                    # Inner-optimisation diagnostics only exist for DSPN.
                    writer.add_scalar("metric/eval-first", evals[0].mean().item(), global_step=i)
                    writer.add_scalar("metric/eval-last", evals[-1].mean().item(), global_step=i)
                    writer.add_scalar("metric/max-inner-grad-norm", max(g.item() for g in gradn), global_step=i)
                    writer.add_scalar("metric/mean-inner-grad-norm", sum(g.item() for g in gradn) / len(gradn), global_step=i)
                if args.supervised:
                    writer.add_scalar("metric/repr_loss", repr_loss.item(), global_step=i)

            # Print current progress to progress bar
            fmt = "{:.6f}".format
            loader.set_postfix(
                last=fmt(tracked_last),
                loss=fmt(tracked_loss),
                bad=fmt(evals[-1].detach().cpu().item() * 1000) if not args.baseline else 0)

            if args.export_dir:
                # export last inner optim of each input as csv
                # (one input per row)
                if args.export_csv:
                    # the second to last element are the last of the
                    # inner optim
                    for batch_i, p in enumerate(progress[-2]):
                        img_id = i * args.batch_size + batch_i
                        names.append(loader.iterable.dataset.get_fname(img_id))
                        m = masks[-2][batch_i]
                        m = m.cpu().detach().numpy().astype(bool)
                        p = p.cpu().detach().numpy()
                        p = p[:, m]
                        # Interleave the two coordinate rows into a flat
                        # (x0, y0, x1, y1, ...)-style list.
                        sample_preds = [
                            p[k % 2, k // 2] for k in range(p.shape[1] * 2)
                        ]
                        # remove values according to mask and add zeros to the
                        # end instead
                        sample_preds += [0] * (len(m) * 2 - len(sample_preds))
                        predictions.append(sample_preds)

                        # assumes target_set rows are (x, y, mask) — row 2 is
                        # the mask channel; TODO confirm against dataset.
                        true_mask = target_set[batch_i, 2, :].cpu().detach()
                        true_mask = true_mask.numpy().astype(bool)
                        trues = target_set[batch_i, :2, :]
                        trues = trues.cpu().detach().numpy()
                        t = trues[:, true_mask]
                        t = [t[k % 2, k // 2] for k in range(t.shape[1] * 2)]
                        t += [0] * (len(true_mask) * 2 - len(t))
                        export_targets.append(t)
                # Store predictions to be exported
                else:
                    if len(true_export) < args.export_n:
                        for p, m in zip(target_set, target_mask):
                            true_export.append(p.detach().cpu())
                        progress_steps = []
                        for pro, ms in zip(progress, masks):
                            # pro and ms are one step of the inner optim
                            # score boxes contains the list of predicted
                            # elements for one step
                            score_boxes = []
                            for p, m in zip(pro.cpu().detach(), ms.cpu().detach()):
                                score_box = torch.cat([m.unsqueeze(0), p], dim=0)
                                score_boxes.append(score_box)
                            progress_steps.append(score_boxes)
                        for b in zip(*progress_steps):
                            pred_export.append(b)

            # Plot predictions in Tensorboard
            if args.show and epoch % args.show_skip == 0 and not train:
                name = f"set/epoch-{epoch}/img-{i}"
                # thresholded set
                progress.append(progress[-1])
                masks.append((masks[-1] > 0.5).float())
                # target set
                if args.mask_feature:
                    # target set is augmented with masks, so remove them
                    progress.append(target_set[:, :-1])
                else:
                    progress.append(target_set)
                masks.append(target_mask)
                # intermediate sets
                for j, (s, ms) in enumerate(zip(progress, masks)):
                    if args.dataset == "clevr-state":
                        continue
                    if args.dataset.startswith("clevr"):
                        threshold = 0.5
                    else:
                        threshold = None
                    s, ms = utils.scatter_masked(
                        s,
                        ms,
                        binned=args.dataset.startswith("clevr"),
                        threshold=threshold)
                    # Last entry is the ground-truth target set.
                    if j != len(progress) - 1:
                        tag_name = f"{name}"
                    else:
                        tag_name = f"{name}-target"
                    if args.dataset == "clevr-box":
                        img = input[0].detach().cpu()
                        writer.add_image_with_boxes(tag_name, img, s.transpose(0, 1), global_step=j)
                    elif args.dataset == "cats" \
                            or args.dataset == "wflw" \
                            or args.dataset == "merged":
                        img = input[0].detach().cpu()
                        fig = plt.figure()
                        # presumably 128x128 images — scatter in pixel coords.
                        plt.scatter(s[0, :] * 128, s[1, :] * 128)
                        plt.imshow(np.transpose(img, (1, 2, 0)))
                        writer.add_figure(tag_name, fig, global_step=j)
                    else:  # mnist
                        fig = plt.figure()
                        y, x = s
                        y = 1 - y
                        ms = ms.numpy()
                        # Point alpha encodes the mask value.
                        rgba_colors = np.zeros((ms.size, 4))
                        rgba_colors[:, 2] = 1.0
                        rgba_colors[:, 3] = ms
                        plt.scatter(x, y, color=rgba_colors)
                        plt.axes().set_aspect("equal")
                        plt.xlim(0, 1)
                        plt.ylim(0, 1)
                        writer.add_figure(tag_name, fig, global_step=j)

        # Export predictions
        if args.export_dir and not args.export_csv:
            os.makedirs(f"{args.export_dir}/groundtruths", exist_ok=True)
            os.makedirs(f"{args.export_dir}/detections", exist_ok=True)
            for i, (gt, dets) in enumerate(zip(true_export, pred_export)):
                export_groundtruths_path = os.path.join(
                    args.export_dir, "groundtruths", f"{i}.txt")
                with open(export_groundtruths_path, "w") as fd:
                    for box in gt.transpose(0, 1):
                        if (box == 0).all():
                            # Zero rows are padding, not real boxes.
                            continue
                        s = "box " + " ".join(map(str, box.tolist()))
                        fd.write(s + "\n")
                if args.export_progress:
                    for step, det in enumerate(dets):
                        export_progress_path = os.path.join(
                            args.export_dir, "detections", f"{i}-step{step}.txt")
                        with open(export_progress_path, "w") as fd:
                            for sbox in det.transpose(0, 1):
                                s = f"box " + " ".join(map(str, sbox.tolist()))
                                fd.write(s + "\n")
                export_path = os.path.join(args.export_dir, "detections", f"{i}.txt")
                with open(export_path, "w") as fd:
                    for sbox in dets[-1].transpose(0, 1):
                        s = f"box " + " ".join(map(str, sbox.tolist()))
                        fd.write(s + "\n")

    # Record the current commit so saved results are reproducible.
    import subprocess
    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    # git_hash = "483igtrfiuey46"

    torch.backends.cudnn.benchmark = True

    metrics = {}
    start = time.time()
    if args.eval_only:
        tracker.new_epoch()
        with mp.Pool(10) as pool:
            run(net, test_loader, optimizer, train=False, epoch=0, pool=pool)
        metrics["test_loss"] = np.mean(tracker.data["test_loss"][-1])
        metrics["test_set_loss"] = np.mean(tracker.data["test_last"][-1])
    else:
        best_test_loss = float("inf")
        for epoch in range(args.epochs):
            tracker.new_epoch()
            with mp.Pool(10) as pool:
                run(net, train_loader, optimizer, train=True, epoch=epoch, pool=pool)
                if not args.train_only:
                    run(net, test_loader, optimizer, train=False, epoch=epoch, pool=pool)
            epoch_test_loss = np.mean(tracker.data["test_loss"][-1])
            if epoch_test_loss < best_test_loss:
                print("new best loss")
                best_test_loss = epoch_test_loss
                # only save if the epoch has lower loss
                metrics["test_loss"] = epoch_test_loss
                metrics["train_loss"] = np.mean(tracker.data["train_loss"][-1])
                metrics["train_set_loss"] = np.mean(
                    tracker.data["train_last"][-1])
                metrics["test_set_loss"] = np.mean(
                    tracker.data["test_last"][-1])
                metrics["best_epoch"] = epoch
                results = {
                    "name": name + "-best",
                    "tracker": tracker.data,
                    "weights": net.state_dict() if not args.multi_gpu else net.module.state_dict(),
                    "args": vars(args),
                    "hash": git_hash,
                }
                torch.save(results, os.path.join("logs", name + "-best"))
        # Always save the final state regardless of its loss.
        results = {
            "name": name + "-final",
            "tracker": tracker.data,
            "weights": net.state_dict() if not args.multi_gpu else net.module.state_dict(),
            "args": vars(args),
            "hash": git_hash,
        }
        torch.save(results, os.path.join("logs", name + "-final"))

    if args.export_csv and args.export_dir:
        path = os.path.join(args.export_dir, f'{args.name}-predictions.csv')
        pd.DataFrame(np.array(predictions), index=names).to_csv(path, sep=',', index=names, header=False)
        path = os.path.join(args.export_dir, f'{args.name}-targets.csv')
        pd.DataFrame(np.array(export_targets), index=names).to_csv(path, sep=',', index=names, header=False)

    took = time.time() - start
    print(f"Process took {took:.1f}s, avg {took/args.epochs:.1f} s/epoch.")

    # save hyper parameters to tensorboard for reference
    hparams = {k: v for k, v in vars(args).items() if v is not None}
    print(metrics)
    metrics = {"total_time": took, "avg_time_per_epoch": took / args.epochs}
    print("writing hparams")
    train_writer.add_hparams(hparams, metric_dict=metrics, name="hparams")
def run_exp(first_n, lr, weight_decay, cross_ent_weight, batch_size, np_th_seed,
            debug, n_epochs, n_mixes, output_dir, scale_2_cross_ent,
            mask_for_cross_ent, nll_weight, linear_classifier, flow_gmm,
            flow_coupling):
    """Fine-tune a pretrained Glow model on CIFAR-10 with a combined
    cross-entropy + NLL objective and log results to TensorBoard.

    :param first_n: restrict the dataset to the first N samples (None = all).
    :param lr: Adam learning rate (also used for the linear classifier head).
    :param weight_decay: Adam weight decay.
    :param cross_ent_weight: weight of the cross-entropy loss term.
    :param batch_size: training batch size.
    :param np_th_seed: numpy/torch random seed.
    :param debug: shrink dataset/batch/epochs for a quick run and skip saving.
    :param n_mixes: number of mixture components for PerDimWeightedMix dists.
    :param output_dir: directory for TensorBoard logs and saved models.
    :param scale_2_cross_ent: use the third (scale-2) latent's per-dim
        log-probs for the cross-entropy term.
    :param mask_for_cross_ent: learn a sigmoid mask over the 768 per-dim
        log-probs used for cross-entropy.
    :param nll_weight: weight of the NLL loss term.
    :param linear_classifier: use a linear head on the third latent instead of
        the distribution log-probs for cross-entropy.
    :param flow_gmm: use fixed class-independent Gaussians instead of
        trainable per-dim mixtures.
    :param flow_coupling: 'additive' or 'affine'; selects the pretrained
        checkpoint to load.
    :returns: dict with train/valid NLL (bits/dim) and accuracy of the last
        epoch.
    """
    # Snapshot of the call arguments for HParams logging (must be the first
    # statement so locals() contains only the parameters).
    hparams = {k: v for k, v in locals().items() if v is not None}
    if debug:
        # Tiny configuration for smoke-testing the pipeline.
        first_n = 512
        batch_size = 10
        n_epochs = 5
    set_random_seeds(np_th_seed, True)
    writer = SummaryWriter(output_dir)
    writer.add_hparams(hparams, metric_dict={}, name=output_dir)
    writer.flush()

    model = create_glow_model(
        hidden_channels=512,
        K=32,
        L=3,
        flow_permutation='invconv',
        flow_coupling=flow_coupling,
        LU_decomposed=True,
        n_chans=3,
        block_type='conv',
        use_act_norm=True)
    # Load the matching pretrained checkpoint.
    # NOTE(review): hard-coded absolute paths — consider making configurable.
    if flow_coupling == 'additive':
        state_dict = th.load(
            '/home/schirrmr/data/exps/invertible/additive/7/state_dicts_model_250.pth'
        )
    else:
        assert flow_coupling == 'affine'
        state_dict = th.load(
            '/home/schirrmr/data/exps/invertible/finetune//12/state_dicts_model_76.pth'
        )
    # The checkpoint stores loc/log_scale with extra singleton dims; squeeze
    # them in place so they match the model's parameter shapes.
    for key in state_dict.keys():
        if 'loc' in key or 'log_scale' in key:
            state_dict[key].squeeze_()
    model.load_state_dict(state_dict)
    del state_dict
    pre_dist_model = convert_glow_to_pre_dist_model(model, as_list=True)
    del model

    # Per-scale class-conditional distributions over the three Glow latents
    # (3072/2, 3072/4, 3072/4 dims for the three scales).
    if flow_gmm:
        dist0 = NClassIndependentDist(10, n_dims=3072 // 2, optimize_mean=False, optimize_std=False)
        dist1 = NClassIndependentDist(10, n_dims=3072 // 4, optimize_mean=False, optimize_std=False)
        dist2 = NClassIndependentDist(10, n_dims=3072 // 4, optimize_mean=False, optimize_std=False)
        # Random fixed class means (not optimized).
        dist0.class_means.normal_(mean=0, std=1)
        dist1.class_means.normal_(mean=0, std=1)
        dist2.class_means.normal_(mean=0, std=1)
    else:
        init_dist_std = 1e-1
        dist0 = PerDimWeightedMix(10, n_mixes=n_mixes, n_dims=3072 // 2,
                                  optimize_mean=True, optimize_std=True,
                                  init_std=init_dist_std)
        dist1 = PerDimWeightedMix(10, n_mixes=n_mixes, n_dims=3072 // 4,
                                  optimize_mean=True, optimize_std=True,
                                  init_std=init_dist_std)
        dist2 = PerDimWeightedMix(10, n_mixes=n_mixes, n_dims=3072 // 4,
                                  optimize_mean=True, optimize_std=True,
                                  init_std=init_dist_std)
    model = Node(pre_dist_model, ApplyToList(dist0, dist1, dist2))
    net = model.cuda()
    init_all_modules(net, None)

    if mask_for_cross_ent:
        # Learnable logits of a per-dimension mask over the 768 scale-2 dims.
        alphas_mask = th.zeros(768, requires_grad=True, device='cuda')
    if linear_classifier:
        clf = th.nn.Linear(768, 10).cuda()

    train_loader, valid_loader = load_train_test(
        'cifar10',
        shuffle_train=True,
        drop_last_train=True,
        batch_size=batch_size,
        eval_batch_size=256,
        n_workers=8,
        first_n=first_n,
        augment=True,
        exclude_cifar_from_tiny=False,
    )

    optim = th.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
    if mask_for_cross_ent:
        # The mask gets its own, larger learning rate and no weight decay.
        optim.add_param_group(
            dict(params=[alphas_mask], lr=5e-2, weight_decay=0))
    if linear_classifier:
        optim.add_param_group(
            dict(params=clf.parameters(), lr=lr, weight_decay=weight_decay))

    def get_lp_for_cross_ent(z, lp, net, scale_2_cross_ent, mask_for_cross_ent):
        """Return per-class logits for the cross-entropy term, depending on
        which classification head variant is active."""
        dists = list(net.module.module_list.children())
        if linear_classifier:
            lp_for_cross_ent = clf(z[2])
        else:
            if scale_2_cross_ent:
                # Per-dimension log-probs of the scale-2 distribution.
                lp_for_cross_ent = dists[2](z[2], fixed=dict(sum_dims=False))[1]
                if mask_for_cross_ent:
                    mask = th.sigmoid(alphas_mask)
                    lp_for_cross_ent = lp_for_cross_ent * mask.unsqueeze(
                        0).unsqueeze(0)
                lp_for_cross_ent = lp_for_cross_ent.sum(dim=-1)
            else:
                lp_for_cross_ent = lp
        return lp_for_cross_ent

    # Epoch 0 is evaluation-only (no training step on the first iteration).
    for i_epoch in range(n_epochs + 1):
        if i_epoch > 0:
            for X, y in train_loader:
                y = y.cuda()
                # Uniform dequantization noise in [0, 1/256).
                noise = th.rand_like(X) * 1 / 256.0
                noised = X + noise
                z, lp = net(noised.cuda(), fixed=dict(y=None))
                lp_for_cross_ent = get_lp_for_cross_ent(
                    z, lp, net, scale_2_cross_ent, mask_for_cross_ent)
                # assumes y is one-hot encoded (argmax recovers the label) —
                # TODO confirm against load_train_test.
                cross_ent = th.nn.functional.cross_entropy(
                    lp_for_cross_ent,
                    y.argmax(dim=1),
                )
                nll = -th.mean(th.sum(lp * y, dim=1))
                loss = cross_ent_weight * cross_ent + nll_weight * nll
                optim.zero_grad()
                loss.backward()
                optim.step()
                optim.zero_grad()
                # Free per-batch tensors eagerly to reduce peak GPU memory.
                del y, noise, noised, lp, cross_ent, nll, loss
        print(i_epoch)
        results = {}
        with th.no_grad():
            for name, loader in (('Train', train_loader),
                                 ('Valid', valid_loader)):
                all_lps = []
                all_corrects = []
                for X, y in loader:
                    y = y.cuda()
                    # First with noise to get nll for bpd,
                    # then without noise for accuracy
                    noise = th.rand_like(X) * 1 / 256.0
                    noised = X + noise
                    noise_log_prob = np.log(256) * np.prod(X.shape[1:])
                    z, lp = net(noised.cuda())
                    lps = to_numpy(th.sum(lp * y, dim=1) - noise_log_prob)
                    all_lps.extend(lps)
                    z, lp = net(X.cuda() + (1 / (2 * 256.0)))
                    lp_for_cross_ent = get_lp_for_cross_ent(
                        z, lp, net, scale_2_cross_ent, mask_for_cross_ent)
                    corrects = to_numpy(
                        y.argmax(dim=1) == lp_for_cross_ent.argmax(dim=1))
                    all_corrects.extend(corrects)
                acc = np.mean(all_corrects)
                # Convert mean log-prob to bits per dimension.
                nll = -(np.mean(all_lps) / (np.prod(X.shape[1:]) * np.log(2)))
                print(f"{name} NLL: {nll:.2f}")
                print(f"{name} Acc: {acc:.1%}")
                results[f"{name.lower()}_nll"] = nll
                results[f"{name.lower()}_acc"] = acc
                writer.add_scalar(f"{name.lower()}_nll", nll, i_epoch)
                writer.add_scalar(f"{name.lower()}_acc", acc * 100, i_epoch)
                del noise, noised, z, lp, lps
            writer.flush()
            sys.stdout.flush()

    if not debug:
        # FIX: previously these passed never-closed `open(path, 'wb')` handles
        # to th.save, leaking file descriptors; torch.save accepts a path.
        dict_path = os.path.join(output_dir, "model_dict.th")
        th.save(net.state_dict(), dict_path)
        if mask_for_cross_ent:
            mask_path = os.path.join(output_dir, "alphas_mask.th")
            th.save(alphas_mask, mask_path)
        model_path = os.path.join(output_dir, "model.th")
        th.save(net, model_path)
    return results
def main(): """ Run the experiment. """ # TensorboardX tbx_writer = SummaryWriter(comment="pendulum_naive_AIRL") tbx_writer.add_hparams(vars(args), {}) # env related env = gym.make("Pendulum-v0") feature_extractor = IdentityFeatureExtractor() state_size = feature_extractor.extract_features(env.reset()).shape[0] # rl related replay_buffer = ReplayBuffer(args.replay_buffer_length) rl = SoftActorCritic( env, replay_buffer, feature_extractor, args.replay_buffer_sample_size, entropy_target=args.entropy_target, entropy_tuning=args.disable_entropy_tuning, tau=args.tau, log_alpha=args.log_alpha, play_interval=args.play_interval, tbx_writer=tbx_writer, learning_rate=1e-3, ) # irl related expert_policy = PolicyNetwork(state_size, env.action_space, NN_HIDDEN_WIDTH) expert_policy.load("../pendulum_policies/pendulum_expert.pt") expert = PolicyExpert(expert_policy, env, args.num_expert_trajs, args.max_env_steps) expert_states = expert.get_expert_states() expert_actions = expert.get_expert_actions() irl = NaiveAIRL( rl, env, expert_states, expert_actions, tbx_writer=tbx_writer, learning_rate=1e-3, ) irl.train( args.irl_episodes, args.irl_traj_per_ep, args.max_env_steps, args.irl_num_policy_updates, ) import pdb pdb.set_trace()