def visualize(logging_file_path):
    with open(logging_file_path) as log_file:
        log_entries = log_file.readlines()
    writer = SummaryWriter()
    n_iter = 0
    config_dict = dict()
    score_dict = dict()
    for log_entry in log_entries:
        if re.match(r'\[INFO\]', log_entry) is not None:  # start parsing
            # print("new iteration:")
            n_iter += 1
            config_dict = dict()
            score_dict = dict()
        elif re.match(', result is:',
                      log_entry) is None:  # continue parsing configuration
            search_obj = re.search(r'(.*), Value: (.*)', log_entry)
            config_dict[str(search_obj.group(1))] = float(search_obj.group(2))
            # print('key is: ', str(search_obj.group(1)), ' value is : ', config_dict[str(search_obj.group(1))])
        else:  # parsing performance and end
            search_obj = re.search(', result is: (.*)', log_entry)
            score_dict['_perf'] = float(search_obj.group(1))
            # print("end parsing------------")
            # print("config_dict = ", config_dict)
            # print("score_dicr = ", score_dict)
            writer.add_hparams(config_dict,
                               score_dict,
                               name="trial" + str(n_iter))
            writer.add_scalar('_perf', score_dict['_perf'], n_iter)
            writer.add_scalars('data/timeline', score_dict, n_iter)

    # writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
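For reference, a hypothetical log excerpt in the format this parser appears to expect; the literal "[INFO]" prefix, the ", Value:" separator and the ", result is:" marker are assumptions inferred from the regular expressions above:

[INFO] evaluating a new configuration
learning_rate, Value: 0.001
batch_size, Value: 64
, result is: 0.8732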
Example #2
def main():
    _current_datetime = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    ## arguments ##
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # train
    parser.add_argument('--warmup', default=10000, type=int,
                        help='number of warmup steps')
    parser.add_argument('--episode', default=1200, type=int,
                        help='upper limit of training episodes')
    parser.add_argument('--capacity', default=10000, type=int,
                        help='capacity of replay buffer')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='mini-batch size sampled from the replay buffer')
    parser.add_argument('--lr', default=.0005, type=float,
                        help='learning rate')
    parser.add_argument('--eps_decay', default=.995, type=float,
                        help='epsilon decay rate')
    parser.add_argument('--eps_min', default=.01, type=float,
                        help='lower bound of epsilon')
    parser.add_argument('--gamma', default=.99, type=float,
                        help='discount factor gamma for Q-value updates')
    parser.add_argument('--freq', default=4, type=int,
                        help='interval to update behavior network')
    parser.add_argument('--target_freq', default=1000, type=int,
                        help='interval to update target network')
    # test
    parser.add_argument('--test_only', action='store_true',
                        help='run testing only (skip training)')
    parser.add_argument('--render', default=False, action='store_true',
                        help='render display')
    parser.add_argument('--test_epsilon', default=.001, type=float,
                        help='test epsilon')
    # utilities
    parser.add_argument('-d', '--device', default='cuda',
                        help='device used for training / testing')
    parser.add_argument('-m', '--model', default='models/ddqn-{}.pth'.format(_current_datetime),
                        help='path to pretrained model / model save path')
    parser.add_argument('--logdir', default='log/ddqn/{}'.format(_current_datetime),
                        help='path to tensorboard log')
    parser.add_argument('--seed', default=2021111, type=int,
                        help='random seed')
    args = parser.parse_args()

    ## main ##
    env_name = 'LunarLander-v2'
    agent = DDQN(args)
    writer = SummaryWriter(args.logdir)
    if not args.test_only:
        os.makedirs('checkpoints', exist_ok=True)
        os.makedirs('models', exist_ok=True)
        ewma_reward = train(args, env_name, agent, writer)
        writer.add_hparams(args.__dict__,{'hparams/Ewma Reward': ewma_reward})
        agent.save(args.model)
    agent.load(args.model)
    test(args, env_name, agent, writer)
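For reference, hypothetical invocations of this script (the file name ddqn.py is assumed; the flags are the ones defined above):

python ddqn.py --episode 1200 --lr 0.0005 --logdir log/ddqn/run1
python ddqn.py --test_only --render -m models/ddqn-latest.pth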
class TensorboardX:
    def __init__(self, logdir, run_name):
        self._logdir = logdir
        self._writer = SummaryWriter(logdir=logdir)
        self.run_name = run_name

    def args(self, arg_text):
        self._writer.add_text("args", arg_text)

    def meta(self, params):
        self._writer.add_hparams(hparam_dict=params, metric_dict={})

    def log(self, name, value, step):
        self._writer.add_scalar(name, value, step)
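A minimal usage sketch for this wrapper; the log directory, run name, and logged values are made up for illustration:

tb = TensorboardX(logdir="runs/demo", run_name="demo")
tb.args("--lr 0.001 --batch_size 64")
tb.meta({"lr": 0.001, "batch_size": 64})
for step in range(100):
    tb.log("train/loss", 1.0 / (step + 1), step)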
Example #4
    def init_tensorboard(self):
        from tensorboardX import SummaryWriter

        writer = SummaryWriter()

        data = {
            "agent_config": self._agent_config,
            "exp_config": self._exp_config
        }

        # each add_hparams call below records its own hparams entry in TensorBoard
        writer.add_hparams(self._exp_config, {})
        writer.add_hparams(self._agent_config, {})

        return writer
Example #5
def visualize(logging_file_path):
    print('---------entering visualize-----------')
    with open(logging_file_path) as log_file:
        log_entries = log_file.readlines()
    # writer = SummaryWriter(write_to_disk=False)             # create the SummaryWriter Object
    writer = SummaryWriter(
        write_to_disk=True)  # create the SummaryWriter Object
    n_iter = 0
    config_dict = dict()
    score_dict = dict()
    score_bound_dict = dict()
    for log_entry in log_entries:
        if re.match(r'\[INFO\]', log_entry) is not None:  # start parsing
            # print("new iteration:")
            n_iter += 1
            config_dict = dict()
            score_dict = dict()
        elif re.match(', result is:',
                      log_entry) is None:  # continue parsing configuration
            search_obj = re.search(r'(.*), Value: (.*)', log_entry)
            if search_obj is None:
                search_obj = re.search(r'(.*), Constant: (.*)', log_entry)
            config_dict[str(search_obj.group(1))] = float(search_obj.group(2))
            print('key is: ', str(search_obj.group(1)), ' value is : ',
                  config_dict[str(search_obj.group(1))])
        else:  # parsing performance and end
            search_obj = re.search(', result is: (.*)', log_entry)
            score_dict['_perf'] = float(search_obj.group(1))
            if n_iter == 1:
                lower_bound = score_dict['_perf']
                score_bound_dict = {'lower_bound': lower_bound}
            else:
                lower_bound = min(lower_bound, score_dict['_perf'])
                score_bound_dict = {'lower_bound': lower_bound}

            # print("end parsing------------")
            # print("config_dict = ", config_dict)
            # print("score_dicr = ", score_dict)
            writer.add_hparams(config_dict,
                               score_dict,
                               name="trial" + str(n_iter))
            writer.add_scalar('_perf', score_dict['_perf'], n_iter)
            writer.add_scalars('data/score_bound', score_bound_dict, n_iter)

    # writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #6
def main(cfg: omegaconf.DictConfig):

	# create the environment
	env = atari_wrappers.make_env(cfg.exp.env)
	env = gym.wrappers.Monitor(env, "recording/", force=True)
	obs = env.reset()

	# TensorBoard
	writer = SummaryWriter()
	writer.add_hparams(flatten_dict(cfg), {})
	logger.info(f'Hyperparams: {cfg}')

	# create the agent
	agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer, cfg=cfg)

	n_games = 0
	max_mean_40_reward = -sys.maxsize

	# Play MAX_N_GAMES games
	while n_games < cfg.train.max_episodes:
		# act epsilon-greedily
		action = agent.act_eps_greedy(obs)

		# one step on the environment
		new_obs, reward, done, _ = env.step(action)

		# add the environment feedback to the agent
		agent.add_env_feedback(obs, action, new_obs, reward, done)

		# sample and optimize (NB: the agent may wait until it has enough memories)
		agent.sample_and_optimize(cfg.train.batch_size)

		obs = new_obs
		if done:
			n_games += 1
			agent.print_info()
			agent.reset_stats()
			obs = env.reset()
			if agent.rewards:
				current_mean_40_reward = np.mean(agent.rewards[-40:])
				if current_mean_40_reward > max_mean_40_reward:
					max_mean_40_reward = current_mean_40_reward
					agent.save_model(cfg.train.best_checkpoint)
	writer.close()
Example #7
class TensorboardSession(Session):
    writer: SummaryWriter

    def __init__(self, source_paths: Union[List[str], str], **kwargs) -> None:
        self.writer = SummaryWriter()
        source_md = []
        for path in source_paths:
            if isfile(path):
                with open(path, mode="r") as fs:
                    source_md.append(f"* {basename(path)}\n\n```python")
                    source_md.append(fs.read())
                    source_md.append("```\n\n")
            else:
                print(f"TensorboardSession: Warning, no such file - {path}")
        self.writer.add_text("Source codes", "\n".join(source_md))

    def log_parameters(self, params: Dict[str, Any]) -> None:
        # add_hparams requires both an hparam dict and a metric dict
        self.writer.add_hparams(params, {})

    def log_metric(self, val_name: str, value: Any) -> None:
        self.writer.add_scalar(val_name, value)
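A minimal usage sketch for this session class; the file names and values are illustrative, and the Session base class is assumed to come from the surrounding module:

session = TensorboardSession(["train.py", "model.py"])
session.log_parameters({"lr": 0.001, "optimizer": "adam"})
session.log_metric("loss", 0.42)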
Example #8
class Logger(object):
    """
    Logger class to use tensorboard to visualize ANNarchy simulations. Requires the `tensorboardX` package (pip install tensorboardX). 

    The Logger class is a thin wrapper around tensorboardX.SummaryWriter, which you could also use directly. The doc is available at <https://tensorboardx.readthedocs.io/>. Tensorboard can read any logging data, as long as they are saved in the right format (tfevents), so it is not limited to tensorflow. TensorboardX has been developed to allow the use of tensorboard with pytorch.

    The extension has to be imported explicitly:

    ```python
    from ANNarchy.extensions.tensorboard import Logger
    ```

    The ``Logger`` class has to be closed properly at the end of the script, so it is advised to use a context manager:

    ```python
    with Logger() as logger:
        logger.add_scalar("Accuracy", acc, trial)
    ```

    You can also make sure to close it:

    ```python
    logger = Logger()
    logger.add_scalar("Accuracy", acc, trial)
    logger.close()
    ```

    By default, the logs will be written in a subfolder of ``./runs/`` (which will be created in the current directory). 
    The subfolder is a combination of the current datetime and of the hostname, e.g. ``./runs/Apr22_12-11-22_machine``. 
    You can control these two elements by passing arguments to ``Logger()``:

    ```python
    with Logger(logdir="/tmp/annarchy", experiment="trial1"): # logs in /tmp/annarchy/trial1
    ```

    The ``add_*`` methods allow you to log various structures, such as scalars, images, histograms, figures, etc.

    A tag should be given to each plot. In the example above, the figure with the accuracy will be labelled "Accuracy" in tensorboard. 
    You can also group plots together with tags such as "Global performance/Accuracy", "Global performance/Error rate", "Neural activity/Population 1", etc.

    After (or while) logging data within your simulation, run `tensorboard` in the terminal by specifying the log directory:

    ```bash
    tensorboard --logdir runs
    ```

    TensorboardX enqueues the data in memory before writing to disk. You can force flushing with:

    ```python
    logger.flush()
    ```

    """
    def __init__(self, logdir="runs/", experiment=None):
        """
        :param logdir: path (absolute or relative) to the logging directory. Subfolders will be created for each individual run. The default is "runs/"
        :param experiment: name of the subfolder for the current run. By default, it is a combination of the current time and the hostname (e.g. Apr22_12-11-22_machine). If you reuse an experiment name, the data will be appended.
        """
        self.logdir = logdir
        self.experiment = experiment

        # Create the logdir if it does not exist
        if not os.path.exists(self.logdir):
            os.makedirs(self.logdir)

        if not experiment:
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            self.currentlogdir = os.path.join(
                self.logdir, current_time + '_' + socket.gethostname())
        else:
            self.currentlogdir = os.path.join(self.logdir, self.experiment)

        print("Logging in", self.currentlogdir)

        self._create_summary_writer()

    def _create_summary_writer(self):

        self._summary = SummaryWriter(self.currentlogdir,
                                      comment="",
                                      purge_step=None,
                                      max_queue=10,
                                      flush_secs=10,
                                      filename_suffix='',
                                      write_to_disk=True)

    # Logging methods

    def add_scalar(self, tag, value, step=None):
        """
        Logs a single scalar value, e.g. a success rate at various stages of learning.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                accuracy = ...
                logger.add_scalar("Accuracy", accuracy, trial)
        ```

        :param tag: name of the figure in tensorboard.
        :param value: value.
        :param step: time index.
        """

        self._summary.add_scalar(tag=tag,
                                 scalar_value=value,
                                 global_step=step,
                                 walltime=None)

    def add_scalars(self, tag, value, step=None):
        """
        Logs multiple scalar values to be displayed in the same figure, e.g. several metrics or neural activities.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                act1 = pop.r[0]
                act2 = pop.r[1]
                logger.add_scalars(
                    "Accuracy", 
                    {'First neuron': act1, 'Second neuron': act2}, 
                    trial)
        ```

        :param tag: name of the figure in tensorboard.
        :param value: dictionary of values.
        :param step: time index.
        """

        self._summary.add_scalars(main_tag=tag,
                                  tag_scalar_dict=value,
                                  global_step=step,
                                  walltime=None)

    def add_image(self, tag, img, step=None, equalize=False):
        """
        Logs an image.
        
        The image must be a numpy array of size (height, width) for monochrome images or (height, width, 3) for colored images. The values should either be integers between 0 and 255 or floats between 0 and 1. The parameter ``equalize`` forces the values to be between 0 and 1 by equalizing using the min/max values.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                img = pop.r.reshape((10, 10))
                logger.add_image("Population / Firing rate", img, trial, equalize=True)
        ```

        :param tag: name of the figure in tensorboard.
        :param img: array for the image.
        :param step: time index.
        :param equalize: rescales the pixels between 0 and 1 using the min and max values of the array.
        """
        if img.ndim == 2:
            if equalize:
                img = img.astype(float)
                img = (img - img.min()) / (img.max() - img.min())

            self._summary.add_image(tag=tag,
                                    img_tensor=img,
                                    global_step=step,
                                    walltime=None,
                                    dataformats='HW')

        elif img.ndim == 3:
            if not img.shape[2] == 3:
                Global._error(
                    "Logger.add_image: color images must be of shape (H, W, 3)."
                )

            if equalize:
                img = np.array(img).astype(float)
                img = (img - img.min()) / (img.max() - img.min())

            self._summary.add_image(tag=tag,
                                    img_tensor=img,
                                    global_step=step,
                                    walltime=None,
                                    dataformats='HWC')

        else:
            Global._error(
                "Logger.add_image: images must be of shape (H, W) or (H, W, 3)."
            )

    def add_images(self,
                   tag,
                   img,
                   step=None,
                   equalize=False,
                   equalize_per_image=False):
        """
        Logs a set of images (e.g. receptive fields).
       
        The numpy array must be of size (number, height, width) for monochrome images or (number, height, width, 3) for colored images. The values should either be integers between 0 and 255 or floats between 0 and 1. The parameter ``equalize`` forces the values to be between 0 and 1 by equalizing using the min/max values.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                weights= proj.w.reshape(100, 10, 10) # 100 post neurons, 10*10 pre neurons
                logger.add_images("Projection/Receptive fields", weights, trial, equalize=True)
        ```

        :param tag: name of the figure in tensorboard.
        :param img: array for the images.
        :param step: time index.
        :param equalize: rescales the pixels between 0 and 1 using the min and max values of the array.
        :param equalize_per_image: whether the rescaling should use the global min/max values of the array, or per-image values. Has no effect if ``equalize`` is False.
 
        """
        if img.ndim == 3:
            img = np.expand_dims(img, axis=3)

        if equalize:
            img = np.array(img).astype(float)
            if not equalize_per_image:
                img = (img - img.min()) / (img.max() - img.min())
            else:
                for i in range(img.shape[0]):
                    img[i, ...] = (img[i, ...] - img[i, ...].min()) / (
                        img[i, ...].max() - img[i, ...].min())

        self._summary.add_images(tag=tag,
                                 img_tensor=img,
                                 global_step=step,
                                 walltime=None,
                                 dataformats='NHWC')

    def add_parameters(self, params, metrics):
        """
        Logs parameters of a simulation.

        This should be run only once per simulation, generally at the end. 
        This allows to compare different runs of the same network using 
        different parameter values and study how they influence the global output metrics, 
        such as accuracy, error rate, reaction speed, etc.

        Example:

        ```python
        with Logger() as logger:
            # ...
            logger.add_parameters({'learning_rate': lr, 'tau': tau}, {'accuracy': accuracy})
        ```

        :param params: dictionary of parameters.
        :param metrics: dictionary of metrics.
        """

        self._summary.add_hparams(params, metrics)

    def add_histogram(self, tag, hist, step=None):
        """
        Logs a histogram.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                weights= proj.w.flatten()
                logger.add_histogram("Weight distribution", weights, trial)
        ```


        :param tag: name of the figure in tensorboard.
        :param hist: a list or 1D numpy array of values.
        :param step: time index.
        """

        self._summary.add_histogram(tag, hist, step)

    def add_figure(self, tag, figure, step=None, close=True):
        """
        Logs a Matplotlib figure.

        Example:

        ```python
        with Logger() as logger:
            for trial in range(100):
                simulate(1000.0)
                fig = plt.figure()
                plt.plot(pop.r)
                logger.add_figure("Activity", fig, trial)
        ```

        :param tag: name of the image in tensorboard.
        :param figure: a Matplotlib figure.
        :param step: time index.
        :param close: whether the logger will close the figure when done (default: True).
        """

        import matplotlib.pyplot as plt
        import matplotlib.backends.backend_agg as plt_backend_agg
        canvas = plt_backend_agg.FigureCanvasAgg(figure)
        canvas.draw()
        data = np.frombuffer(canvas.buffer_rgba(), dtype=np.uint8)
        w, h = figure.canvas.get_width_height()
        image_hwc = data.reshape([h, w, 4])[:, :, 0:3]
        image_chw = np.moveaxis(image_hwc, source=2, destination=0)
        if close:
            plt.close(figure)
        self._summary.add_image(tag, image_chw, step)

    # Resource management
    def flush(self):
        "Forces the logged data to be flushed to disk."
        self._summary.flush()

    def close(self):
        "Closes the logger."
        self._summary.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
Example #9
class TensorboardXLogger():
    def __init__(self, logs_dir="", writer=None):
        '''
        Initialize a tensorboard logger.

        Note that this logger relies on tensorboardX and only provides the TensorBoard hparams log.
        An ImportError will be raised if tensorboardX is not installed.

        :param logs_dir: root directory for the log, defaults to the current working dir
        :param writer: shared tensorboardX SummaryWriter, defaults to None.
        '''
        self.logs_dir = logs_dir
        self._file_writer = None
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            print("pip install tensorboardx to see TensorBoard log files.")
            raise
        if writer:
            self._file_writer = writer
        else:
            self._file_writer = SummaryWriter(logdir=self.logs_dir)

    def run(self, config, metric):
        '''
        Write log files (event files).

        The log files are arranged as follows:
        self.logs_dir
        |--eventfile_all
        |--Trial_1
        |  |--eventfile_1
        |--Trial_2
        |  |--eventfile_2
        ...
        :param config: A dictionary. Keys are trial names,
            values are dictionaries describing the trial config.
        :param metric: A dictionary. Keys are trial names,
            values are dictionaries describing the trial metric results.

        Example:
        Config = {"run1":{"lr":0.001, "hidden_units": 32}, "run2":{"lr":0.01, "hidden_units": 64}}
        Metric = {"run1":{"acc":0.91, "time": 32.13}, "run2":{"acc":0.93, "time": 61.33}}

        Note that the keys of config and metric should be exactly the same.
        '''
        # keys check
        assert config.keys() == metric.keys(),\
            "The keys of config and metric should be exactly the same"

        # validation check
        new_metric = {}
        for key in metric.keys():
            new_metric[key] = {}
            for k, value in metric[key].items():
                if type(value) in VALID_SUMMARY_TYPES and not np.isnan(value):
                    new_metric[key][k] = value
        new_config = {}
        for key in config.keys():
            new_config[key] = {}
            for k, value in config[key].items():
                if value is not None:
                    new_config[key][k] = value

        # hparams log write
        for key in new_metric.keys():
            # new_config[key]["address"] = key
            self._file_writer.add_hparams(new_config[key], new_metric[key])

    def close(self):
        '''
        Close the logger
        '''
        self._file_writer.close()
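A minimal usage sketch based on the run() docstring above; the trial names and values are illustrative only:

config = {"run1": {"lr": 0.001, "hidden_units": 32}, "run2": {"lr": 0.01, "hidden_units": 64}}
metric = {"run1": {"acc": 0.91, "time": 32.13}, "run2": {"acc": 0.93, "time": 61.33}}
tb_logger = TensorboardXLogger(logs_dir="logs")
tb_logger.run(config, metric)
tb_logger.close()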
Example #10
    sum_log_prob = 0.
    sum_bpd = 0.
    sum_elbo_gap = 0.

    with torch.no_grad():
        for (x, _) in tqdm.tqdm(test_loader):
            try:
                all_metrics = metrics(density, x, num_elbo_samples)
                sum_log_prob += all_metrics["log-prob"].sum().item()
                sum_bpd += all_metrics["bpd"].sum().item()
                sum_elbo_gap += all_metrics["elbo-gap"].sum().item()
            except Exception as e:
                import ipdb; ipdb.set_trace()
                print("Error {0} for path {1}".format(e, path))

    points_in_test = test_loader.dataset.x.shape[0]
    metrics = {
        "bpd": sum_bpd / points_in_test,
        "log-prob": sum_log_prob / points_in_test,
        "elbo-gap": sum_elbo_gap / points_in_test,
        "epoch": checkpoint["epoch"],
        "num-params": num_params(density),
        "test-elbo-samples": num_elbo_samples
    }
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=4)

    metrics = {f"hparams/{k}": v for k, v in metrics.items()}

    writer = SummaryWriter(logdir=path)
    writer.add_hparams(hparam_dict=vals, metric_dict=metrics)
Example #11
class D3RLPyLogger:
    def __init__(self,
                 experiment_name,
                 root_dir='logs',
                 verbose=True,
                 tensorboard=True,
                 with_timestamp=True):
        self.verbose = verbose

        # add timestamp to prevent unintentional overwrites
        while True:
            if with_timestamp:
                date = datetime.now().strftime('%Y%m%d%H%M%S')
                self.experiment_name = experiment_name + '_' + date
            else:
                self.experiment_name = experiment_name

            self.logdir = os.path.join(root_dir, self.experiment_name)

            if not os.path.exists(self.logdir):
                os.makedirs(self.logdir)
                break
            else:
                if with_timestamp:
                    time.sleep(1.0)
                else:
                    raise ValueError('%s already exists.' % self.logdir)

        self.metrics_buffer = {}

        if tensorboard:
            from tensorboardX import SummaryWriter
            tfboard_path = os.path.join('runs', self.experiment_name)
            self.writer = SummaryWriter(logdir=tfboard_path)
        else:
            self.writer = None

        self.params = None

    def add_params(self, params):
        assert self.params is None, 'add_params can be called only once.'

        # save dictionary as json file
        with open(os.path.join(self.logdir, 'params.json'), 'w') as f:
            f.write(json.dumps(params, default=default_json_encoder))

        if self.verbose:
            for key, val in params.items():
                print('{}={}'.format(key, val))

        # remove non-scalar values for HParams
        self.params = {k: v for k, v in params.items() if np.isscalar(v)}

    def add_metric(self, name, value):
        if name not in self.metrics_buffer:
            self.metrics_buffer[name] = []
        self.metrics_buffer[name].append(value)

    def commit(self, epoch, step):
        metrics = {}
        for name, buffer in self.metrics_buffer.items():
            metric = sum(buffer) / len(buffer)

            with open(os.path.join(self.logdir, name + '.csv'), 'a') as f:
                print('%d,%d,%f' % (epoch, step, metric), file=f)

            if self.verbose:
                print('epoch=%d step=%d %s=%f' % (epoch, step, name, metric))

            if self.writer:
                self.writer.add_scalar('metrics/' + name, metric, epoch)

            metrics[name] = metric
            self.metrics_buffer[name] = []

        if self.params and self.writer:
            self.writer.add_hparams(self.params,
                                    metrics,
                                    name=self.experiment_name,
                                    global_step=epoch)

    def save_model(self, epoch, algo):
        # save entire model
        model_path = os.path.join(self.logdir, 'model_%d.pt' % epoch)
        algo.save_model(model_path)
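A minimal usage sketch for this logger; the experiment name and metric values are illustrative, and default_json_encoder is assumed to be defined in the surrounding module:

d3_logger = D3RLPyLogger('cartpole_dqn', root_dir='logs')
d3_logger.add_params({'learning_rate': 1e-3, 'batch_size': 32})
for epoch in range(3):
    for step in range(100):
        d3_logger.add_metric('loss', 0.5 / (step + 1))
    d3_logger.commit(epoch, (epoch + 1) * 100)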
Example #12
class BaseTrainer:
    """Base class for all trainers."""
    def __init__(self, model, loss, metrics, optimizer, lr_scheduler, config):
        self.config = config
        self.hparams = get_hparams_from_config(self.config)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler

        self.exp_dir = config.save_dir
        self.checkpoint_dir = config.save_dir
        self.perf_log_path = os.path.join(config.save_dir, 'perf_log.txt')
        self.info_checkpoint_path = os.path.join(config.save_dir,
                                                 'info_checkpoint.txt')
        self.monitoring_path = os.path.join(config.save_dir, 'monitoring.json')

        cfg_trainer = config['trainer']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_period']
        self.monitor = cfg_trainer.get('monitor', 'off')

        self.timer = AverageMeter()

        # configuration to monitor model performance and save the best checkpoint;
        # expected formats: 'off', 'given_epoch <N>', or '<min|max> <metric_name>'
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        elif self.monitor.startswith('given_epoch'):
            self.mnt_mode, self.given_epoch = self.monitor.split()
            assert self.mnt_mode in ['given_epoch']
            self.mnt_best = 0
            self.given_epoch = int(self.given_epoch)
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']

            self.mnt_best = inf if self.mnt_mode == 'min' else -inf

            self.early_stop = cfg_trainer.get('early_stop', inf)

        self.start_epoch = 0
        self.epoch = 0
        self.n_samples = 0
        self.n_steps = 0

        self.writer = SummaryWriter(config.log_dir)

        self.include_optim_in_ckpts = config['trainer'].get(
            'include_optim_in_ckpts', False)

        if config.resume is not None:
            self._resume_checkpoint(config.resume)

    @abc.abstractmethod
    def _train_epoch(self, epoch):
        """Training logic for an epoch."""
        raise NotImplementedError

    @abc.abstractmethod
    def _valid_epoch(self, epoch, sets):
        """Validation logic for an epoch."""
        raise NotImplementedError

    def train(self):
        """Full training logic."""
        not_improved_count = 0
        for epoch in range(self.start_epoch, self.epochs + 1):

            self.epoch = epoch
            epoch_start = time.time()

            logger.debug('Starting training epoch %s ...', str(epoch))
            train_start = time.time()
            result = self._train_epoch(epoch)
            for key, val in result.items():
                self.writer.add_scalar(f'{key}', val, epoch)
            self.timer.update('epoch.train', time.time() - train_start)

            logger.debug('Starting evaluating epoch %s ...', str(epoch))
            valid_start = time.time()
            val_log = self._valid_epoch(epoch, sets='continuous_eval')
            logger.debug('Updating val log with results ...')
            result.update(val_log)
            self.timer.update('epoch.valid', time.time() - valid_start)

            checkpoint_start = time.time()
            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                # Metrics recorded during the continuous eval
                if key == 'metrics':
                    for dataset_name, dataset_metrics in value.items():
                        for metric_type, metric_dict in dataset_metrics.items(
                        ):
                            for metric_name, metric_value in metric_dict.items(
                            ):
                                log[f'{dataset_name}/{metric_type}'
                                    f'/{metric_name}'] = metric_value
                else:
                    log[key] = value

            # eval model according to the configured metric, save the best ckpt as
            # trained_model.
            best = False
            if self.mnt_mode in ['min', 'max']:
                try:
                    # check whether specified metric improved or not, according to
                    # specified metric(mnt_metric)
                    lower = log[self.mnt_metric] <= self.mnt_best
                    higher = log[self.mnt_metric] >= self.mnt_best
                    improved = (self.mnt_mode == 'min' and lower) or \
                               (self.mnt_mode == 'max' and higher)
                except KeyError:
                    logger.warning(
                        'Warning: Metric %s not found, '
                        'perf monitoring is disabled.', self.mnt_metric)
                    self.mnt_mode = 'off'
                    improved = False
                    not_improved_count = 0

                if improved:
                    self.mnt_best = log[self.mnt_metric]
                    not_improved_count = 0
                    best = True
                else:
                    not_improved_count += 1

                if not_improved_count > self.early_stop:
                    logger.info(
                        'Val performance didn\'t improve for %s epochs. '
                        'Training stops.', self.early_stop)
                    break

            # If checkpointing is done intermittently, still save models that
            # outperform the best metric.
            save_best = best and self.mnt_metric != 'epoch'

            if self.mnt_mode in ['given_epoch'] and epoch == self.given_epoch:
                save_best = True

            # Due to the fast runtime/slow HDD combination, checkpointing can dominate
            # the total training time, so we optionally skip checkpoints for some of
            # the first epochs
            if epoch < self.skip_first_n_saves:
                msg = f'Skipping ckpt save at epoch {epoch} < {self.skip_first_n_saves}'
                logger.info(msg)
            elif epoch % self.save_period == 0 or save_best:
                self._save_checkpoint(epoch, save_best=best)

            if epoch > self.num_keep_ckpts:
                self.purge_stale_checkpoints()
            self.timer.update('epoch.checkpoint',
                              time.time() - checkpoint_start)

            self.timer.update('epoch.total', time.time() - epoch_start)
            for key, val in self.timer.dic.items():
                for metric in ['avg', 'sum']:
                    log[f'timer.{key}.{metric}'] = self.timer.dic[key][metric]
                self.writer.add_scalar(f'timer_epoch/{key}',
                                       self.timer.dic[key]['sum'], epoch)
            self.writer.add_text('exp_dir', str(self.exp_dir), epoch)
            self.timer.reset()

            log['mnt_best'] = self.mnt_best
            log['not_improved_count'] = not_improved_count
            self.writer.add_scalar('mnt_best', self.mnt_best, epoch)

            # print results
            for metric_name, metric_value in log.items():
                if '/cols' in metric_name:
                    continue
                if 'timer.' in metric_name:
                    logger.debug(' {:15s}: {}'.format(str(metric_name),
                                                      metric_value))
                else:
                    logger.info(' {:15s}: {}'.format(str(metric_name),
                                                     metric_value))

            # Save main results in the perf log
            log_light = {}
            for key, value in log.items():
                if not key.endswith('cols'):
                    log_light[key] = value
            update_perf_log(log_light, self.perf_log_path)

            # Log results to Tensorboard
            self.writer.add_hparams(self.hparams, {
                'hparam/accuracy': log[self.mnt_metric],
                'hparam/mnt_best': self.mnt_best,
                'hparam/epoch': epoch
            },
                                    name='hparams')

            # # Ray-tune recording
            # try:
            #   from ray.tune import track
            #   acc = log[self.mnt_metric]
            #   track.log(mean_accuracy=acc, exp_dir=self.exp_dir, **log_light)
            # except Exception as e:
            #   print(e)

    def evaluate(self):
        """Final evaluation."""
        sets = 'final_eval'
        ckpt_path = self.config.save_dir / 'trained_model.pth'

        if os.path.exists(ckpt_path):
            self._resume_checkpoint(ckpt_path)
        else:
            msg = (
                f'The checkpoint {ckpt_path} does not exist and cannot be loaded. '
                f'The model will not be resumed to that checkpoint.')
            logger.info(msg)

        final_result = self._valid_epoch(epoch=self.epoch, sets=sets)
        nested_metrics = final_result['metrics']

        log = {}
        for dataset_name, dataset_metrics in nested_metrics.items():
            log[dataset_name] = {}
            for metric_type, metric_dict in dataset_metrics.items():
                for metric_name, metric_value in metric_dict.items():
                    log[dataset_name][
                        f'{metric_type}/{metric_name}/{sets}'] = metric_value

        # Print results
        for dataset_name, metric_dict in log.items():
            logger.info('%s:', dataset_name)
            for metric_name, metric_value in metric_dict.items():
                if '/cols' in metric_name:
                    continue
                if 'timer.' in metric_name:
                    logger.debug(' {:15s}: {}'.format(str(metric_name),
                                                      metric_value))
                else:
                    logger.info(' {:15s}: {}'.format(str(metric_name),
                                                     metric_value))

        # Logging dataset perfs
        save_dir = self.config.save_dir
        results_on_datasets_log_path = os.path.join(save_dir,
                                                    'exp_results.json')
        if os.path.exists(results_on_datasets_log_path):
            with open(results_on_datasets_log_path) as json_file:
                res = json.load(json_file)
        else:
            res = collections.OrderedDict({})
        if 'perfs' not in res.keys():
            res['perfs'] = {}
        res['perfs'] = log
        res['checkpoint_epoch'] = self.loaded_epoch
        logger.info('Best epoch for the monitored metric: %s',
                    self.loaded_epoch)
        with open(results_on_datasets_log_path, 'w') as fp:
            json.dump(res, fp, indent=4)

        exp_completed_flag_path = os.path.join(save_dir,
                                               'exp_completed_flag.txt')
        # Touch the exp_completed_flag_path to mark that the experiment is completed
        with open(exp_completed_flag_path, 'a'):
            os.utime(exp_completed_flag_path, None)

    def test(self, sentence):
        """Final evaluation."""
        sets = 'test'
        ckpt_path = self.config.save_dir / 'trained_model.pth'

        if os.path.exists(ckpt_path):
            self._resume_checkpoint(ckpt_path)
        else:
            msg = (
                f'The checkpoint {ckpt_path} does not exist and cannot be loaded. '
                f'The model will not be resumed to that checkpoint.')
            logger.info(msg)

        self.reading_from = "mult_h5"
        self.cache_dir = os.path.join(os.path.dirname(self.config.demo_dir),
                                      "vid_feat_files", self.reading_from)
        vid_list_path = "train_list_jsfusion.txt"
        vid_list_path = os.path.join(self.config.demo_dir, vid_list_path)
        self.sentence = sentence

        with open(vid_list_path) as f:
            vid_list = f.readlines()
            for i in range(len(vid_list)):
                vid = vid_list[i]
                output_basename = f"{vid[0]}/{vid[1]}/{vid[2]}/{vid}"
                output_basename = output_basename[:-1] + '.h5'
                dataset_file_path = os.path.join(self.cache_dir,
                                                 output_basename)

                with h5py.File(dataset_file_path, "r+") as dataset_file:
                    nb_captions = len([
                        k for k in dataset_file.keys()
                        if k.startswith("raw_captions.")
                    ])

                    for j in range(nb_captions):
                        try:
                            del dataset_file[f"raw_captions.{j}"]
                        except KeyError:
                            print(f"raw_captions.{j} already deleted")
                        dt = h5py.special_dtype(vlen=str)
                        dataset_file.create_dataset(f"raw_captions.{j}",
                                                    data=self.sentence,
                                                    dtype=dt)

        final_result = self._valid_epoch(epoch=self.epoch, sets=sets)
        return final_result

    def purge_stale_checkpoints(self):
        """Remove checkpoints that are no longer needed.

        NOTE: This function assumes that the `best` checkpoint has already been
        renamed to a format that differs from `checkpoint-epoch<num>.pth`.
        """
        found_epoch_ckpts = list(
            self.checkpoint_dir.glob('checkpoint-epoch*.pth'))
        if len(found_epoch_ckpts) <= self.num_keep_ckpts:
            return

        # purge the oldest checkpoints
        regex = r'.*checkpoint-epoch(\d+)[.]pth$'
        epochs = [
            int(re.search(regex, str(x)).groups()[0])
            for x in found_epoch_ckpts
        ]
        sorted_ckpts = sorted(list(zip(epochs, found_epoch_ckpts)),
                              key=lambda x: -x[0])

        for epoch, stale_ckpt in sorted_ckpts[self.num_keep_ckpts:]:
            tic = time.time()
            stale_ckpt.unlink()
            msg = (f'removing stale ckpt [epoch {epoch}] '
                   f'[took {time.time() - tic:.2f}s]')
            logger.info(msg)

    def _prepare_device(self, n_gpu_use):
        """Setup GPU device if available, move model into configured device."""
        n_gpu = torch.cuda.device_count()
        msg = f'n_gpu = torch.cuda.device_count(): {n_gpu} (nb of gpus available)'
        logger.debug(msg)
        if n_gpu_use > 0 and n_gpu == 0:
            logger.warning(
                'Warning: There\'s no GPU available on this machine, '
                'training will be performed on CPU.')
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = ('Warning: The number of GPUs configured to use is {}'
                   ', but only {} are available '
                   'on this machine.'.format(n_gpu_use, n_gpu))
            logger.warning(msg)
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        logger.debug('device: %s', device)
        list_ids = list(range(n_gpu_use))
        logger.debug('list_ids: %s', list_ids)
        return device, list_ids

    def _save_checkpoint(self, epoch, save_best=False):
        """Saving checkpoints."""
        arch = type(self.model).__name__

        # To accommodate the DataParallel model that adds the prefix "module"
        # to the parameters
        try:
            state_dict = self.model.module.state_dict()
        except AttributeError:
            state_dict = self.model.state_dict()

        state = {
            'arch': arch,
            'epoch': epoch,
            'state_dict': state_dict,
            'monitor_best': self.mnt_best,
            'config': self.config,
            'n_samples': self.n_samples,
            'n_steps': self.n_steps,
        }
        if self.include_optim_in_ckpts:
            state['optimizer'] = self.optimizer.state_dict()
            state['lr_scheduler'] = self.lr_scheduler.state_dict()

        filename = str(self.checkpoint_dir /
                       'checkpoint-epoch{}.pth'.format(epoch))
        filename_tmp = filename + '_'
        tic = time.time()
        logger.info('Saving checkpoint: %s ...', filename)
        torch.save(state, filename_tmp)
        os.rename(filename_tmp, filename)
        msg = f'Done in {time.time() - tic:.3f}s'
        logger.info(msg)
        if save_best:
            logger.info('Updating \'best\' checkpoint: %s ...', filename)
            best_path = str(self.checkpoint_dir / 'trained_model.pth')
            best_path_tmp = best_path + '_'
            torch.save(state, best_path_tmp)
            os.rename(best_path_tmp, best_path)
            msg = f'Done in {time.time() - tic:.3f}s'
            logger.info(msg)

    def _resume_last_checkpoint(self):
        checkpoint_path = get_last_checkpoint_path(self.exp_dir)
        self._resume_checkpoint(checkpoint_path)

    def match_checkpoint_to_model(self, checkpoint, model):
        """Adapt the loaded checkpoint so that is fits the current architecture."""

        modules = ['vid_bert.embeddings.position_embeddings.weight']

        for module in modules:
            if module in model and checkpoint[module].shape != model[
                    module].shape:
                padding = model[module].shape[0] - checkpoint[module].shape[0]
                padding_shape = list(model[module].shape)
                padding_shape[0] = padding
                device = checkpoint[module].device
                checkpoint[module] = torch.cat([
                    checkpoint[module],
                    torch.zeros(padding_shape, device=device)
                ], 0)
                logger.warning(
                    'Size mismatch for module %s fixed by zero padding',
                    module)

    def _resume_checkpoint(self, resume_path):
        """Resume from saved checkpoints."""
        self.resume_path = str(resume_path)
        logger.info('Loading checkpoint from: %s ...', self.resume_path)
        checkpoint = torch.load(self.resume_path, map_location=self.device)
        self.loaded_epoch = checkpoint['epoch']
        self.epoch = checkpoint['epoch']
        self.start_epoch = checkpoint['epoch'] + 1
        self.n_samples = checkpoint['n_samples']
        self.n_steps = checkpoint['n_steps']

        exp_dir_src = os.path.dirname(self.resume_path)
        restart = exp_dir_src == str(self.exp_dir)

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            msg = (
                'Warning: Architecture configuration given in config file is '
                'different from that of checkpoint. This may yield an exception'
                ' while state_dict is being loaded.')
            logger.warning(msg)
            logger.warning('Created model conf: %s', self.config['arch'])
            logger.warning('Loaded model conf: %s',
                           checkpoint['config']['arch'])
        self.match_checkpoint_to_model(checkpoint['state_dict'],
                                       self.model.state_dict())
        self.model.load_state_dict(checkpoint['state_dict'], strict=restart)

        if restart:
            # load optimizer state from ckpt only when optimizer type is not changed.
            optim_args = checkpoint['config']['optimizer']
            if optim_args['type'] != self.config['optimizer']['type']:
                msg = (
                    'Warning: Optimizer type given in config file differs from that'
                    ' of checkpoint. Optimizer parameters not being resumed.')
                logger.warning(msg)
            else:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler_args = checkpoint['config']['lr_scheduler']
            if lr_scheduler_args['type'] != self.config['lr_scheduler']['type']:
                msg = (
                    'Warning: Lr_scheduler type given in config file differs from that'
                    ' of checkpoint. Lr_scheduler parameters not being resumed.'
                )
                logger.warning(msg)
            else:
                self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            self.mnt_best = checkpoint['monitor_best']
        else:
            self.loaded_epoch = 0
            self.epoch = 0
            self.start_epoch = 0
            self.n_samples = 0
            self.n_steps = 0

            # Log the path of the checkpoint that was loaded
            with open(self.info_checkpoint_path, 'a') as f:
                f.write(
                    f"This experiment is based on the checkpoint {self.resume_path}"
                    f"loaded at epoch {checkpoint['epoch']}")

        logger.info('Ckpt loaded at epoch %s.', str(checkpoint['epoch']))
Example #13
def train_model(args):
    logger.warn(
        "WARNING: TextAttack's model training feature is in beta. Please report any issues on our Github page, https://github.com/QData/TextAttack/issues."
    )
    start_time = time.time()
    make_directories(args.output_dir)

    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        wandb.init(sync_tensorboard=True)

    # Get list of text and list of label (integers) from disk.
    train_text, train_labels, eval_text, eval_labels = dataset_from_args(args)

    # Filter labels
    if args.allowed_labels:
        logger.info(
            f"Filtering samples with labels outside of {args.allowed_labels}.")
        final_train_text, final_train_labels = [], []
        for text, label in zip(train_text, train_labels):
            if label in args.allowed_labels:
                final_train_text.append(text)
                final_train_labels.append(label)
        logger.info(
            f"Filtered {len(train_text)} train samples to {len(final_train_text)} points."
        )
        train_text, train_labels = final_train_text, final_train_labels
        final_eval_text, final_eval_labels = [], []
        for text, label in zip(eval_text, eval_labels):
            if label in args.allowed_labels:
                final_eval_text.append(text)
                final_eval_labels.append(label)
        logger.info(
            f"Filtered {len(eval_text)} dev samples to {len(final_eval_text)} points."
        )
        eval_text, eval_labels = final_eval_text, final_eval_labels

    label_id_len = len(train_labels)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: ({sorted(label_set)})"
    )

    if isinstance(train_labels[0], float):
        # TODO come up with a more sophisticated scheme for when to do regression
        logger.warn(f"Detected float labels. Doing regression.")
        args.num_labels = 1
        args.do_regression = True
    else:
        args.do_regression = False

    train_examples_len = len(train_text)

    if len(train_labels) != train_examples_len:
        raise ValueError(
            f"Number of train examples ({train_examples_len}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        raise ValueError(
            f"Number of teste xamples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    model = model_from_args(args, args.num_labels)
    tokenizer = model.tokenizer

    logger.info(f"Tokenizing training data. (len: {train_examples_len})")
    train_text_ids = batch_encode(tokenizer, train_text)
    logger.info(f"Tokenizing eval data (len: {len(eval_labels)})")
    eval_text_ids = batch_encode(tokenizer, eval_text)
    load_time = time.time()
    logger.info(f"Loaded data and tokenized in {load_time-start_time}s")

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
    logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # apply weight decay to all parameters except biases and LayerNorm weights
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.01,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = transformers.optimization.AdamW(optimizer_grouped_parameters,
                                                lr=args.learning_rate)

    scheduler = transformers.optimization.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_proportion,
        num_training_steps=num_train_optimization_steps,
    )

    global_step = 0

    # Start Tensorboard and log hyperparams.
    from tensorboardX import SummaryWriter

    tb_writer = SummaryWriter(args.output_dir)

    def is_writable_type(obj):
        for ok_type in [bool, int, str, float]:
            if isinstance(obj, ok_type):
                return True
        return False

    args_dict = {k: v for k, v in vars(args).items() if is_writable_type(v)}

    tb_writer.add_hparams(args_dict, {})

    # Start training
    logger.info("***** Running training *****")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    train_input_ids = np.array(train_text_ids)
    train_labels = np.array(train_labels)
    train_data = list(
        (ids, label) for ids, label in zip(train_input_ids, train_labels))
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    eval_input_ids = np.array(eval_text_ids)
    eval_labels = np.array(eval_labels)
    eval_data = list(
        (ids, label) for ids, label in zip(eval_input_ids, eval_labels))
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    def get_eval_score():
        model.eval()
        correct = 0
        total = 0
        logits = []
        labels = []
        for input_ids, batch_labels in eval_dataloader:
            if isinstance(input_ids, dict):
                ## HACK: dataloader collates dict backwards. This is a temporary
                # workaround to get ids in the right shape
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
            batch_labels = batch_labels.to(device)

            with torch.no_grad():
                batch_logits = textattack.shared.utils.model_predict(
                    model, input_ids)

            logits.extend(batch_logits.cpu().squeeze().tolist())
            labels.extend(batch_labels)

        model.train()
        logits = torch.tensor(logits)
        labels = torch.tensor(labels)

        if args.do_regression:
            pearson_correlation, pearson_p_value = scipy.stats.pearsonr(
                logits, labels)
            return pearson_correlation
        else:
            preds = logits.argmax(dim=1)
            correct = (preds == labels).sum()
            return float(correct) / len(labels)

    def save_model():
        # Only save the model itself, not a DataParallel wrapper
        model_to_save = model.module if hasattr(model, "module") else model

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)

        torch.save(model_to_save.state_dict(), output_model_file)
        try:
            model_to_save.config.to_json_file(output_config_file)
        except AttributeError:
            # no config
            pass

    global_step = 0

    def save_model_checkpoint():
        # Save model checkpoint
        output_dir = os.path.join(args.output_dir,
                                  "checkpoint-{}".format(global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info(f"Checkpoint saved to {output_dir}.")

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        if num_gpus > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=False):
        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=1,
                             leave=False)
        for step, batch in enumerate(prog_bar):
            input_ids, labels = batch
            labels = labels.to(device)
            if isinstance(input_ids, dict):
                ## HACK: dataloader collates dict backwards. This is a temporary
                # workaround to get ids in the right shape
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
            logits = textattack.shared.utils.model_predict(model, input_ids)

            if args.do_regression:
                # TODO integrate with textattack `metrics` package
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits, labels)
            loss = loss_backward(loss)

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                tb_writer.add_scalar("lr",
                                     scheduler.get_last_lr()[0], global_step)
            prog_bar.set_description(f"Loss {loss.item()}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                save_model_checkpoint()

            model.zero_grad()

            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        eval_score = get_eval_score()
        tb_writer.add_scalar("epoch_eval_score", eval_score, global_step)

        if args.checkpoint_every_epoch:
            save_model_checkpoint()

        logger.info(
            f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
        )
        if eval_score > args.best_eval_score:
            args.best_eval_score = eval_score
            args.best_eval_score_epoch = epoch
            args.epochs_since_best_eval_score = 0
            save_model()
            logger.info(f"Best acc found. Saved model to {args.output_dir}.")
        else:
            args.epochs_since_best_eval_score += 1
            if (args.early_stopping_epochs > 0) and (
                    args.epochs_since_best_eval_score >
                    args.early_stopping_epochs):
                logger.info(
                    f"Stopping early since it's been {args.early_stopping_epochs} epochs since validation accuracy last increased"
                )
                break

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warn(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    # Save args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    final_args_dict = {
        k: v
        for k, v in vars(args).items() if is_writable_type(v)
    }
    with open(args_save_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(final_args_dict, indent=2) + "\n")
    logger.info(f"Wrote training args to {args_save_path}.")
              "translation max ratio": TRANSLATION_MAX_RATIO,
              "scale range": SCALE_RANGE,
              # training
              "num epochs": NUM_EPOCHS,
              "trainable stem": TRAINABLE_STEM,
              "train batch size": TRAIN_BATCH_SIZE,
              "batchnorm momentum": BATCHNORM_MOMENTUM,
              **OPT_INIT_PARAMS,
              "train HW": TRAIN_HW,
              "minival_gt_stddevs": MINIVAL_GT_STDDEVS,
              "val_gt_stddevs": VAL_GT_STDDEVS,
              "train_gt_stddevs": TRAIN_GT_STDDEVS,
              "distillation_alpha": DISTILLATION_ALPHA,
              **SCHEDULER_HYPERPARS}
HPARS_DICT = {str(k): str(v) for k, v in HPARS_DICT.items()}
tb_logger.add_hparams(HPARS_DICT, {})
txt_logger.info("HYPERPARAMETERS:\n{}".format(HPARS_DICT))


# INSTANTIATE OPTIMIZER
DET_POS_WEIGHT = 100  # 100 means black occurs roughly 100 times more often than white
det_loss_fn = DistillationBceLossKeypointMining(DET_POS_WEIGHT, DET_POS_WEIGHT, DEVICE)
# att_loss_fn = torch.nn.BCELoss(pos_weight=torch.ones(1)*7).to(DEVICE)  # This would be the natural loss to use, but BCELoss has no pos_weight argument and GPU memory is already tight, so we keep the logits version below for now even though a sigmoid is applied elsewhere.
att_loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=torch.ones(1)*7).to(DEVICE)
# If stem is not trainable it already has torch.no_grad so opt won't train it
params = (# list(student.mid_stem.parameters()) +
          list(student.att_lo.parameters()) +
          list(student.att_mid.parameters()) +
          list(student.att_hi.parameters()) +
          list(student.att_top.parameters()))
att_opt = get_sgd_optimizer(params, half_precision=HALF_PRECISION,
def train_classifier(layer, batch_size, n_epochs, bottleneck, data_str,
                     save_str):
    transform = transforms.Compose(
        [transforms.Grayscale(), transforms.ToTensor()])
    dataset = datasets.ImageFolder(data_str + '/' + str(layer),
                                   transform=transform)
    device = 'cuda'
    writer = SummaryWriter()
    validation_split = 0.1
    dataset_len = len(dataset)
    indices = list(range(dataset_len))
    data_save_root = save_str + '/' + str(layer) + "/"
    if not os.path.exists(data_save_root):
        os.makedirs(data_save_root)

    # Randomly splitting indices:
    val_len = int(np.floor(validation_split * dataset_len))
    validation_idx = np.random.choice(indices, size=val_len, replace=False)
    train_idx = list(set(indices) - set(validation_idx))

    ## Defining the samplers for each phase based on the random indices:
    train_sampler = SubsetRandomSampler(train_idx)
    validation_sampler = SubsetRandomSampler(validation_idx)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               sampler=train_sampler,
                                               batch_size=batch_size)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    sampler=validation_sampler,
                                                    batch_size=batch_size)
    data_loaders = {"train": train_loader, "valid": validation_loader}
    data_lengths = {"train": len(train_idx), "valid": val_len}

    classifier = FeatureClassifier(layer, bottleneck).to(device)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    lr = 0.0001
    optimizer = optim.Adam(classifier.parameters(), lr=lr)

    hparam_dict = {
        "Layer": layer,
        "batch size": batch_size,
        "Learning rate": lr
    }
    optimizer.zero_grad()
    writer.add_hparams(hparam_dict, {})
    total_it = 0
    for epoch in range(n_epochs):
        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                classifier.train(True)  # Set model to training mode
            else:
                classifier.train(False)  # Set model to evaluation mode
            running_loss = 0.0
            epoch_it = 0
            for image, label in data_loaders[phase]:
                classifier.zero_grad()
                optimizer.zero_grad()
                image = image.to(device)
                norm_image = (image - 0.5) * 2
                label = label.to(device)
                vec, x_prob = classifier(norm_image)
                loss = criterion(x_prob, label)
                loss = loss.to(device)
                running_loss += loss.detach()
                if phase == 'train':
                    print("layer: " + str(layer) + ", epoch: " + str(epoch) +
                          ", step: " + str(epoch_it).zfill(6) +
                          ", training loss: " + str(float(loss)))
                    writer.add_scalar('data/train_loss_continous', loss,
                                      total_it)
                    loss.backward()
                    optimizer.step()
                    total_it += 1  # optimizer = scheduler(optimizer, epoch)
                epoch_it += 1

            epoch_loss = running_loss / data_lengths[phase]
            if phase == 'train':
                print("Epoch: " + str(epoch).zfill(6) + ", train loss: " +
                      str(epoch_loss))
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
            if phase == 'valid':
                print("Epoch: " + str(epoch).zfill(6) + ", valid loss: " +
                      str(epoch_loss))
                writer.add_scalar('data/valid_loss_epoch', epoch_loss, epoch)

        # Save a periodic checkpoint every 10 epochs
        if epoch % 10 == 0:
            torch.save(
                classifier.state_dict(), data_save_root + '/classifier' +
                str(layer) + '_' + str(epoch) + '.pt')

    torch.save(classifier.state_dict(),
               data_save_root + '/classifier' + str(layer) + '_final.pt')
    writer.export_scalars_to_json(data_save_root + "all_scalars.json")
    writer.close()
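# Hypothetical invocation of train_classifier above; the layer index, bottleneck
# size and directory names are placeholders, not values from the original project.
if __name__ == "__main__":
    train_classifier(layer=3,
                     batch_size=64,
                     n_epochs=50,
                     bottleneck=128,
                     data_str="data/features",
                     save_str="checkpoints")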
Exemplo n.º 16
def train_model(args):
    logger.warn(
        "WARNING: TextAttack's model training feature is in beta. Please report any issues on our Github page, https://github.com/QData/TextAttack/issues."
    )
    _make_directories(args.output_dir)

    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        import wandb

        wandb.init(sync_tensorboard=True)

    # Get list of text and list of label (integers) from disk.
    train_text, train_labels, eval_text, eval_labels = dataset_from_args(args)

    # Filter labels
    if args.allowed_labels:
        train_text, train_labels = _filter_labels(train_text, train_labels,
                                                  args.allowed_labels)
        eval_text, eval_labels = _filter_labels(eval_text, eval_labels,
                                                args.allowed_labels)

    if args.pct_dataset < 1.0:
        logger.info(f"Using {args.pct_dataset*100}% of the training set")
        (train_text,
         train_labels), _ = _train_val_split(train_text,
                                             train_labels,
                                             split_val=1.0 - args.pct_dataset)
    train_examples_len = len(train_text)

    # data augmentation
    augmenter = augmenter_from_args(args)
    if augmenter:
        logger.info(f"Augmenting {len(train_text)} samples with {augmenter}")
        train_text, train_labels = _data_augmentation(train_text, train_labels,
                                                      augmenter)

    # label_id_len = len(train_labels)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: ({sorted(label_set)})"
    )

    if isinstance(train_labels[0], float):
        # TODO come up with a more sophisticated scheme for knowing when to do regression
        logger.warn("Detected float labels. Doing regression.")
        args.num_labels = 1
        args.do_regression = True
    else:
        args.do_regression = False

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        raise ValueError(
            f"Number of teste xamples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    tokenizer = model_wrapper.tokenizer

    attackCls = attack_from_args(args)
    # We are adversarial training if the user specified an attack along with
    # the training args.
    adversarial_training = attackCls is not None

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        model.tokenizer = model.module.tokenizer
        logger.info("Using torch.nn.DataParallel.")
    logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn":

        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)

        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from tensorboardX import SummaryWriter

    tb_writer = SummaryWriter(args.output_dir)

    # Save original args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v
         for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    if augmenter:
        logger.info(f"\tNum original examples = {train_examples_len}")
        logger.info(f"\tNum examples after augmentation = {len(train_text)}")
    else:
        logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        if num_gpus > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    if args.do_regression:
        # TODO integrate with textattack `metrics` package
        loss_fct = torch.nn.MSELoss()
    else:
        loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=False):
        if adversarial_training:
            if epoch >= args.num_clean_epochs:
                if (epoch - args.num_clean_epochs) % args.attack_period == 0:
                    # only generate a new adversarial training set every args.attack_period epochs
                    # after the clean epochs
                    logger.info(
                        "Attacking model to generate new training set...")
                    adv_train_text = _generate_adversarial_examples(
                        model, attackCls, list(zip(train_text, train_labels)))
                    train_dataloader = _make_dataloader(
                        tokenizer, adv_train_text, train_labels,
                        args.batch_size)
            else:
                logger.info(
                    f"Running clean epoch {epoch+1}/{args.num_clean_epochs}")

        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=1,
                             leave=False)
        for step, batch in enumerate(prog_bar):
            input_ids, labels = batch
            labels = labels.to(device)
            if isinstance(input_ids, dict):
                ## dataloader collates dict backwards. This is a workaround to get
                # ids in the right shape for HuggingFace models
                input_ids = {
                    k: torch.stack(v).T.to(device)
                    for k, v in input_ids.items()
                }
                logits = model(**input_ids)[0]
            else:
                input_ids = input_ids.to(device)
                logits = model(input_ids)

            if args.do_regression:
                # TODO integrate with textattack `metrics` package
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr",
                                         scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)

            # Inc step counter.
            global_step += 1

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        if not adversarial_training or epoch >= args.num_clean_epochs:
            eval_score = _get_eval_score(model, eval_dataloader,
                                         args.do_regression)
            tb_writer.add_scalar("epoch_eval_score", eval_score, global_step)

            if args.checkpoint_every_epoch:
                _save_model_checkpoint(model, args.output_dir, global_step)

            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(
                    f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs > 0) and (
                        args.epochs_since_best_eval_score >
                        args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} epochs since validation accuracy last increased"
                    )
                    break

    # read the saved model and report its eval performance
    logger.info(
        "Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warn(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    logger.info(f"Wrote final training args to {args_save_path}.")
Exemplo n.º 17
class CustomTensorBoardCallback(transformers.trainer_callback.TrainerCallback):
    """
    A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard
    <https://www.tensorflow.org/tensorboard>`__.

    Args:
        tb_writer (:obj:`SummaryWriter`, `optional`):
            The writer to use. Will instantiate one if not set.
    """
    def __init__(self, tb_writer=None):
        self.tb_writer = tb_writer

    def _init_summary_writer(self, args, log_dir=None):
        log_dir = log_dir or args.logging_dir
        self.tb_writer = SummaryWriter(log_dir=log_dir)

    def on_train_begin(self, args, state, control, **kwargs):
        if not state.is_world_process_zero:
            return

        log_dir = None

        if state.is_hyper_param_search:
            trial_name = state.trial_name
            if trial_name is not None:
                log_dir = os.path.join(args.logging_dir, trial_name)

        self._init_summary_writer(args, log_dir)

        if self.tb_writer is not None:
            self.tb_writer.add_text("args", args.to_json_string())
            if "model" in kwargs:
                model = kwargs["model"]
                if hasattr(model, "config") and model.config is not None:
                    model_config_json = model.config.to_json_string()
                    self.tb_writer.add_text("model_config", model_config_json)
            # Version of TensorBoard coming from tensorboardX does not have this method.
            if hasattr(self.tb_writer, "add_hparams"):
                self.tb_writer.add_hparams(args.to_sanitized_dict(),
                                           metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):

        logs = rewrite_logs(logs)
        logs.update(get_system_info())

        if state.is_world_process_zero:
            if self.tb_writer is None:
                self._init_summary_writer(args)

        if self.tb_writer:
            for k, v in logs.items():
                if isinstance(v, (int, float)):
                    self.tb_writer.add_scalar(k, v, state.global_step)
                else:
                    logger.warning(
                        "Trainer is attempting to log a value of "
                        '"%s" of type %s for key "%s" as a scalar. '
                        "This invocation of Tensorboard's writer.add_scalar() "
                        "is incorrect so we dropped this attribute.",
                        v,
                        type(v),
                        k,
                    )
            self.tb_writer.flush()

    def on_train_end(self, args, state, control, **kwargs):
        if self.tb_writer:
            self.tb_writer.close()
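# A minimal sketch of attaching the callback above to a transformers Trainer via
# its standard `callbacks` argument. `my_model`, `training_args` and `train_ds`
# are hypothetical objects assumed to be defined by the calling script.
trainer = transformers.Trainer(
    model=my_model,
    args=training_args,
    train_dataset=train_ds,
    callbacks=[CustomTensorBoardCallback()],
)
trainer.train()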
Exemplo n.º 18
class Logger(object):
    """Class that implements a logger of statistics.

    Parameters
    ----------
    name: str
        Name of logger. This create a folder at runs/`name'.
    comment: str, optional.
        This is useful to separate equivalent runs.
        The folder is runs/`name'/`comment_date'.
    tensorboard: bool, optional.
        Flag that indicates whether or not to save the results in the tensorboard.
    """

    def __init__(self, name, comment="", tensorboard=False):
        self.statistics = list()
        self.current = dict()
        self.all = defaultdict(list)

        now = datetime.now()
        current_time = now.strftime("%b%d_%H-%M-%S")
        comment = comment + "_" + current_time if len(comment) else current_time
        log_dir = f"runs/{name}/{comment}"
        if tensorboard:
            self.writer = SummaryWriter(log_dir=log_dir)
            self.log_dir = self.writer.logdir
        else:
            self.writer = None
            self.log_dir = safe_make_dir(log_dir)
        self.episode = 0
        self.keys = set()

    def __len__(self):
        """Return the number of episodes."""
        return len(self.statistics)

    def __iter__(self):
        """Iterate over the episode statistics."""
        return iter(self.statistics)

    def __getitem__(self, index):
        """Return a specific episode."""
        return self.statistics[index]

    def __str__(self):
        """Return parameter string of logger."""
        str_ = ""
        for key in sorted(self.keys):
            values = self.get(key)
            str_ += " ".join(key.split("_")).title().ljust(17)
            str_ += f"Last: {values[-1]:.2g}".ljust(15)
            str_ += f"Avg: {np.mean(values):.2g}".ljust(15)
            str_ += f"MAvg: {np.mean(values[-10:]):.2g}".ljust(15)
            str_ += f"Range: ({np.min(values):.2g},{np.max(values):.2g})\n"

        return str_

    def get(self, key):
        """Return the statistics of a specific key.

        It collects all end-of-episode data stored in statistic and returns a list with
        such values.
        """
        return [statistic[key] for statistic in self.statistics if key in statistic]

    def update(self, **kwargs):
        """Update the statistics for the current episode.

        Parameters
        ----------
        kwargs: dict
            Any kwargs passed to update is converted to numpy and averaged
            over the course of an episode.
        """
        for key, value in kwargs.items():
            self.keys.add(key)
            if isinstance(value, torch.Tensor):
                value = value.detach().numpy()
            value = np.nan_to_num(value)
            if isinstance(value, np.ndarray):
                value = float(np.mean(value))
            if isinstance(value, np.float32):
                value = float(value)
            if isinstance(value, np.int64):
                value = int(value)

            if key not in self.current:
                self.current[key] = (1, value)
            else:
                count, old_value = self.current[key]
                new_count = count + 1
                new_value = old_value + (value - old_value) * (1 / new_count)
                self.current[key] = (new_count, new_value)

            self.all[key].append(value)

            if self.writer is not None:
                self.writer.add_scalar(
                    f"episode_{self.episode}/{key}",
                    self.current[key][1],
                    global_step=self.current[key][0],
                )

    def end_episode(self, **kwargs):
        """Finalize collected data and add final fixed values.

        Parameters
        ----------
        kwargs : dict
            Any kwargs passed to end_episode overwrites tracked data if present.
            This can be used to store fixed values that are tracked per episode
            and do not need to be averaged.
        """
        data = {key: value[1] for key, value in self.current.items()}
        kwargs = {key: value for key, value in kwargs.items()}
        data.update(kwargs)

        for key, value in data.items():
            self.keys.add(key)
            if isinstance(value, float) or isinstance(value, int):
                self.all[key].append(value)
                if self.writer is not None:
                    self.writer.add_scalar(
                        f"average/{key}", value, global_step=self.episode
                    )

        self.statistics.append(data)
        self.current = dict()
        self.episode += 1

    def save_hparams(self, hparams):
        """Save hparams to a json file."""
        with open(f"{self.log_dir}/hparams.json", "w") as f:
            json.dump(hparams, f)

    def export_to_json(self):
        """Save the statistics to a json file."""
        with open(f"{self.log_dir}/statistics.json", "w") as f:
            json.dump(self.statistics, f)
        with open(f"{self.log_dir}/all.json", "w") as f:
            json.dump(self.all, f)

    def load_from_json(self, log_dir=None):
        """Load the statistics from a json file."""
        log_dir = log_dir if log_dir is not None else self.log_dir

        with open(f"{log_dir}/statistics.json", "r") as f:
            self.statistics = json.load(f)
        with open(f"{log_dir}/all.json", "r") as f:
            self.all = json.load(f)
        for key in self.all.keys():
            self.keys.add(key)

    def log_hparams(self, hparams, metrics=None):
        """Log hyper parameters together with a metric dictionary."""
        if self.writer is None:  # Do not save.
            return
        for k, v in hparams.items():
            if v is None:
                hparams[k] = 0
        self.writer.add_hparams(
            hparam_dict=hparams, metric_dict=metrics, name="hparams", global_step=1
        )

    def delete_directory(self):
        """Delete writer directory.

        Notes
        -----
        Use with caution. This will erase the directory, not the object.
        """
        shutil.rmtree(self.log_dir)

    def change_log_dir(self, new_log_dir):
        """Change log directory."""
        log_dir = f"runs/{new_log_dir}"
        try:
            self.delete_directory()
        except FileNotFoundError:
            pass
        if self.writer is not None:
            self.writer = SummaryWriter(log_dir=log_dir)
            self.log_dir = self.writer.logdir
        else:
            self.writer = None
            self.log_dir = safe_make_dir(log_dir)

        try:
            self.load_from_json()  # If json files in log_dir, then load them.
        except FileNotFoundError:
            pass
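# A short usage sketch for the Logger above, assuming a hypothetical training loop;
# the reward values are made up for illustration only.
logger = Logger(name="cartpole", comment="baseline", tensorboard=True)
for _ in range(3):
    for step in range(10):
        logger.update(reward=1.0, td_error=0.1 * step)  # averaged within the episode
    logger.end_episode(total_steps=10)                  # fixed value stored per episode
print(logger)                                           # per-key summary string
logger.log_hparams({"lr": 1e-3, "gamma": 0.99}, metrics={"return": 10.0})
logger.export_to_json()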
Exemplo n.º 19
class Logger(BaseLogger):
    """Logger class that writes to tensorboardX."""
    def __init__(self, config: ConfigType):
        """Initialise the tensorboardX Logger.

        Args:
            config (ConfigType): config to initialise the tensorboardX
                logger. The config can have any parameters that
                tensorboardX.SummaryWriter() method accepts
                (https://tensorboardx.readthedocs.io/en/latest/tensorboard.html#tensorboardX.SummaryWriter).
                Note that the config is passed as keyword arguments to the
                tensorboardX.SummaryWriter() method. This provides a lot
                of flexibility to the users to configure tensorboard. This also
                means that config should not have any parameters that
                tensorboardX.SummaryWriter() would not accept.
        """
        super().__init__(config=config)
        key = "logdir"
        if key in config and config[key] is not None:
            make_dir(config[key])
        self.summary_writer = SummaryWriter(**config)
        self.keys_to_skip = ["logbook_id", "logbook_type", "logbook_timestamp"]

    def write(self, log: LogType) -> None:
        """Write the log to tensorboard.

        Args:
            log (LogType): Log to write
        """
        logbook_type = log["logbook_type"]
        if logbook_type == "metric":
            log = self._prepare_metric_log_to_write(log=log)
            self.write_metric(metric=log)
        else:
            if logbook_type == "config":
                self.write_config(config=log)
            # Only metric logs and message logs are supported right now

    def write_metric(self, metric: MetricType) -> None:
        """Write metric to tensorboard.

        Args:
            metric (MetricType): Metric to write
        """
        global_step = None
        if "global_step" in metric:
            global_step = metric.pop("global_step")
        walltime = None
        if "walltime" in metric:
            walltime = metric.pop("walltime")

        main_tag = ""
        if "tag" in metric:
            main_tag = str(metric.pop("tag")) + "/"
        elif "main_tag" in metric:
            main_tag = str(metric.pop("main_tag")) + "/"

        if self.key_prefix:
            prefix = metric.pop(self.key_prefix)
            metric = {
                f"{prefix}_{key}": value
                for key, value in metric.items()
            }

        for key, value in metric.items():
            self.summary_writer.add_scalar(
                tag=f"{main_tag}{key}",
                scalar_value=value,
                global_step=global_step,
                walltime=walltime,
            )

    def write_config(self, config: ConfigType) -> None:
        """Write the config to tensorboard.

        Args:
            config (ConfigType): Config to write
        """
        name = None
        if "name" in config:
            name = config.pop("name")

        metric_dict: Dict[str, NumType] = {}
        if "metric_dict" in config:
            metric_dict = config.pop("metric_dict")
            metric_dict = self._prepare_metric_log_to_write(log=metric_dict)

        global_step = None
        if "global_step" in config:
            global_step = config.pop("global_step")

        config = self._prepare_log_to_write(log=config)

        for key in config:
            if config[key] is None:
                config[key] = "None"

        self.summary_writer.add_hparams(
            hparam_dict=flatten_dict(config),
            metric_dict=metric_dict,
            name=name,
            global_step=global_step,
        )
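# A hedged sketch of how a metric log might flow through Logger.write above. It
# assumes the surrounding package (BaseLogger, make_dir, flatten_dict, the type
# aliases) is importable and that the config points at a valid log directory.
logger = Logger(config={"logdir": "runs/exp1"})
logger.write({
    "logbook_type": "metric",
    "logbook_id": "0",
    "logbook_timestamp": "2021-01-01T00:00:00",
    "tag": "train",
    "global_step": 10,
    "loss": 0.42,
    "accuracy": 0.87,
})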
Exemplo n.º 20
        # validation_logger = SummaryWriter(log_dir = os.path.join(args.save, 'validation'), comment = 'validation')

        log_subpath = f'log/'
        unified_logger = SummaryWriter(
            log_dir=os.path.join(args.save, log_subpath),
            comment=f'{args.model}_{args.optimizer}_{args.loss}')
        hyper_param_dict = {
            'lr': args.optimizer_lr,
            'bsize': args.batch_size,
            'epochs': args.total_epochs,
            'sched_frac': args.schedule_lr_fraction,
            'sched_freq': args.schedule_lr_frequency,
            # 'eps': args.optimizer_eps
        }
        metric_param_dict = {}
        unified_logger.add_hparams(hyper_param_dict, metric_param_dict)
        # unified_logger.add_text('train/summary', args.model)
        # unified_logger.add_text('train/summary', args.optimizer)
        # unified_logger.add_text('train/summary', args.loss)
    # Dynamically load the optimizer with parameters passed in via "--optimizer_[param]=[value]" arguments
    with tools.TimerBlock("Initializing {} Optimizer".format(
            args.optimizer)) as block:
        kwargs = tools.kwargs_from_args(args, 'optimizer')
        if args.fp16:
            optimizer = args.optimizer_class(
                [p for p in param_copy if p.requires_grad], **kwargs)
        else:
            optimizer = args.optimizer_class(
                [p for p in model_and_loss.parameters() if p.requires_grad],
                **kwargs)
        for param, default in list(kwargs.items()):
 def model_train_dev(self,
                     train_dataset: Dataset,
                     dev_dataset: Dataset,
                     model_dir: str,
                     epoch_dev_eval: bool = False,
                     **kwargs):
     log_dir = os.path.join(model_dir, 'log')
     if self.cur_epoch == 0: shutil.rmtree(log_dir, ignore_errors=True)
     writer = SummaryWriter(log_dir, flush_secs=60)
     num_workers = kwargs.get('num_workers', 8)
     dt_train_src, dt_train_tgt = train_dataset
     dl_train_src = DataLoader(dt_train_src,
                               batch_size=self.batch_size,
                               shuffle=True,
                               num_workers=num_workers)
     dl_train_tgt = DataLoader(dt_train_tgt,
                               batch_size=self.batch_size,
                               shuffle=True,
                               num_workers=num_workers)
     dev_eval_dl = DataLoader(dev_dataset,
                              batch_size=self.batch_size,
                              shuffle=False,
                              num_workers=4)
     self.core_model.train()
     for epoch in range(self.cur_epoch + 1,
                        self.epoch_s1 + self.epoch_s2 + 1):
         epoch_start_time = time.time()
         self.cur_epoch = epoch
         running_losses = []
         if epoch <= self.epoch_s1:
             for batch_data in dl_train_src:
                 losses = self.core_model.batch_fit(batch=batch_data,
                                                    epoch=epoch)
                 running_losses.append(losses)
         else:
             for batch_data in dl_train_tgt:
                 losses = self.core_model.batch_fit(batch=batch_data,
                                                    epoch=epoch)
                 running_losses.append(losses)
         show_loss = pd.DataFrame(running_losses).mean().to_dict()
         print(f"[epoch: {epoch:03d}/{self.epochs}, %s" % (', '.join(
             [f'{loss}: {value:.5f}'
              for loss, value in show_loss.items()])))
         for loss_name, loss in show_loss.items():
             writer.add_scalar(tag=f'loss_train/{loss_name}',
                               scalar_value=loss,
                               global_step=epoch)
         running_losses = pd.DataFrame(running_losses).mean().to_dict()
         if epoch_dev_eval and self.cur_epoch % self.eval_epoch_freq == 0:
             results_dev = []
             self.core_model.eval()
             with torch.no_grad():
                 for batch_data in dev_eval_dl:
                     batch_result = self.core_model.batch_predict(
                         batch_data)
                     results_dev.append(batch_result)
             self.core_model.train()
             results_dev = pd.DataFrame(results_dev).to_dict(orient='list')
             results_dev = {
                 k: np.concatenate(v)
                 for k, v in results_dev.items()
             }
             metrics_dev = yield results_dev
             print(
                 f"\t dev performance of epoch {epoch}: {[f'{k}:{v:.3f}' for k, v in metrics_dev.items()]}"
             )
             writer.add_hparams(hparam_dict={'set': 'val'},
                                metric_dict={
                                    'metric/' + k: v
                                    for k, v in metrics_dev.items()
                                },
                                name='metric_val',
                                global_step=epoch)
         else:
             metrics_dev = {}
         self.core_model.after_one_epoch(writer=writer,
                                         epoch=epoch,
                                         losses=running_losses,
                                         metrics=metrics_dev)
         if self.cur_epoch % self.save_epoch_freq == 0:
             self.save_model(model_dir=model_dir, epoch=self.cur_epoch)
         print('\t Epoch: %03d Time Taken: %d sec' %
               (self.cur_epoch, time.time() - epoch_start_time))
     writer.close()
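# Because model_train_dev yields dev predictions and expects metrics to be sent
# back (``metrics_dev = yield results_dev``), the caller drives it as a generator.
# A hedged sketch, assuming `trainer` is the enclosing object and that a
# compute_metrics(results) -> dict helper exists in the calling code:
gen = trainer.model_train_dev(train_dataset=(dt_src, dt_tgt),
                              dev_dataset=dt_dev,
                              model_dir="ckpt/exp1",
                              epoch_dev_eval=True)
try:
    results = next(gen)              # run until the first dev evaluation
    while True:
        metrics = compute_metrics(results)
        results = gen.send(metrics)  # hand metrics back and keep training
except StopIteration:
    pass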
Exemplo n.º 22
def train_model(
    name="",
    resume="",
    base_dir=utils.BASE_DIR,
    model_name="v0",
    chosen_diseases=None,
    n_epochs=10,
    batch_size=4,
    oversample=False,
    max_os=None,
    shuffle=False,
    opt="sgd",
    opt_params={},
    loss_name="wbce",
    loss_params={},
    train_resnet=False,
    log_metrics=None,
    flush_secs=120,
    train_max_images=None,
    val_max_images=None,
    test_max_images=None,
    experiment_mode="debug",
    save=True,
    save_cms=True,  # Note that in this case, save_cms (to disk) includes write_cms (to TB)
    write_graph=False,
    write_emb=False,
    write_emb_img=False,
    write_img=False,
    image_format="RGB",
    multiple_gpu=False,
):

    # Choose GPU
    device = utilsT.get_torch_device()
    print("Using device: ", device)

    # Common folders
    dataset_dir = os.path.join(base_dir, "dataset")

    # Dataset handling
    print("Loading train dataset...")
    train_dataset, train_dataloader = utilsT.prepare_data(
        dataset_dir,
        "train",
        chosen_diseases,
        batch_size,
        oversample=oversample,
        max_os=max_os,
        shuffle=shuffle,
        max_images=train_max_images,
        image_format=image_format,
    )
    train_samples, _ = train_dataset.size()

    print("Loading val dataset...")
    val_dataset, val_dataloader = utilsT.prepare_data(
        dataset_dir,
        "val",
        chosen_diseases,
        batch_size,
        max_images=val_max_images,
        image_format=image_format,
    )
    val_samples, _ = val_dataset.size()

    # Should be the same as chosen_diseases
    chosen_diseases = list(train_dataset.classes)
    print("Chosen diseases: ", chosen_diseases)

    if resume:
        # Load model and optimizer
        model, model_name, optimizer, opt, loss_name, loss_params, chosen_diseases = models.load_model(
            base_dir, resume, experiment_mode="", device=device)
        model.train(True)
    else:
        # Create model
        model = models.init_empty_model(model_name,
                                        chosen_diseases,
                                        train_resnet=train_resnet).to(device)

        # Create optimizer
        OptClass = optimizers.get_optimizer_class(opt)
        optimizer = OptClass(model.parameters(), **opt_params)
        # print("OPT: ", opt_params)

    # Allow multiple GPUs
    if multiple_gpu:
        model = DataParallel(model)

    # Tensorboard log options
    run_name = utils.get_timestamp()
    if name:
        run_name += "_{}".format(name)

    if len(chosen_diseases) == 1:
        run_name += "_{}".format(chosen_diseases[0])
    elif len(chosen_diseases) == 14:
        run_name += "_all"

    log_dir = get_log_dir(base_dir, run_name, experiment_mode=experiment_mode)

    print("Run name: ", run_name)
    print("Saved TB in: ", log_dir)

    writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    # Create validator engine
    validator = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           False))

    val_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    val_loss.attach(validator, loss_name)

    utilsT.attach_metrics(validator, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(validator, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(validator, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(validator, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    # Create trainer engine
    trainer = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           True))

    train_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    train_loss.attach(trainer, loss_name)

    utilsT.attach_metrics(trainer, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 step=Events.EPOCH_COMPLETED)

    # TODO: Early stopping
    #     def score_function(engine):
    #         val_loss = engine.state.metrics[loss_name]
    #         return -val_loss

    #     handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    #     validator.add_event_handler(Events.COMPLETED, handler)

    # Metrics callbacks
    if log_metrics is None:
        log_metrics = list(ALL_METRICS)

    def _write_metrics(run_type, metrics, epoch, wall_time):
        loss = metrics.get(loss_name, 0)

        writer.add_scalar("Loss/" + run_type, loss, epoch, wall_time)

        for metric_base_name in log_metrics:
            for disease in chosen_diseases:
                metric_value = metrics.get(
                    "{}_{}".format(metric_base_name, disease), -1)
                writer.add_scalar(
                    "{}_{}/{}".format(metric_base_name, disease, run_type),
                    metric_value, epoch, wall_time)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_write_metrics(trainer):
        epoch = trainer.state.epoch
        max_epochs = trainer.state.max_epochs

        # Run on evaluation
        validator.run(val_dataloader, 1)

        # Common time
        wall_time = time.time()

        # Log all metrics to TB
        _write_metrics("train", trainer.state.metrics, epoch, wall_time)
        _write_metrics("val", validator.state.metrics, epoch, wall_time)

        train_loss = trainer.state.metrics.get(loss_name, 0)
        val_loss = validator.state.metrics.get(loss_name, 0)

        tb_write_histogram(writer, model, epoch, wall_time)

        print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})".
              format(epoch, max_epochs, train_loss, val_loss,
                     utils.duration_to_str(int(timer._elapsed()))))

    # Hparam dict
    hparam_dict = {
        "resume": resume,
        "n_diseases": len(chosen_diseases),
        "diseases": ",".join(chosen_diseases),
        "n_epochs": n_epochs,
        "batch_size": batch_size,
        "shuffle": shuffle,
        "model_name": model_name,
        "opt": opt,
        "loss": loss_name,
        "samples (train, val)": "{},{}".format(train_samples, val_samples),
        "train_resnet": train_resnet,
        "multiple_gpu": multiple_gpu,
    }

    def copy_params(params_dict, base_name):
        for name, value in params_dict.items():
            hparam_dict["{}_{}".format(base_name, name)] = value

    copy_params(loss_params, "loss")
    copy_params(opt_params, "opt")
    print("HPARAM: ", hparam_dict)

    # Train
    print("-" * 50)
    print("Training...")
    trainer.run(train_dataloader, n_epochs)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Average time per epoch: ", duration_per_epoch)
    print("-" * 50)

    ## Write all hparams
    hparam_dict["duration_per_epoch"] = duration_per_epoch

    # FIXME: this is commented to avoid having too many hparams in TB frontend
    # metrics
    #     def copy_metrics(engine, engine_name):
    #         for metric_name, metric_value in engine.state.metrics.items():
    #             hparam_dict["{}_{}".format(engine_name, metric_name)] = metric_value
    #     copy_metrics(trainer, "train")
    #     copy_metrics(validator, "val")

    print("Writing TB hparams")
    writer.add_hparams(hparam_dict, {})

    # Save model to disk
    if save:
        print("Saving model...")
        models.save_model(base_dir, run_name, model_name, experiment_mode,
                          hparam_dict, trainer, model, optimizer)

    # Write graph to TB
    if write_graph:
        print("Writing TB graph...")
        tb_write_graph(writer, model, train_dataloader, device)

    # Write embeddings to TB
    if write_emb:
        print("Writing TB embeddings...")
        image_size = 256 if write_emb_img else 0

        # FIXME: be able to select images (balanced, train vs val, etc)
        image_list = list(train_dataset.label_index["FileName"])[:1000]
        # disease = chosen_diseases[0]
        # positive = train_dataset.label_index[train_dataset.label_index[disease] == 1]
        # negative = train_dataset.label_index[train_dataset.label_index[disease] == 0]
        # positive_images = list(positive["FileName"])[:25]
        # negative_images = list(negative["FileName"])[:25]
        # image_list = positive_images + negative_images

        all_images, all_embeddings, all_predictions, all_ground_truths = gen_embeddings(
            model,
            train_dataset,
            device,
            image_list=image_list,
            image_size=image_size)
        tb_write_embeddings(
            writer,
            chosen_diseases,
            all_images,
            all_embeddings,
            all_predictions,
            all_ground_truths,
            global_step=n_epochs,
            use_images=write_emb_img,
            tag="1000_{}".format("img" if write_emb_img else "no_img"),
        )

    # Save confusion matrices (is expensive to calculate them afterwards)
    if save_cms:
        print("Saving confusion matrices...")
        # Assure folder
        cms_dir = os.path.join(base_dir, "cms", experiment_mode)
        os.makedirs(cms_dir, exist_ok=True)
        base_fname = os.path.join(cms_dir, run_name)

        n_diseases = len(chosen_diseases)

        def extract_cms(metrics):
            """Extract confusion matrices from a metrics dict."""
            cms = []
            for disease in chosen_diseases:
                key = "cm_" + disease
                if key not in metrics:
                    cm = np.array([[-1, -1], [-1, -1]])
                else:
                    cm = metrics[key].numpy()

                cms.append(cm)
            return np.array(cms)

        # Train confusion matrix
        train_cms = extract_cms(trainer.state.metrics)
        np.save(base_fname + "_train", train_cms)
        tb_write_cms(writer, "train", chosen_diseases, train_cms)

        # Validation confusion matrix
        val_cms = extract_cms(validator.state.metrics)
        np.save(base_fname + "_val", val_cms)
        tb_write_cms(writer, "val", chosen_diseases, val_cms)

        # All confusion matrix (train + val)
        all_cms = train_cms + val_cms
        np.save(base_fname + "_all", all_cms)

        # Print to console
        if len(chosen_diseases) == 1:
            print("Train CM: ")
            print(train_cms[0])
            print("Val CM: ")
            print(val_cms[0])


#             print("Train CM 2: ")
#             print(trainer.state.metrics["cm_" + chosen_diseases[0]])
#             print("Val CM 2: ")
#             print(validator.state.metrics["cm_" + chosen_diseases[0]])

    if write_img:
        # NOTE: this option is not recommended, use Testing notebook to plot and analyze images

        print("Writing images to TB...")

        test_dataset, test_dataloader = utilsT.prepare_data(
            dataset_dir,
            "test",
            chosen_diseases,
            batch_size,
            max_images=test_max_images,
        )

        # TODO: add a way to select images?
        # image_list = list(test_dataset.label_index["FileName"])[:3]

        # Examples in test_dataset (with bboxes available):
        image_list = [
            # "00010277_000.png", # (Effusion, Infiltrate, Mass, Pneumonia)
            # "00018427_004.png", # (Atelectasis, Effusion, Mass)
            # "00021703_001.png", # (Atelectasis, Effusion, Infiltrate)
            # "00028640_008.png", # (Effusion, Infiltrate)
            # "00019124_104.png", # (Pneumothorax)
            # "00019124_090.png", # (Nodule)
            # "00020318_007.png", # (Pneumothorax)
            "00000003_000.png",  # (0)
            # "00000003_001.png", # (0)
            # "00000003_002.png", # (0)
            "00000732_005.png",  # (Cardiomegaly, Pneumothorax)
            # "00012261_001.png", # (Cardiomegaly, Pneumonia)
            # "00013249_033.png", # (Cardiomegaly, Pneumonia)
            # "00029808_003.png", # (Cardiomegaly, Pneumonia)
            # "00022215_012.png", # (Cardiomegaly, Pneumonia)
            # "00011402_007.png", # (Cardiomegaly, Pneumonia)
            # "00019018_007.png", # (Cardiomegaly, Infiltrate)
            # "00021009_001.png", # (Cardiomegaly, Infiltrate)
            # "00013670_151.png", # (Cardiomegaly, Infiltrate)
            # "00005066_030.png", # (Cardiomegaly, Infiltrate, Effusion)
            "00012288_000.png",  # (Cardiomegaly)
            "00008399_007.png",  # (Cardiomegaly)
            "00005532_000.png",  # (Cardiomegaly)
            "00005532_014.png",  # (Cardiomegaly)
            "00005532_016.png",  # (Cardiomegaly)
            "00005827_000.png",  # (Cardiomegaly)
            # "00006912_007.png", # (Cardiomegaly)
            # "00007037_000.png", # (Cardiomegaly)
            # "00007043_000.png", # (Cardiomegaly)
            # "00012741_004.png", # (Cardiomegaly)
            # "00007551_020.png", # (Cardiomegaly)
            # "00007735_040.png", # (Cardiomegaly)
            # "00008339_010.png", # (Cardiomegaly)
            # "00008365_000.png", # (Cardiomegaly)
            # "00012686_003.png", # (Cardiomegaly)
        ]

        tb_write_images(writer, model, test_dataset, chosen_diseases, n_epochs,
                        device, image_list)

    # Close TB writer
    if experiment_mode != "debug":
        writer.close()

    # Run post_train
    print("-" * 50)
    print("Running post_train...")

    print("Loading test dataset...")
    test_dataset, test_dataloader = utilsT.prepare_data(
        dataset_dir,
        "test",
        chosen_diseases,
        batch_size,
        max_images=test_max_images)

    save_cms_with_names(run_name, experiment_mode, model, test_dataset,
                        test_dataloader, chosen_diseases)

    evaluate_model(run_name,
                   model,
                   optimizer,
                   device,
                   loss_name,
                   loss_params,
                   chosen_diseases,
                   test_dataloader,
                   experiment_mode=experiment_mode,
                   base_dir=base_dir)

    # Return values for debugging
    model_run = ModelRun(model, run_name, model_name, chosen_diseases)
    if experiment_mode == "debug":
        model_run.save_debug_data(writer, trainer, validator, train_dataset,
                                  train_dataloader, val_dataset,
                                  val_dataloader)

    return model_run
Exemplo n.º 23
# ----------------------------------------Written by Luc Hayward------------------------------------------------------ #
# Load the best saved model.
model_load(args.save)
print('Loaded best saved model')

# Added a final evaluation on the validation and test sets using the best saved model (regardless of any
# over-fitting on the training data, the last best validation model is reloaded).
# Logs the validation score to the hparams TensorBoard log to allow easy comparison of the different
# parameter-tuning experiments. Test values are deliberately not logged, to prevent tuning on test results.

if args.log_hparams_only:
    stored_loss = evaluate(val_data, eval_batch_size)
writer.add_hparams(
    args.__dict__, {
        'hparam/val_loss':
        stored_loss,
        'hparam/val_bpc':
        stored_loss / math.log(2) /
        corpus.dictionary.avg_characters_per_token.get('valid')
    })

print("Evaluating on test data...")
# Run on test data.
test_loss = evaluate(test_data, test_batch_size)
print('=' * 89)

print(
    '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
    .format(
        test_loss, math.exp(test_loss), test_loss / math.log(2) /
        corpus.dictionary.avg_characters_per_token.get('test')))
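
# --- Sketch (not part of the original script): the bits-per-character conversion used above,
# --- factored into a small helper. The token-level cross-entropy `loss` is in nats; dividing by
# --- log(2) converts it to bits per token, and dividing by the split's average characters per
# --- token gives bits per character (bpc).
def _loss_to_bpc(loss_nats_per_token, avg_chars_per_token):
    import math
    return loss_nats_per_token / math.log(2) / avg_chars_per_token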
Exemplo n.º 24
class Trainer:
    def __init__(self, exp_name, model_name, dls, hp, bs, sched=False):
        self.device = torch.device("cuda" if IS_CUDA else "cpu")
        self.model = all_models[hp["model"]](bs).to(self.device)
        self.loss = all_loss[hp["loss"]]
        self.epochs = hp["epochs"]
        self.writer = SummaryWriter(
            os.path.join(LOG_DIR, exp_name, model_name))
        self.exp_name = exp_name
        self.hp = hp
        self.metrics = {}
        for p in ["train", "val", "test"]:
            self.metrics[p] = [[], [], []]
        self.dls = dls
        self.steps = [0] * 3
        self.batch_size = dls[0].batch_size
        self.model_name = model_name
        opt = all_opt[hp["opt"]]
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())

        if hp["opt"] == "ADAM":
            self.opt = opt(params=parameters, lr=hp["lr"])
        else:
            self.opt = opt(
                params=parameters,
                lr=hp["lr"],
                momentum=0.9,
                weight_decay=hp["wd"],
            )
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.opt, step_size=2, gamma=0.1 if sched else 1
        )
        self.isTransformer = dls[0].dataset.tokenizer is not None

    def anEpoch(self, phaseIndex, toLog=True):
        phaseName = PHASES[phaseIndex]
        losses = []
        acc_count = 0
        allPreds, allLabels = [], []
        # use tqdm to provide visual feedback during this phase
        for xb, yb in tqdm(self.dls[phaseIndex], total=len(self.dls[phaseIndex])):
            if self.isTransformer:
                inputIds, mask = xb
                yb = yb.to(self.device)
                outputs = self.model(
                    inputIds.to(self.device),
                    attention_mask=mask.to(self.device),
                    labels=yb,
                )
                loss = outputs[0]
                output = outputs[1]
                inputIds.detach().cpu()
                mask.detach().cpu()
                yb.detach().cpu()
            else:
                # BATCH_SIZE, 3, 224, 224
                xb = (xb[0].to(self.device), xb[1].cpu())
                yb = yb.to(self.device)  # BATCH_SIZE, 1
                output = self.model(xb)  # BATCH_SIZE, 3
                loss = self.loss(output, yb)
                xb[0].detach().cpu()
                yb.detach().cpu()
            allPreds.append(torch.argmax(output, dim=1).cpu())
            allLabels.append(yb.cpu())

            acc_count += accuracy(output, yb)
            losses.append(loss)
            self.steps[phaseIndex] += 1
            if toLog:
                self._log("{}_loss".format(phaseName),
                          loss, self.steps[phaseIndex])

            if phaseIndex == 0:
                self.opt.zero_grad()
                loss.backward()  # backpropagate to compute gradients
                self.opt.step()  # update model parameters
        allPreds = torch.cat(allPreds)
        allLabels = torch.cat(allLabels)
        f1Score = skMetrics.f1_score(
            allLabels.cpu(), allPreds.cpu(), average="macro")
        losses = torch.stack(losses)
        epoch_loss = losses.mean().item()
        epoch_acc = acc_count / len(self.dls[phaseIndex]) / self.batch_size
        self.metrics[phaseName][0].append(epoch_loss)
        self.metrics[phaseName][1].append(epoch_acc)
        self.metrics[phaseName][2].append(f1Score)
        print(
            "\nepoch {} info: loss:{}, acc:{}, f1Score:{}".format(
                phaseName, epoch_loss, epoch_acc, f1Score
            )
        )
        return allPreds, allLabels

    def topKLoss(self, phaseIndex, k):
        lossValues = []
        # use tqdm to provide visual feedback during this phase
        with torch.no_grad():
            for xb, yb in tqdm(self.dls[phaseIndex], total=len(self.dls[phaseIndex])):
                if self.isTransformer:
                    inputIds, mask = xb
                    yb = yb.to(self.device)
                    outputs = self.model(
                        inputIds.to(self.device),
                        attention_mask=mask.to(self.device),
                        labels=yb,
                    )
                    output = outputs[1]
                    inputIds.detach().cpu()
                    mask.detach().cpu()
                    yb.detach().cpu()
                else:
                    xb = xb.to(self.device)  # BATCH_SIZE, 3, 224, 224
                    yb = yb.to(self.device)  # BATCH_SIZE, 1
                    output = self.model(xb)  # BATCH_SIZE, 3
                    xb.detach().cpu()
                    yb.detach().cpu()
                lossValues.append(cross_entropy(
                    output, yb, reduction='none').cpu())
        lossValues = torch.cat(lossValues)
        return torch.topk(lossValues, k=k)

    def one_cycle(self):
        # self.freeze()
        for i in range(self.epochs):
            print("epoch number: {}".format(i))
            self.model.train()
            self.anEpoch(0)
            with torch.no_grad():
                self.model.eval()
                self.anEpoch(1)
            self.scheduler.step()
            self._save_weights()
        self.load_weights(self.model_name + ".pkl")
        if len(self.dls) > 2 and len(self.dls[2]) > 0:
            with torch.no_grad():
                self.model.eval()
                self.anEpoch(2)
        metrics = {}
        for i in range(3):
            metrics.update(self.getMetrics(i))
        self._write_hp(metrics)  # for comparing between experiments

    def freeze(self, toTrain=False):
        if self.isTransformer:
            for param in self.model.base_model.parameters():
                param.requires_grad = toTrain
            return
        for p in self.model.embedding.parameters():
            p.requires_grad = toTrain
        for p in self.model.lstm.parameters():
            p.requires_grad = toTrain

    def getMetrics(self, phaseIndex):
        phases = ["train", "val", "test"]
        phase = phases[phaseIndex]
        phaseMetrics = self.metrics[phase]
        metricValues = [min(phaseMetrics[0]), max(
            phaseMetrics[1]), max(phaseMetrics[2])]
        metrics = {}
        for i, metricName in enumerate(["loss", "acc", "f1score"]):
            metricName = f"{phase}_{metricName}"
            metrics[metricName] = metricValues[i]
        return metrics

    def _log(self, phase, value, i):
        self.writer.add_scalar(tag=phase, scalar_value=value, global_step=i)

    def _write_hp(self, metrics):
        self.writer.add_hparams(self.hp, metrics)

    def setLR(self, lr):
        self.opt.param_groups[0]['lr'] = lr

    def load_weights(self, pkl_name, num_classes=None, family=None):
        weights_path = os.path.join(WEIGHTS_DIR, self.exp_name, pkl_name)
        sd = torch.load(weights_path)
        self.model.load_state_dict(sd, strict=False)
        self.model.to(self.device)

    def _save_weights(self):
        # self.metrics["val"] holds [losses, accuracies, f1-scores]; save only when the
        # latest validation f1-score is the best seen so far
        bestF1Score = max(self.metrics["val"][-1])
        if self.metrics["val"][-1][-1] == bestF1Score:
            weights_path = os.path.join(
                WEIGHTS_DIR, self.exp_name, self.model_name + ".pkl"
            )
            os.makedirs(os.path.join(
                WEIGHTS_DIR, self.exp_name), exist_ok=True)
            self.model.cpu()
            state = self.model.state_dict()
            torch.save(state, weights_path)  # open(pkl), compress
            self.model.to(self.device)

    def getPreds(self, phaseIdx, toSave=False):
        with torch.no_grad():
            preds, _ = self.anEpoch(phaseIdx, toLog=False)
        if not toSave:
            return preds
        dfCopy = self.dls[phaseIdx].dataset.getDF()
        if len(preds) < len(dfCopy):
            extra = len(dfCopy) - len(preds)
            preds = torch.cat([preds, torch.tensor([-1] * extra)])
        predCategories = list(map(lambda l: CATEGORY_SUBSET[l], preds.numpy()))
        dfCopy[PRED_COL] = predCategories
        dfCopy["correct"] = dfCopy[PRED_COL] == dfCopy[Y_COL]
        csvPath = os.path.join(
            PREDS_DIR, f"{self.model_name}_{PHASES[phaseIdx]}_preds.csv")
        dfCopy.to_csv(csvPath, index=False)
        return preds
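
# --- Usage sketch (not part of the original source): a hypothetical way this Trainer might be
# --- driven. `dls` (train/val/test DataLoaders), `all_models`, `all_loss`, `all_opt`, LOG_DIR,
# --- WEIGHTS_DIR, etc. are assumed to be defined elsewhere in the project.
def _example_trainer_run(dls):
    hp = {"model": "lstm", "loss": "ce", "opt": "ADAM", "lr": 1e-3, "wd": 0.0, "epochs": 3}
    trainer = Trainer("exp-001", "lstm_baseline", dls, hp, bs=dls[0].batch_size, sched=True)
    trainer.one_cycle()  # train/validate/test and log hparams plus best metrics to TensorBoard
    return trainer.getPreds(2, toSave=False)  # predictions on the test split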
Exemplo n.º 25
class D3RLPyLogger:

    _experiment_name: str
    _logdir: str
    _save_metrics: bool
    _verbose: bool
    _metrics_buffer: Dict[str, List[float]]
    _params: Optional[Dict[str, float]]
    _writer: Optional[SummaryWriter]

    def __init__(
        self,
        experiment_name: str,
        save_metrics: bool = True,
        root_dir: str = "logs",
        verbose: bool = True,
        tensorboard: bool = True,
        with_timestamp: bool = True,
    ):
        self._save_metrics = save_metrics
        self._verbose = verbose

        # add timestamp to prevent unintentional overwrites
        while True:
            if with_timestamp:
                date = datetime.now().strftime("%Y%m%d%H%M%S")
                self._experiment_name = experiment_name + "_" + date
            else:
                self._experiment_name = experiment_name

            if self._save_metrics:
                self._logdir = os.path.join(root_dir, self._experiment_name)
                if not os.path.exists(self._logdir):
                    os.makedirs(self._logdir)
                    break
                if with_timestamp:
                    time.sleep(1.0)
                else:
                    raise ValueError("%s already exists." % self._logdir)
            else:
                break

        self._metrics_buffer = {}

        if tensorboard:
            tfboard_path = os.path.join("runs", self._experiment_name)
            self._writer = SummaryWriter(logdir=tfboard_path)
        else:
            self._writer = None

        self._params = None

    def add_params(self, params: Dict[str, Any]) -> None:
        assert self._params is None, "add_params can be called only once."

        if self._save_metrics:
            # save dictionary as json file
            with open(os.path.join(self._logdir, "params.json"), "w") as f:
                json_str = json.dumps(params,
                                      default=default_json_encoder,
                                      indent=2)
                f.write(json_str)

        if self._verbose:
            for key, val in params.items():
                print("{}={}".format(key, val))

        # remove non-scalar values for HParams
        self._params = {k: v for k, v in params.items() if np.isscalar(v)}

    def add_metric(self, name: str, value: float) -> None:
        if name not in self._metrics_buffer:
            self._metrics_buffer[name] = []
        self._metrics_buffer[name].append(value)

    def commit(self, epoch: int, step: int) -> None:
        metrics = {}
        for name, buffer in self._metrics_buffer.items():
            metric = sum(buffer) / len(buffer)

            if self._save_metrics:
                with open(os.path.join(self._logdir, name + ".csv"), "a") as f:
                    print("%d,%d,%f" % (epoch, step, metric), file=f)

            if self._verbose:
                print("epoch=%d step=%d %s=%f" % (epoch, step, name, metric))

            if self._writer:
                self._writer.add_scalar("metrics/" + name, metric, epoch)

            metrics[name] = metric

        if self._params and self._writer:
            self._writer.add_hparams(
                self._params,
                metrics,
                name=self._experiment_name,
                global_step=epoch,
            )

        # initialize metrics buffer
        self._metrics_buffer = {}

    def save_model(self, epoch: int, algo: _SaveProtocol) -> None:
        if self._save_metrics:
            # save entire model
            model_path = os.path.join(self._logdir, "model_%d.pt" % epoch)
            algo.save_model(model_path)

    @contextmanager
    def measure_time(self, name: str) -> Iterator[None]:
        name = "time_" + name
        start = time.time()
        try:
            yield
        finally:
            self.add_metric(name, time.time() - start)

    @property
    def logdir(self) -> str:
        return self._logdir

    @property
    def experiment_name(self) -> str:
        return self._experiment_name
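
# --- Usage sketch (not part of d3rlpy itself): a minimal, hypothetical driving loop for this
# --- logger; the metric values below are made up purely for illustration.
def _example_d3rlpy_logging():
    logger = D3RLPyLogger("cartpole_dqn", root_dir="logs", tensorboard=True)
    logger.add_params({"learning_rate": 3e-4, "batch_size": 256})
    for epoch in range(1, 3):
        with logger.measure_time("step"):
            logger.add_metric("loss", 0.5 / epoch)
        # averages the buffered metrics, then writes CSV rows, TensorBoard scalars and hparams
        logger.commit(epoch, step=epoch * 1000)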
Exemplo n.º 26
class Writer:
    _STDOUT = sys.stdout
    _STDERR = sys.stderr

    def __init__(self, logdir, make_subdir, tag_group):
        if make_subdir:
            os.makedirs(logdir, exist_ok=True)

            timestamp = f"{datetime.datetime.now().strftime('%b%d_%H-%M-%S')}"
            logdir = os.path.join(logdir, timestamp)

        self._writer = SummaryWriter(logdir=logdir)

        assert logdir == self._writer.logdir
        self._logdir = logdir

        self._tag_group = tag_group

        sys.stdout = Tee(primary_file=self._STDOUT,
                         secondary_file=open(os.path.join(logdir, "stdout"),
                                             "a"))

        sys.stderr = Tee(primary_file=self._STDERR,
                         secondary_file=open(os.path.join(logdir, "stderr"),
                                             "a"))

    def write_scalar(self, tag, scalar_value, global_step=None):
        self._writer.add_scalar(self._tag(tag),
                                scalar_value,
                                global_step=global_step)

    def write_image(self, tag, img_tensor, global_step=None):
        self._writer.add_image(self._tag(tag),
                               img_tensor,
                               global_step=global_step)

    def write_figure(self, tag, figure, global_step=None):
        self._writer.add_figure(self._tag(tag),
                                figure,
                                global_step=global_step)

    def write_hparams(self, hparam_dict=None, metric_dict=None):
        self._writer.add_hparams(hparam_dict=hparam_dict,
                                 metric_dict=metric_dict)

    def write_json(self, tag, data):
        text = json.dumps(data, indent=4)

        self._writer.add_text(
            self._tag(tag),
            4 * " " +
            text.replace("\n", "\n" +
                         4 * " ")  # Indent by 4 to ensure codeblock formatting
        )

        json_path = os.path.join(self._logdir, f"{tag}.json")

        with open(json_path, "w") as f:
            f.write(text)

    def write_textfile(self, tag, text):
        path = os.path.join(self._logdir, f"{tag}.txt")
        with open(path, "w") as f:
            f.write(text)

    def write_checkpoint(self, tag, data):
        os.makedirs(self._checkpoints_dir, exist_ok=True)
        checkpoint_path = self._checkpoint_path(tag)

        tmp_checkpoint_path = os.path.join(
            os.path.dirname(checkpoint_path),
            f"{os.path.basename(checkpoint_path)}.tmp")

        torch.save(data, tmp_checkpoint_path)
        # replace is atomic, so we guarantee our checkpoints are always good
        os.replace(tmp_checkpoint_path, checkpoint_path)

    def load_checkpoint(self, tag, device):
        return torch.load(self._checkpoint_path(tag), map_location=device)

    def _checkpoint_path(self, tag):
        return os.path.join(self._checkpoints_dir, f"{tag}.pt")

    @property
    def _checkpoints_dir(self):
        return os.path.join(self._logdir, "checkpoints")

    def _tag(self, tag):
        return f"{self._tag_group}/{tag}"
Exemplo n.º 27
class Logger():
    def __init__(self, logdir, logname):
        self.logdir = logdir

        assert (os.path.isdir(logdir))
        self.dir = os.path.join(logdir, logname)
        os.mkdir(self.dir)

        self.tensorboard_dir = os.path.join(self.dir, "tensorboard")
        os.mkdir(self.tensorboard_dir)
        self.tensorboard_writer = SummaryWriter(self.tensorboard_dir)

        self.params = dict()
        self.plots = dict()
        self.plots_columns = dict()

    def update_params(self, params):
        self.params.update(params)

    def add_plot(self, name, columns):
        assert name not in self.plots
        self.plots[name] = list()
        self.plots_columns[name] = columns

    def add_plot_point(self, name, point):
        self.plots[name].append(point)

    def get_plot(self, name):
        return self.plots[name]

    def save_logs(self):
        self.save_csv()
        # not used yet
        # self.save_tensorboard()

    def save_model(self, model, name):
        models_path = os.path.join(self.dir, "models")
        os.makedirs(models_path, exist_ok=True)
        torch.save(model, os.path.join(models_path, name))

    def save_csv(self):
        plot_path = os.path.join(self.dir, "plots")
        os.makedirs(plot_path, exist_ok=True)
        for plot_name, plot_data in self.plots.items():
            filename = os.path.join(plot_path, plot_name + ".csv")
            pd.DataFrame(plot_data,
                         columns=self.plots_columns[plot_name]).to_csv(
                             filename, index=False)

        params_path = os.path.join(self.dir, "params.csv")
        pd.DataFrame(self.params.items(),
                     columns=("name", "value")).to_csv(params_path,
                                                       index=False)

    def save_tensorboard(self):
        self.tensorboard_writer.add_hparams(self.params, {})
        for plot_name, plot_data in self.plots.items():
            for i in range(len(plot_data)):
                # skip barriers
                #  if plot_data[i] in Barrier.values():
                #  continue

                # TODO fix ugly ifs
                if isinstance(plot_data[i], tuple):
                    self.tensorboard_writer.add_scalar(plot_name,
                                                       plot_data[i][2], i)
                else:
                    self.tensorboard_writer.add_scalar(plot_name, plot_data[i],
                                                       i)
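
# --- Usage sketch (not part of the original source): a minimal, hypothetical example of driving
# --- this Logger; `logdir` must already exist and the run name must be new.
def _example_logger_usage():
    logger = Logger("runs", "exp-001")
    logger.update_params({"lr": 1e-3, "batch_size": 64})
    logger.add_plot("reward", columns=("episode", "reward"))
    for episode in range(3):
        logger.add_plot_point("reward", (episode, float(episode)))
    logger.save_logs()  # writes plots/*.csv and params.csv under runs/exp-001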
Exemplo n.º 28
def main():
    global net
    global test_loader
    global scatter
    parser = argparse.ArgumentParser()
    # generic params
    parser.add_argument(
        "--name",
        default=datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
        help="Name to store the log file as",
    )
    parser.add_argument("--resume", help="Path to log file to resume from")

    parser.add_argument("--encoder", default="FSEncoder", help="Encoder")
    parser.add_argument("--decoder", default="DSPN", help="Decoder")
    parser.add_argument("--epochs",
                        type=int,
                        default=10,
                        help="Number of epochs to train with")
    parser.add_argument("--latent",
                        type=int,
                        default=32,
                        help="Dimensionality of latent space")
    parser.add_argument("--dim",
                        type=int,
                        default=64,
                        help="Dimensionality of hidden layers")
    parser.add_argument("--lr",
                        type=float,
                        default=1e-2,
                        help="Outer learning rate of model")
    parser.add_argument("--batch-size",
                        type=int,
                        default=12,
                        help="Batch size to train with")
    parser.add_argument("--num-workers",
                        type=int,
                        default=0,
                        help="Number of threads for data loader")
    parser.add_argument(
        "--dataset",
        choices=[
            "mnist", "clevr-box", "clevr-state", "cats", "merged", "wflw"
        ],
        help="Which dataset to use",
    )
    parser.add_argument(
        "--no-cuda",
        action="store_true",
        help="Run on CPU instead of GPU (not recommended)",
    )
    parser.add_argument("--train-only",
                        action="store_true",
                        help="Only run training, no evaluation")
    parser.add_argument("--eval-only",
                        action="store_true",
                        help="Only run evaluation, no training")
    parser.add_argument("--multi-gpu",
                        action="store_true",
                        help="Use multiple GPUs")
    parser.add_argument("--show",
                        action="store_true",
                        help="Plot generated samples in Tensorboard")
    parser.add_argument(
        "--show-skip",
        type=int,
        default=1,
        help="Number of epochs to skip before exporting to Tensorboard")

    parser.add_argument(
        "--infer-name",
        action="store_true",
        help="Automatically name run based on dataset/run number")

    parser.add_argument("--supervised", action="store_true", help="")
    parser.add_argument("--baseline",
                        action="store_true",
                        help="Use baseline model")

    parser.add_argument("--export-dir",
                        type=str,
                        help="Directory to output samples to")
    parser.add_argument("--export-n",
                        type=int,
                        default=10**9,
                        help="How many samples to output")
    parser.add_argument(
        "--export-progress",
        action="store_true",
        help="Output intermediate set predictions for DSPN?",
    )
    parser.add_argument(
        "--full-eval",
        action="store_true",
        help="Use full evaluation set (default: 1/10 of evaluation data)",
        # don't need full evaluation when training to save some time
    )
    parser.add_argument(
        "--mask-feature",
        action="store_true",
        help="Treat mask as a feature to compute loss with",
    )
    parser.add_argument(
        "--inner-lr",
        type=float,
        default=800,
        help="Learning rate of DSPN inner optimisation",
    )
    parser.add_argument(
        "--iters",
        type=int,
        default=10,
        help="How many DSPN inner optimisation iteration to take",
    )
    parser.add_argument(
        "--huber-repr",
        type=float,
        default=1,
        help="Scaling of repr loss term for DSPN supervised learning",
    )
    parser.add_argument(
        "--loss",
        choices=["hungarian", "chamfer", "emd"],
        default="emd",
        help="Type of loss used",
    )
    parser.add_argument(
        "--export-csv",
        action="store_true",
        help="Only perform predictions, don't evaluate in any way")
    parser.add_argument("--eval-split", help="Overwrite split on test set")

    args = parser.parse_args()

    if args.infer_name:
        if args.baseline:
            prefix = "base"
        else:
            prefix = "dspn"

        used_nums = []

        if not os.path.exists("runs"):
            os.makedirs("runs")

        runs = os.listdir("runs")
        for run in runs:
            if args.dataset in run:
                used_nums.append(int(run.split("-")[-1]))

        num = 1
        while num in used_nums:
            num += 1
        name = f"{prefix}-{args.dataset}-{num}"
    else:
        name = args.name

    print(f"Saving run to runs/{name}")
    train_writer = SummaryWriter(f"runs/{name}", purge_step=0)

    net = model.build_net(args)

    if not args.no_cuda:
        net = net.cuda()

    if args.multi_gpu:
        net = torch.nn.DataParallel(net)

    optimizer = torch.optim.Adam(
        [p for p in net.parameters() if p.requires_grad], lr=args.lr)

    print("Building dataloader")
    if args.dataset == "mnist":
        dataset_train = data.MNISTSet(train=True, full=args.full_eval)
        dataset_test = data.MNISTSet(train=False, full=args.full_eval)
    elif args.dataset in ["clevr-box", "clevr-state"]:
        dataset_train = data.CLEVR("clevr",
                                   "train",
                                   box=args.dataset == "clevr-box",
                                   full=args.full_eval)

        dataset_test = data.CLEVR("clevr",
                                  "val",
                                  box=args.dataset == "clevr-box",
                                  full=args.full_eval)
    elif args.dataset == "cats":
        dataset_train = data.Cats("cats", "train", 9, full=args.full_eval)
        dataset_test = data.Cats("cats", "val", 9, full=args.full_eval)
    elif args.dataset == "faces":
        dataset_train = data.Faces("faces", "train", 4, full=args.full_eval)
        dataset_test = data.Faces("faces", "val", 4, full=args.full_eval)
    elif args.dataset == "wflw":
        if args.eval_split:
            eval_split = f"test_{args.eval_split}"
        else:
            eval_split = "test"

        dataset_train = data.WFLW("wflw", "train", 7, full=args.full_eval)
        dataset_test = data.WFLW("wflw", eval_split, 7, full=args.full_eval)
    elif args.dataset == "merged":
        # merged cats and human faces
        dataset_train_cats = data.Cats("cats", "train", 9, full=args.full_eval)
        dataset_train_wflw = data.WFLW("wflw", "train", 9, full=args.full_eval)

        dataset_test_cats = data.Cats("cats", "val", 9, full=args.full_eval)
        dataset_test_wflw = data.WFLW("wflw", "test", 9, full=args.full_eval)

        dataset_train = data.MergedDataset(dataset_train_cats,
                                           dataset_train_wflw)

        dataset_test = data.MergedDataset(dataset_test_cats, dataset_test_wflw)

    if not args.eval_only:
        train_loader = data.get_loader(dataset_train,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers)

    if not args.train_only:
        test_loader = data.get_loader(dataset_test,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      shuffle=False)

    tracker = track.Tracker(
        train_mae=track.ExpMean(),
        train_last=track.ExpMean(),
        train_loss=track.ExpMean(),
        test_mae=track.Mean(),
        test_last=track.Mean(),
        test_loss=track.Mean(),
    )

    if args.resume:
        log = torch.load(args.resume)
        weights = log["weights"]
        n = net
        if args.multi_gpu:
            n = n.module
        n.load_state_dict(weights, strict=True)

    if args.export_csv:
        names = []
        predictions = []
        export_targets = []

    def run(net, loader, optimizer, train=False, epoch=0, pool=None):
        writer = train_writer
        if train:
            net.train()
            prefix = "train"
            torch.set_grad_enabled(True)
        else:
            net.eval()
            prefix = "test"
            torch.set_grad_enabled(False)

        if args.export_dir:
            true_export = []
            pred_export = []

        iters_per_epoch = len(loader)
        loader = tqdm(
            loader,
            ncols=0,
            desc="{1} E{0:02d}".format(epoch, "train" if train else "test "),
        )

        for i, sample in enumerate(loader, start=epoch * iters_per_epoch):
            # input is either a set or an image
            input, target_set, target_mask = map(lambda x: x.cuda(), sample)

            # forward evaluation through the network
            (progress, masks, evals,
             gradn), (y_enc, y_label) = net(input, target_set, target_mask)

            progress_only = progress

            # if using mask as feature, concat mask feature into progress
            if args.mask_feature:
                target_set = torch.cat(
                    [target_set, target_mask.unsqueeze(dim=1)], dim=1)
                progress = [
                    torch.cat([p, m.unsqueeze(dim=1)], dim=1)
                    for p, m in zip(progress, masks)
                ]

            if args.loss == "chamfer":
                # dim 0 is over the inner iteration steps
                # target set is broadcasted over dim 0
                set_loss = utils.chamfer_loss(torch.stack(progress),
                                              target_set.unsqueeze(0))
            elif args.loss == "hungarian":
                set_loss = utils.hungarian_loss(progress[-1],
                                                target_set,
                                                thread_pool=pool).unsqueeze(0)
            elif args.loss == "emd":
                set_loss = utils.emd(progress[-1], target_set).unsqueeze(0)

            # Only use representation loss with DSPN and when doing general
            # supervised prediction, not when auto-encoding
            if args.supervised and not args.baseline:
                repr_loss = args.huber_repr * F.smooth_l1_loss(y_enc, y_label)
                loss = set_loss.mean() + repr_loss.mean()
            else:
                loss = set_loss.mean()

            # restore progress variable to not contain masks for correct
            # exporting
            progress = progress_only

            # Outer optim step
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Tensorboard tracking of metrics for debugging
            tracked_last = tracker.update(f"{prefix}_last",
                                          set_loss[-1].item())
            tracked_loss = tracker.update(f"{prefix}_loss", loss.item())
            if train:
                writer.add_scalar("metric/set-loss",
                                  loss.item(),
                                  global_step=i)

                writer.add_scalar("metric/set-last",
                                  set_loss[-1].mean().item(),
                                  global_step=i)

                if not args.baseline:
                    writer.add_scalar("metric/eval-first",
                                      evals[0].mean().item(),
                                      global_step=i)

                    writer.add_scalar("metric/eval-last",
                                      evals[-1].mean().item(),
                                      global_step=i)

                    writer.add_scalar("metric/max-inner-grad-norm",
                                      max(g.item() for g in gradn),
                                      global_step=i)

                    writer.add_scalar("metric/mean-inner-grad-norm",
                                      sum(g.item()
                                          for g in gradn) / len(gradn),
                                      global_step=i)

                    if args.supervised:
                        writer.add_scalar("metric/repr_loss",
                                          repr_loss.item(),
                                          global_step=i)

            # Print current progress to progress bar
            fmt = "{:.6f}".format
            loader.set_postfix(last=fmt(tracked_last),
                               loss=fmt(tracked_loss),
                               bad=fmt(evals[-1].detach().cpu().item() *
                                       1000) if not args.baseline else 0)

            if args.export_dir:
                # export last inner optim of each input as csv
                # (one input per row)
                if args.export_csv:
                    # the second to last element are the last of the
                    # inner optim
                    for batch_i, p in enumerate(progress[-2]):
                        img_id = i * args.batch_size + batch_i

                        names.append(loader.iterable.dataset.get_fname(img_id))

                        m = masks[-2][batch_i]
                        m = m.cpu().detach().numpy().astype(bool)

                        p = p.cpu().detach().numpy()
                        p = p[:, m]

                        sample_preds = [
                            p[k % 2, k // 2] for k in range(p.shape[1] * 2)
                        ]
                        # remove values according to the mask and pad with
                        # zeros at the end instead
                        sample_preds += [0] * (len(m) * 2 - len(sample_preds))
                        predictions.append(sample_preds)

                        true_mask = target_set[batch_i, 2, :].cpu().detach()
                        true_mask = true_mask.numpy().astype(bool)
                        trues = target_set[batch_i, :2, :]
                        trues = trues.cpu().detach().numpy()

                        t = trues[:, true_mask]

                        t = [t[k % 2, k // 2] for k in range(t.shape[1] * 2)]

                        t += [0] * (len(true_mask) * 2 - len(t))

                        export_targets.append(t)

                # Store predictions to be exported
                else:
                    if len(true_export) < args.export_n:
                        for p, m in zip(target_set, target_mask):
                            true_export.append(p.detach().cpu())
                        progress_steps = []
                        for pro, ms in zip(progress, masks):
                            # pro and ms are one step of the inner optim
                            # score boxes contains the list of predicted
                            # elements for one step
                            score_boxes = []
                            for p, m in zip(pro.cpu().detach(),
                                            ms.cpu().detach()):
                                score_box = torch.cat([m.unsqueeze(0), p],
                                                      dim=0)
                                score_boxes.append(score_box)
                            progress_steps.append(score_boxes)
                        for b in zip(*progress_steps):
                            pred_export.append(b)

            # Plot predictions in Tensorboard
            if args.show and epoch % args.show_skip == 0 and not train:
                name = f"set/epoch-{epoch}/img-{i}"
                # thresholded set
                progress.append(progress[-1])
                masks.append((masks[-1] > 0.5).float())
                # target set
                if args.mask_feature:
                    # target set is augmented with masks, so remove them
                    progress.append(target_set[:, :-1])
                else:
                    progress.append(target_set)
                masks.append(target_mask)
                # intermediate sets

                for j, (s, ms) in enumerate(zip(progress, masks)):
                    if args.dataset == "clevr-state":
                        continue

                    if args.dataset.startswith("clevr"):
                        threshold = 0.5
                    else:
                        threshold = None

                    s, ms = utils.scatter_masked(
                        s,
                        ms,
                        binned=args.dataset.startswith("clevr"),
                        threshold=threshold)

                    if j != len(progress) - 1:
                        tag_name = f"{name}"
                    else:
                        tag_name = f"{name}-target"

                    if args.dataset == "clevr-box":
                        img = input[0].detach().cpu()

                        writer.add_image_with_boxes(tag_name,
                                                    img,
                                                    s.transpose(0, 1),
                                                    global_step=j)
                    elif args.dataset == "cats" \
                            or args.dataset == "wflw" \
                            or args.dataset == "merged":

                        img = input[0].detach().cpu()

                        fig = plt.figure()
                        plt.scatter(s[0, :] * 128, s[1, :] * 128)

                        plt.imshow(np.transpose(img, (1, 2, 0)))

                        writer.add_figure(tag_name, fig, global_step=j)
                    else:  # mnist
                        fig = plt.figure()
                        y, x = s
                        y = 1 - y
                        ms = ms.numpy()
                        rgba_colors = np.zeros((ms.size, 4))
                        rgba_colors[:, 2] = 1.0
                        rgba_colors[:, 3] = ms
                        plt.scatter(x, y, color=rgba_colors)
                        plt.axes().set_aspect("equal")
                        plt.xlim(0, 1)
                        plt.ylim(0, 1)
                        writer.add_figure(tag_name, fig, global_step=j)

        # Export predictions
        if args.export_dir and not args.export_csv:
            os.makedirs(f"{args.export_dir}/groundtruths", exist_ok=True)
            os.makedirs(f"{args.export_dir}/detections", exist_ok=True)
            for i, (gt, dets) in enumerate(zip(true_export, pred_export)):
                export_groundtruths_path = os.path.join(
                    args.export_dir, "groundtruths", f"{i}.txt")

                with open(export_groundtruths_path, "w") as fd:
                    for box in gt.transpose(0, 1):
                        if (box == 0).all():
                            continue
                        s = "box " + " ".join(map(str, box.tolist()))
                        fd.write(s + "\n")

                if args.export_progress:
                    for step, det in enumerate(dets):
                        export_progress_path = os.path.join(
                            args.export_dir, "detections",
                            f"{i}-step{step}.txt")

                        with open(export_progress_path, "w") as fd:
                            for sbox in det.transpose(0, 1):
                                s = f"box " + " ".join(map(str, sbox.tolist()))
                                fd.write(s + "\n")

                export_path = os.path.join(args.export_dir, "detections",
                                           f"{i}.txt")
                with open(export_path, "w") as fd:
                    for sbox in dets[-1].transpose(0, 1):
                        s = f"box " + " ".join(map(str, sbox.tolist()))
                        fd.write(s + "\n")

    import subprocess

    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    # git_hash = "483igtrfiuey46"

    torch.backends.cudnn.benchmark = True

    metrics = {}

    start = time.time()

    if args.eval_only:
        tracker.new_epoch()
        with mp.Pool(10) as pool:
            run(net, test_loader, optimizer, train=False, epoch=0, pool=pool)

        metrics["test_loss"] = np.mean(tracker.data["test_loss"][-1])
        metrics["test_set_loss"] = np.mean(tracker.data["test_last"][-1])
    else:
        best_test_loss = float("inf")

        for epoch in range(args.epochs):
            tracker.new_epoch()
            with mp.Pool(10) as pool:
                run(net,
                    train_loader,
                    optimizer,
                    train=True,
                    epoch=epoch,
                    pool=pool)
                if not args.train_only:
                    run(net,
                        test_loader,
                        optimizer,
                        train=False,
                        epoch=epoch,
                        pool=pool)

            epoch_test_loss = np.mean(tracker.data["test_loss"][-1])

            if epoch_test_loss < best_test_loss:
                print("new best loss")
                best_test_loss = epoch_test_loss
                # only save if the epoch has lower loss
                metrics["test_loss"] = epoch_test_loss
                metrics["train_loss"] = np.mean(tracker.data["train_loss"][-1])

                metrics["train_set_loss"] = np.mean(
                    tracker.data["train_last"][-1])
                metrics["test_set_loss"] = np.mean(
                    tracker.data["test_last"][-1])

                metrics["best_epoch"] = epoch

                results = {
                    "name":
                    name + "-best",
                    "tracker":
                    tracker.data,
                    "weights":
                    net.state_dict()
                    if not args.multi_gpu else net.module.state_dict(),
                    "args":
                    vars(args),
                    "hash":
                    git_hash,
                }

                torch.save(results, os.path.join("logs", name + "-best"))

        results = {
            "name":
            name + "-final",
            "tracker":
            tracker.data,
            "weights":
            net.state_dict()
            if not args.multi_gpu else net.module.state_dict(),
            "args":
            vars(args),
            "hash":
            git_hash,
        }
        torch.save(results, os.path.join("logs", name + "-final"))

    if args.export_csv and args.export_dir:
        path = os.path.join(args.export_dir, f'{args.name}-predictions.csv')
        pd.DataFrame(np.array(predictions), index=names).to_csv(path,
                                                                 sep=',',
                                                                 index=True,
                                                                 header=False)

        path = os.path.join(args.export_dir, f'{args.name}-targets.csv')
        pd.DataFrame(np.array(export_targets),
                     index=names).to_csv(path,
                                         sep=',',
                                         index=True,
                                         header=False)

    took = time.time() - start
    print(f"Process took {took:.1f}s, avg {took/args.epochs:.1f} s/epoch.")

    # save hyper parameters to tensorboard for reference
    hparams = {k: v for k, v in vars(args).items() if v is not None}

    print(metrics)
    # keep the loss metrics collected during training and add timing information,
    # so they all end up in the hparams entry below
    metrics.update({"total_time": took, "avg_time_per_epoch": took / args.epochs})

    print("writing hparams")
    train_writer.add_hparams(hparams, metric_dict=metrics, name="hparams")
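
    # --- Sketch (assumption, not from the original code): TensorBoard hparam values generally
    # --- need to be simple scalar or string types, so a sanitizer like this hypothetical helper
    # --- could be applied to `hparams` before add_hparams if the argparse namespace ever
    # --- contains lists or other objects.
    def _sanitize_hparams(d):
        return {k: (v if isinstance(v, (int, float, str, bool)) else str(v))
                for k, v in d.items()}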
Exemplo n.º 29
def run_exp(first_n, lr, weight_decay, cross_ent_weight, batch_size,
            np_th_seed, debug, n_epochs, n_mixes, output_dir,
            scale_2_cross_ent, mask_for_cross_ent, nll_weight,
            linear_classifier, flow_gmm, flow_coupling):
    hparams = {k: v for k, v in locals().items() if v is not None}
    noise_factor = 1 / 256.0
    if debug:
        first_n = 512
        batch_size = 10
        n_epochs = 5
    set_random_seeds(np_th_seed, True)

    writer = SummaryWriter(output_dir)
    writer.add_hparams(hparams, metric_dict={}, name=output_dir)
    writer.flush()
    model = create_glow_model(hidden_channels=512,
                              K=32,
                              L=3,
                              flow_permutation='invconv',
                              flow_coupling=flow_coupling,
                              LU_decomposed=True,
                              n_chans=3,
                              block_type='conv',
                              use_act_norm=True)
    if flow_coupling == 'additive':
        state_dict = th.load(
            '/home/schirrmr/data/exps/invertible/additive/7/state_dicts_model_250.pth'
        )
    else:
        assert flow_coupling == 'affine'
        state_dict = th.load(
            '/home/schirrmr/data/exps/invertible/finetune//12/state_dicts_model_76.pth'
        )
    for key in state_dict.keys():
        if 'loc' in key or 'log_scale' in key:
            state_dict[key].squeeze_()

    model.load_state_dict(state_dict)
    del state_dict
    pre_dist_model = convert_glow_to_pre_dist_model(model, as_list=True)
    del model
    if flow_gmm:
        dist0 = NClassIndependentDist(10,
                                      n_dims=3072 // 2,
                                      optimize_mean=False,
                                      optimize_std=False)
        dist1 = NClassIndependentDist(10,
                                      n_dims=3072 // 4,
                                      optimize_mean=False,
                                      optimize_std=False)
        dist2 = NClassIndependentDist(10,
                                      n_dims=3072 // 4,
                                      optimize_mean=False,
                                      optimize_std=False)
        dist0.class_means.normal_(mean=0, std=1)
        dist1.class_means.normal_(mean=0, std=1)
        dist2.class_means.normal_(mean=0, std=1)
    else:
        init_dist_std = 1e-1
        dist0 = PerDimWeightedMix(10,
                                  n_mixes=n_mixes,
                                  n_dims=3072 // 2,
                                  optimize_mean=True,
                                  optimize_std=True,
                                  init_std=init_dist_std)
        dist1 = PerDimWeightedMix(10,
                                  n_mixes=n_mixes,
                                  n_dims=3072 // 4,
                                  optimize_mean=True,
                                  optimize_std=True,
                                  init_std=init_dist_std)
        dist2 = PerDimWeightedMix(10,
                                  n_mixes=n_mixes,
                                  n_dims=3072 // 4,
                                  optimize_mean=True,
                                  optimize_std=True,
                                  init_std=init_dist_std)
    model = Node(pre_dist_model, ApplyToList(dist0, dist1, dist2))
    net = model.cuda()
    init_all_modules(net, None)

    if mask_for_cross_ent:
        alphas_mask = th.zeros(768, requires_grad=True, device='cuda')
    if linear_classifier:
        clf = th.nn.Linear(768, 10).cuda()

    train_loader, valid_loader = load_train_test(
        'cifar10',
        shuffle_train=True,
        drop_last_train=True,
        batch_size=batch_size,
        eval_batch_size=256,
        n_workers=8,
        first_n=first_n,
        augment=True,
        exclude_cifar_from_tiny=False,
    )

    optim = th.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    if mask_for_cross_ent:
        optim.add_param_group(
            dict(params=[alphas_mask], lr=5e-2, weight_decay=0))
    if linear_classifier:
        optim.add_param_group(
            dict(params=clf.parameters(), lr=lr, weight_decay=weight_decay))

    def get_lp_for_cross_ent(z, lp, net, scale_2_cross_ent,
                             mask_for_cross_ent):
        dists = list(net.module.module_list.children())
        if linear_classifier:
            lp_for_cross_ent = clf(z[2])
        else:
            if scale_2_cross_ent:
                lp_for_cross_ent = dists[2](z[2],
                                            fixed=dict(sum_dims=False))[1]
                if mask_for_cross_ent:
                    mask = th.sigmoid(alphas_mask)
                    lp_for_cross_ent = lp_for_cross_ent * mask.unsqueeze(
                        0).unsqueeze(0)
                lp_for_cross_ent = lp_for_cross_ent.sum(dim=-1)
            else:
                lp_for_cross_ent = lp
        return lp_for_cross_ent

    for i_epoch in range(n_epochs + 1):
        if i_epoch > 0:
            for X, y in train_loader:
                y = y.cuda()
                noise = th.rand_like(X) * 1 / 256.0
                noised = X + noise
                z, lp = net(noised.cuda(), fixed=dict(y=None))
                lp_for_cross_ent = get_lp_for_cross_ent(
                    z, lp, net, scale_2_cross_ent, mask_for_cross_ent)

                cross_ent = th.nn.functional.cross_entropy(
                    lp_for_cross_ent,
                    y.argmax(dim=1),
                )
                nll = -th.mean(th.sum(lp * y, dim=1))
                loss = cross_ent_weight * cross_ent + nll_weight * nll
                optim.zero_grad()
                loss.backward()
                optim.step()
                optim.zero_grad()
                del y, noise, noised, lp, cross_ent, nll, loss

        print(i_epoch)
        results = {}
        with th.no_grad():
            for name, loader in (('Train', train_loader), ('Valid',
                                                           valid_loader)):
                all_lps = []
                all_corrects = []

                for X, y in loader:
                    y = y.cuda()
                    # First with noise to get nll for bpd,
                    # then without noise for accuracy
                    noise = th.rand_like(X) * 1 / 256.0
                    noised = X + noise
                    noise_log_prob = np.log(256) * np.prod(X.shape[1:])
                    z, lp = net(noised.cuda())
                    lps = to_numpy(th.sum(lp * y, dim=1) - noise_log_prob)
                    all_lps.extend(lps)
                    z, lp = net(X.cuda() + (1 / (2 * 256.0)))
                    lp_for_cross_ent = get_lp_for_cross_ent(
                        z, lp, net, scale_2_cross_ent, mask_for_cross_ent)
                    corrects = to_numpy(
                        y.argmax(dim=1) == lp_for_cross_ent.argmax(dim=1))
                    all_corrects.extend(corrects)
                acc = np.mean(all_corrects)
                nll = -(np.mean(all_lps) / (np.prod(X.shape[1:]) * np.log(2)))
                print(f"{name} NLL: {nll:.2f}")
                print(f"{name} Acc: {acc:.1%}")
                results[f"{name.lower()}_nll"] = nll
                results[f"{name.lower()}_acc"] = acc
                writer.add_scalar(f"{name.lower()}_nll", nll, i_epoch)
                writer.add_scalar(f"{name.lower()}_acc", acc * 100, i_epoch)
                del noise, noised, z, lp, lps
        writer.flush()
        sys.stdout.flush()
        if not debug:
            dict_path = os.path.join(output_dir, "model_dict.th")
            th.save(net.state_dict(), dict_path)
            if mask_for_cross_ent:
                mask_path = os.path.join(output_dir, "alphas_mask.th")
                th.save(alphas_mask, mask_path)
            model_path = os.path.join(output_dir, "model.th")
            th.save(net, model_path)

    return results
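
# --- Sketch (not part of the original code; assumes `numpy as np` as above): the
# --- bits-per-dimension conversion used in the evaluation loop, factored out. `lp_nats` is a
# --- per-example log-probability in nats; the log(256) term compensates for the uniform
# --- dequantization noise added to 8-bit images.
def _log_prob_to_bpd(lp_nats, image_shape):
    n_dims = int(np.prod(image_shape))  # e.g. 3 * 32 * 32 for CIFAR-10
    return -(lp_nats - np.log(256) * n_dims) / (n_dims * np.log(2))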
Exemplo n.º 30
def main():
    """ Run the experiment. """

    # TensorboardX
    tbx_writer = SummaryWriter(comment="pendulum_naive_AIRL")

    tbx_writer.add_hparams(vars(args), {})

    # env related
    env = gym.make("Pendulum-v0")

    feature_extractor = IdentityFeatureExtractor()

    state_size = feature_extractor.extract_features(env.reset()).shape[0]

    # rl related
    replay_buffer = ReplayBuffer(args.replay_buffer_length)

    rl = SoftActorCritic(
        env,
        replay_buffer,
        feature_extractor,
        args.replay_buffer_sample_size,
        entropy_target=args.entropy_target,
        entropy_tuning=args.disable_entropy_tuning,
        tau=args.tau,
        log_alpha=args.log_alpha,
        play_interval=args.play_interval,
        tbx_writer=tbx_writer,
        learning_rate=1e-3,
    )

    # irl related

    expert_policy = PolicyNetwork(state_size, env.action_space,
                                  NN_HIDDEN_WIDTH)
    expert_policy.load("../pendulum_policies/pendulum_expert.pt")
    expert = PolicyExpert(expert_policy, env, args.num_expert_trajs,
                          args.max_env_steps)

    expert_states = expert.get_expert_states()
    expert_actions = expert.get_expert_actions()

    irl = NaiveAIRL(
        rl,
        env,
        expert_states,
        expert_actions,
        tbx_writer=tbx_writer,
        learning_rate=1e-3,
    )

    irl.train(
        args.irl_episodes,
        args.irl_traj_per_ep,
        args.max_env_steps,
        args.irl_num_policy_updates,
    )

    # drop into an interactive debugger after training, for manual inspection
    import pdb

    pdb.set_trace()