Example #1
    def run_training_batch(self, batch, batch_idx, dataloader_idx):
        # track grad norms
        grad_norm_dict = {}

        # bookkeeping
        self._hiddens = None

        optimizers = list(enumerate(self.trainer.optimizers))

        # track all outputs across time and num of optimizers
        batch_outputs = [[] for _ in range(len(optimizers))]

        if batch is None:
            self.warning_cache.warn(
                "train_dataloader yielded None. If this was on purpose, ignore this warning..."
            )
            return AttributeDict(
                signal=0,
                grad_norm_dict={},
                training_step_output_for_epoch_end=batch_outputs,
            )

        # hook
        response = self.trainer.call_hook("on_batch_start")
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dict={})

        # hook
        response = self.trainer.call_hook("on_train_batch_start", batch,
                                          batch_idx, dataloader_idx)
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dict={})

        # lightning module hook
        splits = self._tbptt_split_batch(batch)

        for split_idx, split_batch in enumerate(splits):
            self.split_idx = split_idx

            if self.trainer.lightning_module.automatic_optimization:
                for opt_idx, optimizer in self.get_active_optimizers(
                        batch_idx):
                    result = self._run_optimization(batch_idx, split_idx,
                                                    split_batch, opt_idx,
                                                    optimizer)
                    if result:
                        batch_outputs[opt_idx].append(
                            result.training_step_output_for_epoch_end)
                        grad_norm_dict = result.get("grad_norm_dict", {})
            else:
                # in manual optimization, there is no looping over optimizers
                result = self._run_optimization(batch_idx, split_idx,
                                                split_batch)
                if result:
                    batch_outputs[0].append(
                        result.training_step_output_for_epoch_end)

        output = AttributeDict(
            signal=0,
            # TODO: properly aggregate grad_norm across opt_idx and split_idx
            grad_norm_dict=grad_norm_dict,
            training_step_output_for_epoch_end=batch_outputs,
        )
        return output
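A minimal caller-side sketch (not part of the loop above, and assuming a PyTorch Lightning 1.x install where AttributeDict can be imported from pytorch_lightning.utilities): the AttributeDict returned by run_training_batch exposes its keys both as attributes and as normal dict entries, so a hypothetical consumer could read the signal and gradient-norm fields directly.

from pytorch_lightning.utilities import AttributeDict

result = AttributeDict(
    signal=0,
    grad_norm_dict={},
    training_step_output_for_epoch_end=[[]],
)
if result.signal == -1:            # attribute access, equivalent to result["signal"]
    print("batch loop requested an early exit")
else:
    print(result.grad_norm_dict)   # dict-style access works too: result["grad_norm_dict"]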
Example #2
    def run_training_batch(self, batch, batch_idx, dataloader_idx):
        # track grad norms
        grad_norm_dic = {}

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        # bookkeeping
        using_results_obj = False
        self.trainer.hiddens = None

        # track all outputs across time and num of optimizers
        batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_batch_start")
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # checks if backward or backward + optimizer step (via closure)
        accumulation_done = self._accumulated_batches_reached()
        is_final_batch = self._num_training_batches_reached()
        should_accumulate = not (accumulation_done or is_final_batch)

        # lightning module hook
        splits = self.tbptt_split_batch(batch)

        for split_idx, split_batch in enumerate(splits):
            self.trainer.split_idx = split_idx

            # in manual optimization we loop over all optimizers at once
            optimizers = self.get_optimizers_iterable()
            if not self.automatic_optimization:
                optimizers = [optimizers[0]]

            # loop over optimizers
            for opt_idx, optimizer in optimizers:
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if self.automatic_optimization and len(self.trainer.optimizers) > 1:
                    model = self.trainer.get_model()
                    model.toggle_optimizer(optimizer, opt_idx)

                if should_accumulate:
                    # For gradient accumulation

                    # -------------------
                    # calculate loss (train step + train step end)
                    # -------------------

                    # perform ddp sync only when performing optimizer_step
                    with self.block_ddp_sync_behaviour():
                        self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens)

                    batch_outputs = self._process_closure_result(
                        batch_callback_metrics=batch_callback_metrics,
                        batch_log_metrics=batch_log_metrics,
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients

                else:
                    if self.automatic_optimization:

                        def train_step_and_backward_closure():
                            result = self.training_step_and_backward(
                                split_batch,
                                batch_idx,
                                opt_idx,
                                optimizer,
                                self.trainer.hiddens
                            )
                            return None if result is None else result.loss

                        # optimizer step
                        self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                    else:
                        self._curr_step_result = self.training_step(split_batch, batch_idx, opt_idx, self.trainer.hiddens)

                    if self._curr_step_result is None:
                        # user decided to skip optimization
                        continue

                    batch_outputs = self._process_closure_result(
                        batch_callback_metrics=batch_callback_metrics,
                        batch_log_metrics=batch_log_metrics,
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                    grad_norm_dic = self._cur_grad_norm_dict
                    self._cur_grad_norm_dict = None

                    # hook
                    self.on_before_zero_grad(optimizer)

                    # clear gradients
                    self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)

                    accumulated_loss = self.accumulated_loss.mean()

                    if accumulated_loss is not None:
                        # calculate running loss for display
                        self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)

                    # reset for next set of accumulated grads
                    self.accumulated_loss.reset()

        # collapse all metrics into one dict
        batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

        # track all metrics for callbacks
        self.trainer.logger_connector.callback_metrics.update(batch_log_metrics)
        self.trainer.logger_connector.callback_metrics.update(
            {k: v for d in batch_callback_metrics for k, v in d.items() if v is not None}
        )

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=batch_outputs,
        )
        return result
Example #3
def main():
    exp_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    exp_parser.add_argument('--checkpoint-path', '-c', type=str, help='which checkpoint to load')
    exp_parser.add_argument('--batch-size', '-bs', type=int, help='batch size')
    exp_parser.add_argument('--resize', '-rs', nargs=2, type=int, help='resize')
    exp_parser.add_argument('--out-dir', '-od', type=str, help='output directory for nifti counterfactuals')
    exp_parser.add_argument('--csv', '-csv', type=str, help='csv path')
    exp_parser.add_argument('-v', '--verbosity', action="count", default=0,
                            help="increase output verbosity (e.g., -vv is more than -v)")

    exp_args, other_args = exp_parser.parse_known_args()
    if exp_args.verbosity == 1:
        level = logging.getLevelName('INFO')
    elif exp_args.verbosity >= 2:
        level = logging.getLevelName('DEBUG')
    else:
        level = logging.getLevelName('WARNING')
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=level)

    checkpoint_path = exp_args.checkpoint_path
    ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    hparams = ckpt['hyper_parameters']
    logger.info(f'found hparams: {hparams}')
    exp_class = EXPERIMENT_REGISTRY[hparams['experiment']]

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser = Trainer.add_argparse_args(parser)
    parser.set_defaults(checkpoint_callback=True)
    parser._action_groups[1].title = 'lightning_options'
    args = parser.parse_args(other_args)

    if args.gpus is not None and isinstance(args.gpus, int):
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpus)
        args.gpus = 1

    exp = hparams['model']
    model_class = MODEL_REGISTRY[exp]
    model_params = {
        k: v for k, v in hparams.items() if (k in inspect.signature(model_class.__init__).parameters
                                             or k in inspect.signature(model_class.__bases__[0].__init__).parameters
                                             or k in inspect.signature(model_class.__bases__[0].__bases__[0].__init__).parameters)
    }
    model_params['img_shape'] = hparams['resize'] if 'resize' in hparams else exp_args.resize
    new_state_dict = OrderedDict()
    for key, value in ckpt['state_dict'].items():
        new_key = key.replace('pyro_model.', '')
        new_state_dict[new_key] = value
    loaded_model = model_class(**model_params)
    loaded_model.load_state_dict(new_state_dict)
    device = torch.device('cuda' if (torch.cuda.is_available() and args.gpus is not None) else 'cpu')
    for p in loaded_model._buffers.keys():
        if any([(b in p) for b in _buffers_to_load]):
            setattr(loaded_model, p, getattr(loaded_model, p).to(device))
    loaded_model.eval()
    loaded_model = loaded_model.to(device)

    groups = {}
    for group in parser._action_groups:
        group_dict = {a.dest: getattr(args, a.dest, None) for a in group._group_actions}
        groups[group.title] = argparse.Namespace(**group_dict)
    lightning_args = groups['lightning_options']
    trainer = Trainer.from_argparse_args(lightning_args)
    trainer.logger.experiment.log_dir = exp_args.checkpoint_path
    hparams = AttributeDict(hparams)
    hparams.test_dir = exp_args.out_dir
    hparams.test_csv = exp_args.csv
    hparams.test_batch_size = exp_args.batch_size
    experiment = exp_class.load_from_checkpoint(checkpoint_path, hparams=hparams, pyro_model=loaded_model)
    logger.info(f'Loaded {experiment.__class__}:\n{experiment}')
    trainer.test(experiment)
Example #4
    def __init__(self, trainer):
        self.trainer = trainer
        self.dist = AttributeDict(rank=0, device=None)
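For context, a small sketch of what AttributeDict itself provides (an assumption here: PyTorch Lightning 1.x, where it is importable from pytorch_lightning.utilities): it is a plain dict whose keys can also be read and written as attributes, which is why fields such as self.dist.rank can simply be reassigned later.

from pytorch_lightning.utilities import AttributeDict

dist = AttributeDict(rank=0, device=None)
dist.rank = 3                     # attribute-style write updates the underlying dict
dist["device"] = "cuda:0"         # ordinary dict-style access still works
assert dist.rank == dist["rank"] == 3
assert isinstance(dist, dict)     # it really is just a dict underneath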
Example #5
                 tracking_uri=mlflow_url)
])

# Make trainer
trainer = pl.Trainer.from_argparse_args(arguments, logger=logger)

# Make data model factory
if arguments.frames is not None:
    frames = arguments.frames.split(",")
    frames = [int(x) for x in frames]
    frames = range(*frames)
else:
    frames = None
data_model_factory = KittiDataModuleFactory(frames, arguments.sequences,
                                            arguments.dataset)

# Load parameters
params = load_hparams_from_yaml(arguments.config)
params = AttributeDict(params)
print("Load model from params \n" + str(params))
data_model = data_model_factory.make_data_module_from_params(params)
model = MultiUnsupervisedDepthModelFactory().make_model(
    params, data_model.get_cameras_calibration())

if arguments.load_model:
    print("Load checkpoint")
    load_undeepvo_checkpoint(model, arguments.model_checkpoint)

print("Start training")
trainer.fit(model, data_model)
Example #6
    def load(
        path: Union[Path, str],
        old_args=AttributeDict(),
        Cls: Type[RideModule] = None,
        auto_scale_lr=False,
    ) -> AttributeDict:
        """Loads hparams from path

        Args:
            path (Union[Path, str]): Path to a JSON file containing hparams
            old_args (Optional[AttributeDict]): The AttributeDict to be updated with the new hparams
            Cls (Optional[Type[RideModule]]): A class whose hyperparameters are used to select the relevant hparams to keep

        Returns:
            AttributeDict: AttributeDict with updated hyperparameters
        """
        path = Path(path)
        hparams = load_structured_data(path)

        if Cls:
            hparam_names = Cls.configs().names
            hparams = {k: v for k, v in hparams.items() if k in hparam_names}

        # During hparamsearch, only a single GPU is used, but accumulate_grad_batches is set to the total number of gpus given
        # If we have multiple GPUs, we need to reduce accumulate_grad_batches accordingly
        num_gpus = parse_num_gpus(old_args.gpus)
        if num_gpus > 0 and "accumulate_grad_batches" in hparams:  # pragma: no cover
            hparams["accumulate_grad_batches"] = max(
                1, int(hparams["accumulate_grad_batches"]) // num_gpus
            )

        old_args = dict(old_args)
        user_passed_arg_keys = [a[2:] for a in sys.argv if a.startswith("--")]
        user_passed_args = {
            k: v for k, v in old_args.items() if k in user_passed_arg_keys
        }

        # If batch size was changed by user, automatically apply the linear scaling rule to the learning rate
        if (
            auto_scale_lr
            and "batch_size" in hparams
            and "learning_rate" in hparams
            and "batch_size" in user_passed_args
            and "learning_rate" not in user_passed_args
        ):
            old_accumulate_grad_batches = (
                hparams["accumulate_grad_batches"]
                if "accumulate_grad_batches" in hparams
                else 1
            )
            new_accumulate_grad_batches = (
                user_passed_args["accumulate_grad_batches"]
                if "accumulate_grad_batches" in user_passed_args
                else old_accumulate_grad_batches
            )
            new_tot_batch = new_accumulate_grad_batches * user_passed_args["batch_size"]
            old_tot_batch = old_accumulate_grad_batches * hparams["batch_size"]
            if new_tot_batch != old_tot_batch:
                scaling = new_tot_batch / old_tot_batch
                user_passed_args["learning_rate"] = hparams["learning_rate"] * scaling
                logger.info(
                    f"🔧 A `batch_size*accumulate_grad_batches` ({new_tot_batch}) differs from hparams file ({old_tot_batch}). "
                    f"Scaling learning_rate from {hparams['learning_rate']} to {user_passed_args['learning_rate']} (= {hparams['learning_rate']} × {new_tot_batch} / {old_tot_batch})"
                )

        return AttributeDict(**{**old_args, **hparams, **user_passed_args})
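To make the final merge explicit, here is a short sketch with made-up values (only the AttributeDict import is assumed): in AttributeDict(**{**old_args, **hparams, **user_passed_args}) the right-most dictionary wins, so explicitly passed command-line arguments override the loaded hparams, which in turn override the pre-existing args.

from pytorch_lightning.utilities import AttributeDict

old_args = {"learning_rate": 0.1, "batch_size": 32, "gpus": 1}
hparams = {"learning_rate": 0.01, "batch_size": 64}
user_passed_args = {"batch_size": 16}

merged = AttributeDict(**{**old_args, **hparams, **user_passed_args})
assert merged.learning_rate == 0.01   # taken from the hparams file
assert merged.batch_size == 16        # overridden by the user-passed value
assert merged.gpus == 1               # falls back to the original args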
Example #7
    def hparams(self) -> AttributeDict:
        if not hasattr(self, "_hparams"):
            self._hparams = AttributeDict()
        return self._hparams
Example #8
    def run_training_batch(self, batch, batch_idx, dataloader_idx):
        # track grad norms
        grad_norm_dic = {}

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        # bookkeeping
        using_results_obj = False
        self.trainer.hiddens = None

        # track all outputs across time and num of optimizers
        batch_outputs = [[]
                         for _ in range(len(self.get_optimizers_iterable()))]

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook('on_batch_start')
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook('on_train_batch_start', batch,
                                          batch_idx, dataloader_idx)
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # lightning module hook
        splits = self.tbptt_split_batch(batch)

        for split_idx, split_batch in enumerate(splits):
            self.trainer.split_idx = split_idx

            # loop over optimizers
            for opt_idx, optimizer in self.get_optimizers_iterable():
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if len(self.trainer.optimizers) > 1:
                    for param in self.trainer.get_model().parameters():
                        param.requires_grad = False
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.requires_grad = True

                # -------------------
                # calculate loss (train step + train step end)
                # -------------------
                opt_closure_result = self.training_step_and_backward(
                    split_batch, batch_idx, opt_idx, optimizer,
                    self.trainer.hiddens)

                using_results_obj = isinstance(
                    opt_closure_result.training_step_output, Result)

                # log metrics
                self.log_training_step_metrics(opt_closure_result,
                                               batch_callback_metrics,
                                               batch_log_metrics)

                # track hiddens
                self.trainer.hiddens = self.process_hiddens(opt_closure_result)

                # check if loss or model weights are nan
                if self.trainer.terminate_on_nan:
                    self.trainer.detect_nan_tensors(opt_closure_result.loss)

                # track total loss for logging (avoid mem leaks)
                self.accumulated_loss.append(opt_closure_result.loss)

                # track all the outputs across all steps
                batch_opt_idx = opt_idx if len(batch_outputs) > 1 else 0
                batch_outputs[batch_opt_idx].append(
                    opt_closure_result.training_step_output_for_epoch_end)

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients
                accumulation_done = (
                    self.trainer.batch_idx +
                    1) % self.trainer.accumulate_grad_batches == 0
                is_final_batch = (self.trainer.batch_idx +
                                  1) == self.trainer.num_training_batches
                if accumulation_done or is_final_batch:
                    # hook
                    grad_norm_dic = self.on_before_backward(
                        batch_idx, optimizer)

                    # wrap forward + backward pass in closure for 2nd order optimizers
                    train_step_and_backward_closure = lambda: self.training_step_and_backward(
                        split_batch,
                        batch_idx,
                        opt_idx,
                        optimizer,
                        self.trainer.hiddens,
                    ).loss

                    # optimizer step
                    self.optimizer_step(optimizer, opt_idx, batch_idx,
                                        train_step_and_backward_closure)

                    # hook
                    self.on_before_zero_grad(optimizer)

                    # clear gradients
                    self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)

                    # calculate running loss for display
                    self.running_loss.append(
                        self.accumulated_loss.mean() *
                        self.trainer.accumulate_grad_batches)

                    # reset for next set of accumulated grads
                    self.accumulated_loss.reset()

        # collapse all metrics into one dict
        batch_log_metrics = {
            k: v
            for d in batch_log_metrics for k, v in d.items()
        }

        # track all metrics for callbacks
        # TODO: is this needed?
        self.trainer.logger_connector.callback_metrics.update({
            k: v
            for d in batch_callback_metrics for k, v in d.items()
            if v is not None
        })

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=batch_outputs)
        return result
Example #9
    def run_training_batch(self, batch, batch_idx):
        """

        :param batch: dict; contains three keys: input_ids, attention_mask, decoder_input_ids
            Example for 'batch':
                batch: {'input_ids': tensor([[  0,  36, 230,  ...,   8,  41,   2]]),
                'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]),
                'decoder_input_ids': tensor([[    0,   287,    10,  2107,   111, 10468,   226, 47385, 11579,  1012,
                                                2156,     5,  5302, 47385,   281, 47385, 10003,   255, 47385,   347,
                                                111,  2107, 47385,   574, 47385,  1000, 47385,   398, 47385,   245,
                                                16,    10,   205,  1374, 12576,   479,   646,  1000,  1215,  3388,
                                                510,   742,    85,   128,   579,    65,     9,     5,   357,  3092,
                                                23,    63,  1836,    11,     5,  3555,   111,   672,  2156, 26180,
                                                47385,   642,   111,  3547,  4120,   479,   646,  1000,  1215,  3388,
                                                510,   742,  7192,  8806, 10262,  3444,  7951,  2170,  1318,     2]])}
        :param batch_idx: index of the current batch
        :return:
        """
        # load tokenizer
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        # load config for GSM
        config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml")
        # load dict
        dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))
        # remove [SEP]
        sep_list = [
            '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]', '[SEP_5]',
            '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>'
        ]
        # vocab size for topic modeling
        vocab_size = len(dictionary)
        # model
        config['hidden']['features'][0] = vocab_size

        # trainer batch
        config['trainer_batch']['test_sample'] = 1
        config = extend_config_reference(config)
        gsm_trainer = config['GSMtrainer']
        gsm_trainer[
            'base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune"
        gsm_trainer = GSMTrainer.from_config(gsm_trainer)

        # number of topics
        K = config['gsmtopic']['k']

        # yaml_dump(gsm_trainer,
        #           os.path.join(f"{self.default_root_dir}/log/bart-large-cnn-finetune", "gsm_trainer.yaml"))

        # -----------------------------------------
        # Topic Modeling - GSM
        # -----------------------------------------
        batch_size = batch['input_ids'].size()[0]

        docs = []
        for batch_num in range(batch_size):
            # extract the batch_sentence
            batch_sentence = tokenizer.decode(
                batch['input_ids'][batch_num].tolist(),
                skip_special_tokens=True)
            # change to lowercase and split to list
            batch_sentence_list = batch_sentence.split(" ")
            # remove [SEP]
            batch_sentence_list_nosep = [
                item for item in batch_sentence_list if item not in sep_list
            ]
            text = ' '.join([x for x in batch_sentence_list_nosep])
            fine_text = text.replace(' ##', '').lower()
            batch_sentence = re.sub(r'[^\w\s]', '', fine_text)
            # batch_sentence: change to the cleaned news for topic modeling
            # change to training data format in topic modeling
            gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" "))
            docs.append(gsm_data_bow)

        # gsm_data: data for topic modeling
        gsm_data = DataLoader(DocDataset(docs, len(dictionary), device='cuda'),
                              batch_size=config['dataset']['batch_size'],
                              drop_last=False,
                              num_workers=0)

        gsm_trainer.__dict__['train_iterator'] = gsm_data

        gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True)

        del gsm_data

        # track grad norms
        grad_norm_dic = {}

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # Batch start events
        with self.profiler.profile('on_batch_start'):
            # callbacks
            self.on_batch_start()
            # hooks
            if self.is_function_implemented('on_batch_start'):
                response = self.get_model().on_batch_start(batch)
                if response == -1:
                    return AttributeDict(signal=-1,
                                         grad_norm_dic=grad_norm_dic)

        splits = [batch]
        if self.truncated_bptt_steps is not None:
            model_ref = self.get_model()
            with self.profiler.profile('tbptt_split_batch'):
                splits = model_ref.tbptt_split_batch(batch,
                                                     self.truncated_bptt_steps)

        self.hiddens = None
        for split_idx, split_batch in enumerate(splits):
            self.split_idx = split_idx

            for opt_idx, optimizer in self._get_optimizers_iterable():
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if len(self.optimizers) > 1:
                    for param in self.get_model().parameters():
                        param.requires_grad = False
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.requires_grad = True

                # -------------------
                # calculate loss
                # -------------------
                beta = 0.01
                opt_closure_result = self.optimizer_closure(
                    split_batch,
                    batch_idx,
                    opt_idx,
                    optimizer,
                    self.hiddens,
                    gsm_p,  # topic distribution
                    gsm_loss,  # loss for topic modeling
                    K,  # number of topics
                    beta,
                )

                # ------------------------------
                # POST forward bookkeeping
                # ------------------------------
                batch_callback_metrics.append(
                    opt_closure_result.training_step_output.callback_metrics)
                batch_log_metrics.append(
                    opt_closure_result.training_step_output.log_metrics)

                self.add_progress_bar_metrics(
                    opt_closure_result.training_step_output.pbar_on_batch_end)

                # track hiddens
                self.hiddens = opt_closure_result.hiddens

                # check if loss or model weights are nan
                if self.terminate_on_nan:
                    self.detect_nan_tensors(opt_closure_result.loss)

                # track total loss for logging (avoid mem leaks)
                self.batch_loss_value.append(opt_closure_result.loss)

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients
                if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                    # backward
                    grad_norm_dic = self.run_batch_backward_pass(
                        split_batch, batch_idx, opt_idx, optimizer)

                    # calculate running loss for display
                    self.running_loss.append(self.batch_loss_value.mean())

                    # reset for next set of accumulated grads
                    self.batch_loss_value.reset()

        # Batch end events
        with self.profiler.profile('on_batch_end'):
            # callbacks
            self.on_batch_end()
            # model hooks
            if self.is_function_implemented('on_batch_end'):
                self.get_model().on_batch_end()

        # collapse all metrics into one dict
        batch_log_metrics = {
            k: v
            for d in batch_log_metrics for k, v in d.items()
        }

        # track all metrics for callbacks
        self.callback_metrics.update(
            {k: v
             for d in batch_callback_metrics for k, v in d.items()})

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=opt_closure_result.
            training_step_output_for_epoch_end)
        return result
Example #10
    def optimizer_closure(self,
                          split_batch,
                          batch_idx,
                          opt_idx,
                          optimizer,
                          hiddens,
                          gsm_p,
                          gsm_loss,
                          K,
                          beta=0.01):
        """
        wrap the forward step in a closure so second order methods work
        """
        # ---------------------------
        # FORWARD
        # ---------------------------
        with self.profiler.profile('model_forward'):
            if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
                with torch.cuda.amp.autocast():
                    training_step_output = self.training_forward(
                        split_batch, batch_idx, opt_idx, hiddens, gsm_p,
                        gsm_loss, K)
            else:
                training_step_output = self.training_forward(
                    split_batch, batch_idx, opt_idx, hiddens, gsm_p, gsm_loss,
                    K)

            # ----------------------------
            # PROCESS THE RESULT
            # ----------------------------
            # format and reduce outputs accordingly
            training_step_output_for_epoch_end = training_step_output
            training_step_output = self.process_output(training_step_output,
                                                       train=True)

            training_step_output = AttributeDict(
                batch_loss=training_step_output[0],
                pbar_on_batch_end=training_step_output[1],
                log_metrics=training_step_output[2],
                callback_metrics=training_step_output[3],
                hiddens=training_step_output[4],
            )

            # if the user decides to finally reduce things in epoch_end, save raw output without graphs
            training_step_output_for_epoch_end = recursive_detach(
                training_step_output_for_epoch_end)

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        ## todo: check self.accumulate_grad_batches
        closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches

        # ----------------------------
        # Calculate total loss
        # ----------------------------
        # closure_loss = (1 - beta) * closure_loss + beta * gsm_loss

        # the loss will get scaled for amp. avoid any modifications to it
        untouched_loss = closure_loss.detach().clone()

        # backward pass
        model_ref = self.get_model()
        with self.profiler.profile('model_backward'):
            # scale loss for 16 bit
            if self.precision == 16 and not self.on_tpu:
                closure_loss = model_ref.amp_scale_loss(
                    closure_loss, optimizer, opt_idx)

                # enter amp context
                if not NATIVE_AMP_AVALAIBLE:
                    context = closure_loss
                    closure_loss = closure_loss.__enter__()

            # do backward pass
            model_ref.backward(self, closure_loss, optimizer, opt_idx)

            # exit amp context
            if self.precision == 16 and not NATIVE_AMP_AVALAIBLE and not self.on_tpu:
                a, b, c = None, None, None
                error = context.__exit__(a, b, c)
                if error:
                    rank_zero_warn(a, b, c)
                    raise Exception('apex unscale error')

            # once backward has been applied, release graph
            closure_loss = closure_loss.detach()
            training_step_output.batch_loss = training_step_output.batch_loss.detach(
            )

        if self.use_horovod:
            # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
            optimizer.synchronize()

        # insert after step hook
        if self.is_function_implemented('on_after_backward'):
            model_ref = self.get_model()
            with self.profiler.profile('on_after_backward'):
                model_ref.on_after_backward()

        result = AttributeDict(
            loss=untouched_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=
            training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
        )
        return result
Example #11
def get_config():
    parser = argparse.ArgumentParser(
        add_help=False,
        description='multi-label learning for text classification')

    # load params from config file
    parser.add_argument('-c', '--config', help='Path to configuration file')
    args, _ = parser.parse_known_args()
    config = {}
    if args.config:
        with open(args.config) as fp:
            config = yaml.load(fp, Loader=yaml.SafeLoader)

    # path / directory
    parser.add_argument(
        '--data_dir',
        default='./data/rcv1',
        help='The directory to load data (default: %(default)s)')
    parser.add_argument(
        '--result_dir',
        default='./runs',
        help='The directory to save checkpoints and logs (default: %(default)s)'
    )

    # data
    parser.add_argument('--data_name',
                        default='rcv1',
                        help='Dataset name (default: %(default)s)')
    parser.add_argument(
        '--train_path',
        help='Path to training data (default: [data_dir]/train.txt)')
    parser.add_argument(
        '--val_path',
        help='Path to validation data (default: [data_dir]/valid.txt)')
    parser.add_argument(
        '--test_path', help='Path to test data (default: [data_dir]/test.txt)')
    parser.add_argument(
        '--val_size',
        type=float,
        default=0.2,
        help=
        'Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s).'
    )
    parser.add_argument(
        '--min_vocab_freq',
        type=int,
        default=1,
        help=
        'The minimum frequency needed to include a token in the vocabulary (default: %(default)s)'
    )
    parser.add_argument(
        '--max_seq_length',
        type=int,
        default=500,
        help='The maximum number of tokens of a sample (default: %(default)s)')
    parser.add_argument(
        '--shuffle',
        type=bool,
        default=True,
        help=
        'Whether to shuffle training data before each epoch (default: %(default)s)'
    )

    # train
    parser.add_argument('--seed',
                        type=int,
                        help='Random seed (default: %(default)s)')
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help='Number of epochs to train (default: %(default)s)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Size of training batches (default: %(default)s)')
    parser.add_argument('--optimizer',
                        default='adam',
                        choices=['adam', 'sgd'],
                        help='Optimizer: SGD or Adam (default: %(default)s)')
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.0001,
        help='Learning rate for optimizer (default: %(default)s)')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0,
                        help='Weight decay factor (default: %(default)s)')
    parser.add_argument(
        '--momentum',
        type=float,
        default=0.9,
        help='Momentum factor for SGD only (default: %(default)s)')
    parser.add_argument(
        '--patience',
        type=int,
        default=5,
        help=
        'Number of epochs to wait for improvement before early stopping (default: %(default)s)'
    )

    # model
    parser.add_argument('--model_name',
                        default='KimCNN',
                        help='Model to be used (default: %(default)s)')
    parser.add_argument(
        '--init_weight',
        default='kaiming_uniform',
        help='Weight initialization to be used (default: %(default)s)')
    parser.add_argument(
        '--activation',
        default='relu',
        help='Activation function to be used (default: %(default)s)')
    parser.add_argument(
        '--num_filter_per_size',
        type=int,
        default=128,
        help=
        'Number of filters in convolutional layers in each size (default: %(default)s)'
    )
    parser.add_argument(
        '--filter_sizes',
        type=int,
        nargs='+',
        default=[4],
        help='Size of convolutional filters (default: %(default)s)')
    parser.add_argument(
        '--dropout',
        type=float,
        default=0.2,
        help='Optional specification of dropout (default: %(default)s)')
    parser.add_argument(
        '--dropout2',
        type=float,
        default=0.2,
        help=
        'Optional specification of the second dropout (default: %(default)s)')
    parser.add_argument(
        '--num_pool',
        type=int,
        default=1,
        help='Number of pools for dynamic max-pooling (default: %(default)s)')

    # eval
    parser.add_argument(
        '--eval_batch_size',
        type=int,
        default=256,
        help='Size of evaluating batches (default: %(default)s)')
    parser.add_argument(
        '--metric_threshold',
        type=float,
        default=0.5,
        help='Thresholds to monitor for metrics (default: %(default)s)')
    parser.add_argument(
        '--monitor_metrics',
        nargs='+',
        default=['P@1', 'P@3', 'P@5'],
        help='Metrics to monitor while validating (default: %(default)s)')
    parser.add_argument(
        '--val_metric',
        default='P@1',
        help='The metric to monitor for early stopping (default: %(default)s)')

    # pretrained vocab / embeddings
    parser.add_argument(
        '--vocab_file',
        type=str,
        help='Path to a file holding vocabularies (default: %(default)s)')
    parser.add_argument(
        '--embed_file',
        type=str,
        help=
        'Path to a file holding pre-trained embeddings (default: %(default)s)')
    parser.add_argument(
        '--label_file',
        type=str,
        help='Path to a file holding all labels (default: %(default)s)')

    # log
    parser.add_argument(
        '--save_k_predictions',
        type=int,
        nargs='?',
        const=100,
        default=0,
        help=
        'Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)'
    )
    parser.add_argument(
        '--predict_out_path',
        help=
        'Path to an output file holding top 100 label results (default: %(default)s)'
    )

    # others
    parser.add_argument('--cpu', action='store_true', help='Disable CUDA')
    parser.add_argument('--silent',
                        action='store_true',
                        help='Enable silent mode')
    parser.add_argument(
        '--data_workers',
        type=int,
        default=4,
        help='Number of CPU workers for data pre-processing (default: %(default)s)'
    )
    parser.add_argument(
        '--embed_cache_dir',
        type=str,
        help=
        'For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)'
    )
    parser.add_argument(
        '--eval',
        action='store_true',
        help='Only run evaluation on the test set (default: %(default)s)')
    parser.add_argument(
        '--checkpoint_path',
        help='The checkpoint to warm-up with (default: %(default)s)')
    parser.add_argument('-h', '--help', action='help')

    parser.set_defaults(**config)
    args = parser.parse_args()
    config = AttributeDict(vars(args))
    return config
Example #12
DEFAULT_MODEL_PARAMS = AttributeDict(
    **{
        "name": "PoseNet",
        "feature_extractor": AttributeDict(pretrained=True),
        "criterion": {
            "name": "SE3Criterion",
            "rotation_koef": -3.0,
            "translation_koef": -3.0,
            "use_se3_translation": True,
            "loss_type": "l2",
            "koef_requires_grad": True,
            "lr": 0.0001,
        },
        "feature_dimension": 2048,
        "drop_rate": 0,
        "optimizer": AttributeDict(
            betas="0.9 0.999",
            lr=0.0001,
            weight_decay=0.0005,
        ),
        "scheduler": {
            "step_size": 20,
            "gamma": 0.5,
        },
        "bias": True,
        "activation": "tanh",
        "pretrained": True,
    })
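A short usage note for the constant above (a sketch that assumes the definition and the AttributeDict import from this example): nested AttributeDict values support chained attribute access, while plain-dict values such as "criterion" and "scheduler" still require key lookups.

lr = DEFAULT_MODEL_PARAMS.optimizer.lr              # nested AttributeDict, attribute access
gamma = DEFAULT_MODEL_PARAMS["scheduler"]["gamma"]  # plain nested dict, key access
name = DEFAULT_MODEL_PARAMS.name                    # top-level key read as an attribute
print(name, lr, gamma)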
Example #13
    def run(self, args: AttributeDict):
        """Run hyperparameter search using the `tune.schedulers.ASHAScheduler`

        Args:
            args (AttributeDict): Arguments

        Side-effects:
            Saves logs to `TUNE_LOGS_PATH / args.id`
        """
        try:
            from ray import tune
            from ray.tune.integration.pytorch_lightning import (
                TuneReportCheckpointCallback,
            )
        except ModuleNotFoundError as e:  # pragma: no cover
            logger.error(
                "To use hyperparameter search, first install Ray Tune via `pip install 'ray[tune]'` or `pip install 'ride[extras]'`"
            )
            raise e

        if not hasattr(args, "id"):
            args.id = "hparamsearch"

        module_config = (
            Configs.from_file(args.from_hparam_space_file)
            if args.from_hparam_space_file
            else self.Module.configs()
        ).tune_config()

        config = {
            **dict(args),
            **module_config,
            # pl.Trainer args:
            "gpus": args.gpus_per_trial,
            "logger": False,
            "accumulate_grad_batches": (
                (8 // args.gpus_per_trial) * args.accumulate_grad_batches
                if args.gpus_per_trial
                else args.accumulate_grad_batches
            ),
        }
        scheduler = tune.schedulers.ASHAScheduler(
            metric=f"val/{args.optimization_metric}",
            mode=self.Module.metrics()[args.optimization_metric].value,
            max_t=args.max_epochs,
            grace_period=1,
            reduction_factor=2,
        )

        metric_names = [f"val/{m}" for m in self.Module.metrics().keys()]

        reporter = tune.CLIReporter(
            metric_columns=[*metric_names, "training_iteration"],
        )
        tune_callbacks = [
            TuneReportCheckpointCallback(
                metrics=metric_names,
                filename="checkpoint",
                on="validation_end",
            )
        ]
        cpus_per_trial = max(
            1,
            (
                min(10 * args.gpus_per_trial, NUM_CPU - 10)
                if args.gpus_per_trial
                else min(10, NUM_CPU - 2)
            ),
        )

        analysis = tune.run(
            partial(
                Runner.static_train_and_val,
                self.Module,
                trainer_callbacks=tune_callbacks,
            ),
            name=args.id,
            local_dir=str(TUNE_LOGS_PATH),
            resources_per_trial={"cpu": cpus_per_trial, "gpu": args.gpus_per_trial},
            config=config,
            num_samples=args.trials,
            scheduler=scheduler,
            progress_reporter=reporter,
            raise_on_failed_trial=False,
        )

        best_hparams = analysis.get_best_config(
            metric=f"val/{args.optimization_metric}",
            mode=self.Module.metrics()[args.optimization_metric].value,
            scope="all",
        )
        # Select only model parameters
        if best_hparams:
            best_hparams = {
                k: best_hparams[k]
                for k in [
                    *self.Module.configs().names,
                    # Trainer parameters that influence model hparams:
                    "accumulate_grad_batches",
                    "batch_size",
                    "gpus",
                ]
            }
        return best_hparams
Example #14
    def run_training_batch(self, batch, batch_idx):
        # track grad norms
        grad_norm_dic = {}

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        using_results_obj = False

        # track all outputs across time and num of optimizers
        batch_outputs = [[] for i in range(len(self._get_optimizers_iterable()))]

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # Batch start events
        # TODO: deprecate 1.0
        with self.profiler.profile('on_batch_start'):
            # callbacks
            self.on_batch_start()
            # hooks
            if self.is_function_implemented('on_batch_start'):
                response = self.get_model().on_batch_start(batch)
                if response == -1:
                    return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        with self.profiler.profile('on_train_batch_start'):
            # forward support for multiple loaders
            dataloader_idx = 0
            self.on_train_batch_start(batch, batch_idx, dataloader_idx)
            # hooks
            if self.is_function_implemented('on_train_batch_start'):
                response = self.get_model().on_train_batch_start(batch, batch_idx, dataloader_idx)
                if response == -1:
                    return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        splits = [batch]
        if self.truncated_bptt_steps is not None:
            model_ref = self.get_model()
            with self.profiler.profile('tbptt_split_batch'):
                splits = model_ref.tbptt_split_batch(batch, self.truncated_bptt_steps)

        self.hiddens = None
        for split_idx, split_batch in enumerate(splits):
            self.split_idx = split_idx

            for opt_idx, optimizer in self._get_optimizers_iterable():
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if len(self.optimizers) > 1:
                    for param in self.get_model().parameters():
                        param.requires_grad = False
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.requires_grad = True

                # -------------------
                # calculate loss (train step + train step end)
                # -------------------
                opt_closure_result = self.optimizer_closure(
                    split_batch,
                    batch_idx,
                    opt_idx,
                    optimizer,
                    self.hiddens
                )
                using_results_obj = isinstance(opt_closure_result.training_step_output, Result)

                # ------------------------------
                # POST forward bookkeeping
                # ------------------------------
                batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics)

                # add metrics to loggers
                if using_results_obj:
                    metrics_to_log = opt_closure_result.training_step_output.batch_log_metrics
                    step_pbar_metrics = opt_closure_result.training_step_output.batch_pbar_metrics
                else:
                    metrics_to_log = opt_closure_result.training_step_output.log_metrics
                    step_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end

                # track metrics
                batch_log_metrics.append(metrics_to_log)
                if len(step_pbar_metrics) > 0:
                    self.add_progress_bar_metrics(step_pbar_metrics)

                # track hiddens
                self.hiddens = opt_closure_result.hiddens

                if using_results_obj:
                    opt_closure_result.training_step_output_for_epoch_end.drop_hiddens()

                # check if loss or model weights are nan
                if self.terminate_on_nan:
                    self.detect_nan_tensors(opt_closure_result.loss)

                # track total loss for logging (avoid mem leaks)
                self.batch_loss_value.append(opt_closure_result.loss)

                # track all the outputs across all steps
                batch_outputs[opt_idx].append(opt_closure_result.training_step_output_for_epoch_end)

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients
                if ((self.batch_idx + 1) % self.accumulate_grad_batches == 0
                        or (self.batch_idx + 1) == self.num_training_batches):

                    # backward
                    grad_norm_dic = self.run_batch_backward_pass(split_batch, batch_idx, opt_idx, optimizer)

                    # calculate running loss for display
                    self.running_loss.append(self.batch_loss_value.mean() * self.accumulate_grad_batches)

                    # reset for next set of accumulated grads
                    self.batch_loss_value.reset()

        # Batch end events
        with self.profiler.profile('on_batch_end'):
            # callbacks
            self.on_batch_end()
            # model hooks
            if self.is_function_implemented('on_batch_end'):
                self.get_model().on_batch_end()

        with self.profiler.profile('on_train_batch_end'):
            # forward support for multiple loaders
            dataloader_idx = 0
            self.on_train_batch_end(batch, batch_idx, dataloader_idx)
            # model hooks
            if self.is_function_implemented('on_train_batch_end'):
                self.get_model().on_train_batch_end(batch, batch_idx, dataloader_idx)

        # collapse all metrics into one dict
        batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

        # track all metrics for callbacks
        if not using_results_obj:
            self.callback_metrics.update({k: v for d in batch_callback_metrics for k, v in d.items()})

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=batch_outputs
        )
        return result
Example #15
    def __init__(self, trainer, cluster_environment=None):
        self.trainer = trainer
        self.cluster_environment = cluster_environment
        self.dist = AttributeDict(rank=0, device=None)
Example #16
    def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens):
        """
        wrap the forward step in a closure so second order methods work
        """
        # ---------------------------
        # FORWARD (TRAINING STEP + TRAIN STEP END)
        # ---------------------------
        with self.profiler.profile('model_forward'):
            args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)
            training_step_output = self.accelerator_backend.training_step(args)
            training_step_output = self.call_hook('training_step_end', training_step_output)

            # ----------------------------
            # PROCESS THE RESULT
            # ----------------------------
            # format and reduce outputs accordingly
            training_step_output_for_epoch_end = training_step_output
            is_result_obj = isinstance(training_step_output, Result)

            # track batch size for weighted average
            if is_result_obj:
                training_step_output.track_batch_size(len(split_batch))

            # don't allow EvalResult in the training_step
            if isinstance(training_step_output, EvalResult):
                raise MisconfigurationException('training_step cannot return EvalResult, '
                                                'use a dict or TrainResult instead')

            # handle regular dicts
            if not is_result_obj:
                training_step_output = self.process_output(training_step_output, train=True)

                training_step_output = AttributeDict(
                    batch_loss=training_step_output[0],
                    pbar_on_batch_end=training_step_output[1],
                    log_metrics=training_step_output[2],
                    callback_metrics=training_step_output[3],
                    hiddens=training_step_output[4],
                )

            # if the user decides to finally reduce things in epoch_end, save raw output without graphs
            if isinstance(training_step_output_for_epoch_end, torch.Tensor):
                training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
            elif is_result_obj:
                training_step_output_for_epoch_end = copy(training_step_output)
                training_step_output_for_epoch_end.detach()
            else:
                training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        closure_loss = training_step_output.minimize if is_result_obj else training_step_output.batch_loss
        closure_loss = closure_loss / self.accumulate_grad_batches

        # the loss will get scaled for amp. avoid any modifications to it
        untouched_loss = closure_loss.detach().clone()

        # backward pass
        model_ref = self.get_model()
        with self.profiler.profile('model_backward'):
            # scale loss for 16 bit
            if self.precision == 16 and not self.on_tpu:
                closure_loss = model_ref.amp_scale_loss(closure_loss, optimizer, opt_idx, amp_backend=self.amp_backend)

                # enter amp context
                if self.amp_backend == AMPType.APEX:
                    self.dev_debugger.track_event('AMP', str(AMPType.APEX))
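                    # under apex, amp_scale_loss returns a context manager; keep a handle so it can
                    # be exited after backward, and enter it to obtain the scaled loss tensor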
                    context = closure_loss
                    closure_loss = closure_loss.__enter__()

            # do backward pass
            model_ref.backward(self, closure_loss, optimizer, opt_idx)

            # exit amp context
            if self.precision == 16 and self.amp_backend == AMPType.APEX and not self.on_tpu:
                a, b, c = None, None, None
                error = context.__exit__(a, b, c)
                if error:
                    rank_zero_warn(a, b, c)
                    raise Exception('apex unscale error')

            # once backward has been applied, release graph
            closure_loss = closure_loss.detach()

            if is_result_obj:
                training_step_output.detach()
            else:
                training_step_output.batch_loss = training_step_output.batch_loss.detach()

        if self.use_horovod:
            # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
            optimizer.synchronize()

        # insert after step hook
        if self.is_function_implemented('on_after_backward'):
            model_ref = self.get_model()
            with self.profiler.profile('on_after_backward'):
                model_ref.on_after_backward()

        # when in dev debugging track the losses
        self.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())

        result = AttributeDict(
            loss=untouched_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
        )
        return result
Example #17
def train(hparams):
    data = MedNIST(hparams)

    # embed()  # leftover interactive-debugging hook, kept commented out

    model = YourModel(hparams)

    # NOTE: this excerpt omits the Trainer construction; `trainer` is assumed
    # to be a pl.Trainer configured from `hparams` earlier in the original script.
    trainer.fit(model, data)


if __name__ == "__main__":
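    # hyperparameters are collected in an AttributeDict so they can be read
    # either as attributes (hparams.lr) or as dict keys (hparams['lr'])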
    hparams = AttributeDict({
        'accel': None,
        'autolr': False,
        'batch_size': 2,
        'check_val_n': 1,
        'dev': False,
        'gpus': None,
        'log_path': DATADIR.joinpath('logs'),
        'lr': 0.0001,
        'lr_schedule': 'ROP',
        'max_epochs': 100,
        'num_nodes': 1,
        'num_workers': 0,
        'pl_ver': pl.__version__,
        'seed': 22117,
        'weight_decay': 1e-07
    })

    train(hparams)
Example #18
    def run_training_batch(self, batch, batch_idx, dataloader_idx):
        # track grad norms
        grad_norm_dic = {}

        # bookkeeping
        self.trainer.hiddens = None

        # track all outputs across time and num of optimizers
        batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_batch_start")
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # hook
        response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
        if response == -1:
            return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

        # lightning module hook
        splits = self.tbptt_split_batch(batch)

        for split_idx, split_batch in enumerate(splits):

            # create an iterable for optimizers and loop over them
            for opt_idx, optimizer in self.prepare_optimizers():

                # toggle model params + set info to logger_connector
                self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)

                if self.should_accumulate():
                    # For gradient accumulation

                    # -------------------
                    # calculate loss (train step + train step end)
                    # -------------------

                    # automatic_optimization=True: perform ddp sync only when performing optimizer_step
                    # automatic_optimization=False: don't block synchronization here
                    with self.block_ddp_sync_behaviour():
                        self.training_step_and_backward(
                            split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                        )

                    batch_outputs = self._process_closure_result(
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients

                else:
                    if self.automatic_optimization:
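                        # wrap forward + backward in a closure so optimizer.step() can
                        # re-evaluate the loss when the optimizer requires it (e.g. LBFGS)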

                        def train_step_and_backward_closure():
                            result = self.training_step_and_backward(
                                split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                            )
                            return None if result is None else result.loss

                        # optimizer step
                        self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                    else:
                        self._curr_step_result = self.training_step(
                            split_batch, batch_idx, opt_idx, self.trainer.hiddens
                        )

                    if self._curr_step_result is None:
                        # user decided to skip optimization
                        # make sure to zero grad.
                        continue

                    batch_outputs = self._process_closure_result(
                        batch_outputs=batch_outputs,
                        opt_idx=opt_idx,
                    )

                    # todo: Properly aggregate grad_norm across opt_idx and split_idx
                    grad_norm_dic = self._cur_grad_norm_dict
                    self._cur_grad_norm_dict = None

                    # update running loss + reset accumulated loss
                    self.update_running_loss()

        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            training_step_output_for_epoch_end=batch_outputs,
        )
        return result
Example #19
    def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer,
                          hiddens):
        """
        wrap the forward step in a closure so second order methods work
        """
        # ---------------------------
        # FORWARD
        # ---------------------------
        with self.profiler.profile('model_forward'):
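            # run the forward pass under torch.cuda.amp.autocast when native AMP
            # is enabled; otherwise call training_forward directly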
            if self.use_amp and NATIVE_AMP_AVALAIBLE:
                with torch.cuda.amp.autocast():
                    training_step_output = self.training_forward(
                        split_batch, batch_idx, opt_idx, hiddens)
            else:
                training_step_output = self.training_forward(
                    split_batch, batch_idx, opt_idx, hiddens)

            # ----------------------------
            # PROCESS THE RESULT
            # ----------------------------
            # format and reduce outputs accordingly
            training_step_output = self.process_output(training_step_output,
                                                       train=True)

            # TODO: temporary part of structured results PR
            training_step_output = AttributeDict(
                batch_loss=training_step_output[0],
                pbar_on_batch_end=training_step_output[1],
                log_metrics=training_step_output[2],
                callback_metrics=training_step_output[3],
                hiddens=training_step_output[4],
            )

            # if the user decides to finally reduce things in epoch_end, save raw output without graphs
            training_step_output_for_epoch_end = recursive_detach(
                training_step_output)

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches

        # backward pass
        model_ref = self.get_model()
        with self.profiler.profile('model_backward'):
            # scale loss for 16 bit
            if self.precision == 16 and not self.on_tpu:
                closure_loss = model_ref.amp_scale_loss(
                    closure_loss, optimizer, opt_idx)

            # do backward pass
            model_ref.backward(self, closure_loss, optimizer, opt_idx)

            # once backward has been applied, release graph
            closure_loss = closure_loss.detach()
            training_step_output.batch_loss = training_step_output.batch_loss.detach()

        if self.use_horovod:
            # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
            optimizer.synchronize()

        # insert after step hook
        if self.is_function_implemented('on_after_backward'):
            model_ref = self.get_model()
            with self.profiler.profile('on_after_backward'):
                model_ref.on_after_backward()

        result = AttributeDict(
            loss=closure_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
        )
        return result
Example #20
    def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
        with self.trainer.profiler.profile('model_forward'):
            args = self.build_train_args(split_batch, batch_idx, opt_idx,
                                         hiddens)
            training_step_output = self.trainer.accelerator_backend.training_step(
                args)
            training_step_output = self.trainer.call_hook(
                'training_step_end', training_step_output)

            # ----------------------------
            # PROCESS THE RESULT
            # ----------------------------
            # format and reduce outputs accordingly
            training_step_output_for_epoch_end = training_step_output
            is_result_obj = isinstance(training_step_output, Result)

            # track batch size for weighted average
            if is_result_obj:
                training_step_output.track_batch_size(len(split_batch))

            # don't allow EvalResult in the training_step
            if isinstance(training_step_output, EvalResult):
                raise MisconfigurationException(
                    'training_step cannot return EvalResult, '
                    'use a dict or TrainResult instead')

            # handle regular dicts
            if not is_result_obj:
                training_step_output = self.trainer.process_output(
                    training_step_output, train=True)

                training_step_output = AttributeDict(
                    batch_loss=training_step_output[0],
                    pbar_on_batch_end=training_step_output[1],
                    log_metrics=training_step_output[2],
                    callback_metrics=training_step_output[3],
                    hiddens=training_step_output[4],
                )

            # if the user decides to finally reduce things in epoch_end, save raw output without graphs
            if isinstance(training_step_output_for_epoch_end, torch.Tensor):
                training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
            elif is_result_obj:
                training_step_output_for_epoch_end = copy(training_step_output)
                training_step_output_for_epoch_end.detach()
            else:
                training_step_output_for_epoch_end = recursive_detach(
                    training_step_output_for_epoch_end)

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        closure_loss = training_step_output.minimize if is_result_obj else training_step_output.batch_loss
        closure_loss = closure_loss / self.trainer.accumulate_grad_batches

        # the loss will get scaled for amp. avoid any modifications to it
        untouched_loss = closure_loss.detach().clone()

        # result
        result = AttributeDict(
            closure_loss=closure_loss,
            loss=untouched_loss,
            training_step_output=training_step_output,
            training_step_output_for_epoch_end=training_step_output_for_epoch_end,
            hiddens=training_step_output.hiddens,
        )
        return result
Example #21
def main(params):
    # Display date and time
    print(datetime.datetime.now())
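    # echo the run configuration: vars(params) yields a plain dict, wrapped in
    # AttributeDict for attribute-style access and a tidier printout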
    print(AttributeDict(vars(params)))

    # Load training data
    train_dataset = COCO(
        directory_a=params.modified,
        directory_b=params.original,
        num_training=params.num_training,
        num_validation=params.num_validation,
        size=params.size,
        channels=params.channels,
        shuffle=params.shuffle,
        cache=params.cache,
        validation=False
    )
    train_loader = setup.load(train_dataset, params.batch_size, False)

    # Load validation data
    val_dataset = COCO(
        directory_a=params.modified,
        directory_b=params.original,
        num_training=params.num_training,
        num_validation=params.num_validation,
        size=params.size,
        channels=params.channels,
        shuffle=params.shuffle,
        cache=params.cache,
        validation=True
    )
    val_loader = setup.load(val_dataset, params.batch_size, False)

    # Create model
    model = CycleGAN(
        train_loader=train_loader,
        val_loader=val_loader,
        batch_size=params.batch_size,
        iterations=params.epochs * params.num_training,
        in_channels=params.channels,
        out_channels=params.channels,
        g_filters=params.g_filters,
        d_filters=params.d_filters,
        residual_blocks=params.residual_blocks,
        dropout=params.dropout,
        skip=params.skip,
        learning_rate=params.lr,
        beta_1=params.b1,
        beta_2=params.b2,
        init_type=params.init_type,
        init_scale=params.init_scale,
        pool_size_a=params.pool_size_a,
        pool_size_b=params.pool_size_b,
        lambda_dis_a=params.lambda_dis_a,
        lambda_dis_b=params.lambda_dis_b,
        lambda_gen_a=params.lambda_gen_a,
        lambda_gen_b=params.lambda_gen_b,
        lambda_cycle_a=params.lambda_cycle_a,
        lambda_cycle_b=params.lambda_cycle_b,
        lambda_id_a=params.lambda_id_a,
        lambda_id_b=params.lambda_id_b,
        shuffle=params.shuffle
    )

    # Set up trainer
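    # save_top_k=1 keeps only the best checkpoint by val_loss; -1 keeps a checkpoint for every epoch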
    checkpoint = ModelCheckpoint(
        dirpath='checkpoints',
        filename=params.prefix + '_{epoch:03d}',
        save_top_k=1 if params.save_top_only else -1,
        monitor='val_loss',
        verbose=True
    )
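    # use DDP across all visible GPUs when CUDA is available; otherwise fall back to a CPU-only Trainer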
    if setup.cuda_is_available():
        trainer = Trainer(
            accelerator='ddp',
            gpus=setup.cuda_device_count(),
            callbacks=[checkpoint],
            max_epochs=params.epochs,
            precision=params.precision
        )
    else:
        trainer = Trainer(
            callbacks=[checkpoint],
            max_epochs=params.epochs,
            precision=params.precision
        )

    # Train
    trainer.fit(model)