def run_train(self):
    cfg = self.cfg
    train_iterator = self.get_data_iterator(
        cfg.train.batch_size,
        True,
        shuffle=True,
        shuffle_seed=cfg.seed,
        offset=self.start_batch,
        rank=cfg.local_rank,
    )

    max_iterations = train_iterator.get_max_iterations()
    logger.info("  Total iterations per epoch=%d", max_iterations)
    if max_iterations == 0:
        logger.warning("No data found for training.")
        return

    updates_per_epoch = (
        train_iterator.max_iterations // cfg.train.gradient_accumulation_steps
    )

    total_updates = updates_per_epoch * cfg.train.num_train_epochs
    logger.info("  Total updates=%d", total_updates)

    warmup_steps = cfg.train.warmup_steps

    if self.scheduler_state:
        # TODO: ideally we'd want to just call
        # scheduler.load_state_dict(self.scheduler_state)
        # but it doesn't work properly as of now
        logger.info("Loading scheduler state %s", self.scheduler_state)
        shift = int(self.scheduler_state["last_epoch"])
        logger.info("Steps shift %d", shift)
        scheduler = get_schedule_linear(
            self.optimizer,
            warmup_steps,
            total_updates,
            steps_shift=shift,
        )
    else:
        scheduler = get_schedule_linear(self.optimizer, warmup_steps, total_updates)

    # eval_step = math.ceil(updates_per_epoch / cfg.train.eval_per_epoch)
    eval_step = cfg.train.eval_every_epoch
    logger.info("  Eval step = %d", eval_step)
    logger.info("***** Training *****")

    for epoch in range(self.start_epoch, int(cfg.train.num_train_epochs)):
        logger.info("***** Epoch %d *****", epoch)
        self._train_epoch(scheduler, epoch, eval_step, train_iterator)

    if cfg.local_rank in [-1, 0]:
        logger.info(
            "Training finished. Best validation checkpoint %s", self.best_cp_name
        )
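# get_schedule_linear is not defined in this section. Below is a minimal,
# hedged sketch of a linear warmup/decay schedule supporting the steps_shift
# resume offset used above, assuming it wraps PyTorch's LambdaLR; the exact
# signature and decay floor are assumptions, not the verified implementation.
from torch.optim.lr_scheduler import LambdaLR


def get_schedule_linear(
    optimizer,
    warmup_steps,
    training_steps,
    steps_shift=0,  # offset applied when resuming from a checkpoint
    last_epoch=-1,
):
    """Linearly warm up to the base LR, then linearly decay toward zero."""

    def lr_lambda(current_step):
        # Shift the step counter so a resumed run continues the schedule
        # where the checkpoint stopped instead of warming up again.
        current_step += steps_shift
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        return max(
            1e-7,  # small floor so the LR never decays exactly to zero
            float(training_steps - current_step)
            / float(max(1, training_steps - warmup_steps)),
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)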
def run_train(self):
    cfg = self.cfg
    train_iterator = self.get_data_iterator(
        cfg.train_files,
        cfg.train.batch_size,
        True,
        shuffle=True,
        shuffle_seed=cfg.seed,
        offset=self.start_batch,
    )

    # num_train_epochs = cfg.train.num_train_epochs - self.start_epoch
    logger.info("Total iterations per epoch=%d", train_iterator.max_iterations)
    updates_per_epoch = (
        train_iterator.max_iterations // cfg.train.gradient_accumulation_steps
    )

    total_updates = updates_per_epoch * cfg.train.num_train_epochs
    logger.info("  Total updates=%d", total_updates)

    warmup_steps = cfg.train.warmup_steps

    if self.scheduler_state:
        logger.info("Loading scheduler state %s", self.scheduler_state)
        shift = int(self.scheduler_state["last_epoch"])
        logger.info("Steps shift %d", shift)
        # steps_shift resumes the LR schedule where the checkpoint stopped.
        scheduler = get_schedule_linear(
            self.optimizer,
            warmup_steps,
            total_updates,
            steps_shift=shift,
        )
    else:
        scheduler = get_schedule_linear(self.optimizer, warmup_steps, total_updates)

    eval_step = cfg.train.eval_step
    logger.info("  Eval step = %d", eval_step)
    logger.info("***** Training *****")
    global_step = self.start_epoch * updates_per_epoch + self.start_batch

    for epoch in range(self.start_epoch, cfg.train.num_train_epochs):
        logger.info("***** Epoch %d *****", epoch)
        global_step = self._train_epoch(
            scheduler, epoch, eval_step, train_iterator, global_step
        )

    if cfg.local_rank in [-1, 0]:
        logger.info(
            "Training finished. Best validation checkpoint %s", self.best_cp_name
        )

    return
def run_train(self):
    args = self.args

    upsample_rates = None
    if args.train_files_upsample_rates is not None:
        # Parses a Python literal (e.g. "[1, 2, 0.5]") from the config string;
        # see the safer literal_eval sketch after this function.
        upsample_rates = eval(args.train_files_upsample_rates)

    train_iterator = self.get_data_iterator(
        args.train_file,
        args.batch_size,
        shuffle=True,
        shuffle_seed=args.seed,
        offset=self.start_batch,
        upsample_rates=upsample_rates,
    )

    logger.info("  Total iterations per epoch=%d", train_iterator.max_iterations)
    updates_per_epoch = (
        train_iterator.max_iterations // args.gradient_accumulation_steps
    )

    # Resume-aware total: full epochs remaining after the current one, plus
    # the updates left in the partially completed epoch.
    total_updates = max(
        updates_per_epoch * (args.num_train_epochs - self.start_epoch - 1), 0
    ) + (
        train_iterator.max_iterations - self.start_batch
    ) // args.gradient_accumulation_steps
    logger.info("  Total updates=%d", total_updates)

    warmup_steps = args.warmup_steps
    scheduler = get_schedule_linear(self.optimizer, warmup_steps, total_updates)

    if self.scheduler_state:
        logger.info("Loading scheduler state %s", self.scheduler_state)
        scheduler.load_state_dict(self.scheduler_state)

    eval_step = math.ceil(updates_per_epoch / args.eval_per_epoch)
    logger.info("  Eval step = %d", eval_step)
    logger.info("***** Training *****")

    for epoch in range(self.start_epoch, int(args.num_train_epochs)):
        logger.info("***** Epoch %d *****", epoch)
        self._train_epoch(scheduler, epoch, eval_step, train_iterator)

    if args.local_rank in [-1, 0]:
        logger.info(
            "Training finished. Best validation checkpoint %s", self.best_cp_name
        )
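# The eval() call above executes arbitrary code from the config string. A
# hedged sketch of a safer parser (parse_upsample_rates is a hypothetical
# helper, not part of the original code) using ast.literal_eval:
import ast


def parse_upsample_rates(spec):
    """Parse a config string like "[1, 2, 0.5]" into a list of float rates.

    ast.literal_eval only accepts Python literals, so a malformed or
    malicious config value raises an error instead of executing code.
    """
    rates = ast.literal_eval(spec)
    if not isinstance(rates, (list, tuple)):
        raise ValueError("upsample rates must be a list, got %r" % (rates,))
    return [float(r) for r in rates]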
def run_train(self):
    args = self.args
    train_iterator = self.get_data_iterator(
        args.train_file,
        args.batch_size,
        True,
        shuffle=True,
        shuffle_seed=args.seed,
        offset=self.start_batch,
    )

    num_train_epochs = args.num_train_epochs - self.start_epoch
    logger.info("Total iterations per epoch=%d", train_iterator.max_iterations)
    updates_per_epoch = (
        train_iterator.max_iterations // args.gradient_accumulation_steps
    )

    total_updates = updates_per_epoch * num_train_epochs - self.start_batch
    logger.info("  Total updates=%d", total_updates)

    warmup_steps = args.warmup_steps
    scheduler = get_schedule_linear(
        self.optimizer, warmup_steps=warmup_steps, training_steps=total_updates
    )
    if self.scheduler_state:
        logger.info("Loading scheduler state %s", self.scheduler_state)
        scheduler.load_state_dict(self.scheduler_state)

    eval_step = args.eval_step
    logger.info("  Eval step = %d", eval_step)
    logger.info("***** Training *****")
    global_step = self.start_epoch * updates_per_epoch + self.start_batch

    for epoch in range(self.start_epoch, int(args.num_train_epochs)):
        logger.info("***** Epoch %d *****", epoch)
        global_step = self._train_epoch(
            scheduler, epoch, eval_step, train_iterator, global_step
        )

    if args.local_rank in [-1, 0]:
        logger.info(
            "Training finished. Best validation checkpoint %s", self.best_cp_name
        )

    return
updates_per_epoch = (
    train_iterator.max_iterations // args.gradient_accumulation_steps
)
total_updates = (
    max(updates_per_epoch * (args.num_train_epochs - self.start_epoch - 1), 0)
    + (train_iterator.max_iterations - self.start_batch)
    // args.gradient_accumulation_steps
)
logger.info("  Total updates=%d", total_updates)

warmup_steps = args.warmup_steps

# TODO: there is a bug, fixed in master, where the LR scheduler decays to 0
# and keeps causing trouble; as a workaround, the schedule is disabled here
# by forcing total_updates to 0.
total_updates = 0
logger.warning("Not using the LR schedule: total_updates forced to 0")
scheduler = get_schedule_linear(self.optimizer, warmup_steps, total_updates)

if self.scheduler_state:
    logger.info("Loading scheduler state %s", self.scheduler_state)
    logger.warning("Not using the LR schedule: restoring base LRs")
    # Workaround: reset the restored LR to the base LR so a resumed run does
    # not continue from a decayed-to-zero learning rate.
    self.scheduler_state["_last_lr"] = self.scheduler_state["base_lrs"]
    scheduler.load_state_dict(self.scheduler_state)

eval_step = math.ceil(updates_per_epoch / args.eval_per_epoch)
logger.info("  Eval step = %d", eval_step)
logger.info("***** Training *****")

for epoch in range(self.start_epoch, int(args.num_train_epochs)):
    logger.info("***** Epoch %d *****", epoch)
    self._train_epoch(scheduler, epoch, eval_step, train_iterator)
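# For context on the _last_lr workaround above: a PyTorch scheduler's
# state_dict includes base_lrs and _last_lr, so patching _last_lr before
# load_state_dict makes the resumed scheduler report the base LR. A minimal
# standalone sketch; the toy model and names are assumptions, not the
# original trainer.
import torch
from torch import nn
from torch.optim.lr_scheduler import LambdaLR

model = nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = LambdaLR(optimizer, lambda step: 1.0)

# ...after some training, checkpoint the scheduler state
state = scheduler.state_dict()

# On resume: patch the restored "last" LR back to the base LR before loading,
# mirroring the workaround above.
state["_last_lr"] = state["base_lrs"]
resumed = LambdaLR(optimizer, lambda step: 1.0)
resumed.load_state_dict(state)
print(resumed.get_last_lr())  # [0.1]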