def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
    lr_per_mb = [1.0]*30 + [0.1]*30 + [0.01]*20 + [0.001]
    l2_reg_weight = 0.0001

    # adjust LR with minibatch size
    if minibatch_size != 256:
        for i in range(0, len(lr_per_mb)):
            lr_per_mb[i] *= minibatch_size / 256

    # Set learning parameters
    lr_schedule = learning_rate_schedule(lr_per_mb, epoch_size=epoch_size, unit=UnitType.minibatch)
    mm_schedule = momentum_schedule(0.9)
    local_learner = nesterov(network['output'].parameters, lr_schedule, mm_schedule,
                             l2_regularization_weight=l2_reg_weight)

    # learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['errs']), learner, progress_printer)

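# Assumed imports for the module-level create_trainer helpers in this section (the
# function above and the two similar ones further down). This is an assumption about
# the surrounding file, not part of the original snippets; all of these names are
# exposed at the top level of the cntk package in CNTK 2.x.
import numpy as np
from cntk import (Trainer, UnitType, learning_rate_schedule, momentum_schedule,
                  momentum_as_time_constant_schedule, nesterov, momentum_sgd,
                  block_momentum_distributed_learner, data_parallel_distributed_learner)
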
def create_trainer(self):
    try:
        p = self.output.parameters

        # Three of the four parameters are learned by block_momentum_distributed_learner.
        bmd_learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd([p[0], p[1], p[2]],
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        # New API to mark which learner is to be used for metric aggregation.
        bmd_learner.set_as_metric_aggregator()

        # The last parameter is learned by the data_parallel_distributed_learner.
        momentum_schedule = cntk.momentum_schedule_per_sample(0.9990913221888589)
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        dpd_learner = cntk.data_parallel_distributed_learner(
            cntk.momentum_sgd([p[3]], lr_per_sample, momentum_schedule, True))

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [bmd_learner, dpd_learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return

def create_distributed_learner(self, mode, config):
    local_learner = C.sgd(self.z.parameters, C.learning_parameter_schedule_per_sample(0.01))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32, distributed_after=0)
            learner = C.data_parallel_distributed_learner(
                local_learner,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        elif mode == 'block_momentum':
            if config is None:
                # the default config to match data parallel SGD
                config = BlockMomentumConfig(block_momentum_as_time_constant=0,
                                             block_learning_rate=1,
                                             block_size=NUM_WORKERS,
                                             distributed_after=0)
            learner = C.block_momentum_distributed_learner(
                local_learner,
                block_momentum_as_time_constant=config.block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        else:
            learner = local_learner
    except RuntimeError:
        learner = None
    return learner

def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
    if network['name'] == 'resnet20':
        lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
    elif network['name'] == 'resnet110':
        lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)

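# A minimal usage sketch (not from the original source) of how the distributed Trainer
# returned by the create_trainer helpers above might be driven for one epoch.
# `train_source`, `input_map`, `minibatch_size` and `epoch_size` are hypothetical
# placeholders; the MinibatchSource partitioning and Communicator calls follow the
# standard CNTK 2.x data-parallel pattern.
import cntk

def train_one_epoch_sketch(trainer, train_source, input_map, minibatch_size, epoch_size):
    workers = cntk.distributed.Communicator.num_workers()
    rank = cntk.distributed.Communicator.rank()
    samples_seen = 0
    while samples_seen < epoch_size:
        # each worker reads only its own partition of the data
        data = train_source.next_minibatch(minibatch_size, input_map=input_map,
                                           num_data_partitions=workers, partition_index=rank)
        trainer.train_minibatch(data)  # gradient aggregation is handled by the distributed learner
        samples_seen += trainer.previous_minibatch_sample_count

# After all epochs, every worker must call Communicator.finalize() before the MPI job exits:
#   cntk.distributed.Communicator.finalize()
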
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
    lr_per_mb = [0.1]  # [1.0]*30 + [0.1]*30 + [0.01]*20 + [0.001]
    l2_reg_weight = 0.0001

    # adjust LR with minibatch size
    #if minibatch_size != 256:
    #    for i in range(0, len(lr_per_mb)):
    #        lr_per_mb[i] *= minibatch_size / 256

    # Set learning parameters
    lr_schedule = learning_rate_schedule(lr_per_mb, epoch_size=epoch_size, unit=UnitType.minibatch)
    mm_schedule = momentum_schedule(0.9)
    local_learner = nesterov(network['output'].parameters, lr_schedule, mm_schedule,
                             l2_regularization_weight=l2_reg_weight)

    # learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['errs']), learner, progress_printer)

def create_trainer(self):
    try:
        learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd(self.output.parameters,
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)
        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return

def create_trainer(self):
    learner = cntk.block_momentum_distributed_learner(
        cntk.momentum_sgd(self.output.parameters,
                          cntk.learning_parameter_schedule(0.0001),
                          cntk.momentum_as_time_constant_schedule(1000)),
        block_size=1000,
        block_learning_rate=0.01,
        block_momentum_as_time_constant=1000)
    comm_rank = cntk.distributed.Communicator.rank()
    self.trainer = cntk.Trainer(
        self.output, (self.ce, self.err), [learner],
        [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])

def create_distributed_learner(self, mode, config):
    local_learner = C.sgd(self.z.parameters, C.learning_parameter_schedule_per_sample(0.01))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32, distributed_after=0)
            learner = C.data_parallel_distributed_learner(
                local_learner,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        elif mode == 'block_momentum':
            if config is None:
                # the default config to match data parallel SGD
                config = BlockMomentumConfig(block_momentum_as_time_constant=0,
                                             block_learning_rate=1,
                                             block_size=NUM_WORKERS,
                                             distributed_after=0)
            learner = C.block_momentum_distributed_learner(
                local_learner,
                block_momentum_as_time_constant=config.block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        else:
            learner = local_learner
    except RuntimeError:
        learner = None
    return learner

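# A minimal sketch (an assumption, not from the original source) of the config
# containers that the two create_distributed_learner variants above expect.
# Field names mirror exactly the attributes those functions read; NUM_WORKERS is a
# hypothetical module-level constant, here taken from the MPI world size.
from collections import namedtuple

import cntk as C

DataParallelConfig = namedtuple(
    'DataParallelConfig', ['num_quantization_bits', 'distributed_after'])
BlockMomentumConfig = namedtuple(
    'BlockMomentumConfig', ['block_momentum_as_time_constant', 'block_learning_rate',
                            'block_size', 'distributed_after'])

NUM_WORKERS = C.distributed.Communicator.num_workers()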