def train_process(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
        init()

    # network
    net = resnet50(class_num=config.class_num)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean",
                                            smooth_factor=config.label_smooth_factor,
                                            num_classes=config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=config.batch_size)

    step_size = dataset.get_dataset_size()
    eval_interval = config.eval_interval
    dataset.__loop_size__ = step_size * eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = Tensor(get_learning_rate(lr_init=config.lr_init,
                                  lr_end=0.0,
                                  lr_max=config.lr_max,
                                  warmup_epochs=config.warmup_epochs,
                                  total_epochs=config.epoch_size,
                                  steps_per_epoch=step_size,
                                  lr_decay_mode=config.lr_decay_mode))

    # optimizer: apply weight decay to everything except beta, gamma and bias
    decayed_params = list(filter(lambda x: 'beta' not in x.name and
                                 'gamma' not in x.name and
                                 'bias' not in x.name,
                                 net.trainable_params()))
    no_decayed_params = [param for param in net.trainable_params()
                         if param not in decayed_params]
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params, 'weight_decay': 0.0},
                    {'order_params': net.trainable_params()}]

    if config.use_lars:
        momentum = nn.Momentum(group_params, lr, config.momentum,
                               loss_scale=config.loss_scale,
                               use_nesterov=config.use_nesterov)
        opt = nn.LARS(momentum,
                      epsilon=config.lars_epsilon,
                      coefficient=config.lars_coefficient,
                      lars_filter=lambda x: 'beta' not in x.name and
                      'gamma' not in x.name and 'bias' not in x.name)
    else:
        opt = nn.Momentum(group_params, lr, config.momentum,
                          loss_scale=config.loss_scale,
                          use_nesterov=config.use_nesterov)

    # model
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  metrics={'acc': DistAccuracy(batch_size=config.eval_batch_size,
                                               device_num=device_num)},
                  eval_network=dist_eval_network)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    print("run_start", device_id)
    acc = 0.0
    time_cost = 0.0
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(1, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
              "total_cost {:.2f} ms".format(epoch_idx, device_id, loss, acc,
                                            time_cost, eval_cost,
                                            time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
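
# --- Usage sketch (assumption, not from the original file) ---
# train_process is written to run one copy per Ascend device: it pins the
# device via context.set_context(device_id=...), exports RANK_ID/RANK_SIZE
# for HCCL, and reports its final accuracy and per-step cost through the
# multiprocessing queue `q`. A minimal launcher for a data-parallel job
# might look like this; the helper name and default argument values are
# illustrative assumptions.
from multiprocessing import Process, Queue

def launch_train_process(device_num=4, epoch_size=2, enable_hccl=True):
    """Hypothetical launcher: one training process per device."""
    q = Queue()
    processes = [Process(target=train_process,
                         args=(q, device_id, epoch_size, device_num, enable_hccl))
                 for device_id in range(device_num)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # each worker ends with q.put({'acc': ..., 'cost': ...})
    return [q.get() for _ in range(device_num)]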
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
    os.environ['RANK_ID'] = str(device_id - 4)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [107], "hccl_world_groupsum1")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum2")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum3")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum4")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum5")
        init()

    # network
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50_thor(class_num=thor_config.class_num,
                        damping=damping,
                        loss_scale=thor_config.loss_scale,
                        frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean",
                                            smooth_factor=thor_config.label_smooth_factor,
                                            num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=thor_config.batch_size)

    step_size = dataset.get_dataset_size()
    eval_interval = thor_config.eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=thor_config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = Tensor(get_model_lr(0, 0.045, 6, 70, 5004))

    # optimizer
    opt = THOR(filter(lambda x: x.requires_grad, net.get_parameters()),
               lr,
               thor_config.momentum,
               filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
               filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
               filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               thor_config.weight_decay,
               thor_config.loss_scale)

    # model
    model = THOR_Model(net,
                       loss_fn=loss,
                       optimizer=opt,
                       loss_scale_manager=loss_scale,
                       amp_level="O2",
                       keep_batchnorm_fp32=False,
                       metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size,
                                                    device_num=device_num)},
                       eval_network=dist_eval_network,
                       frequency=thor_config.frequency)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    acc = 0.0
    time_cost = 0.0
    print("run_start", device_id)
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(eval_interval, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
              "total_cost {:.2f} ms".format(epoch_idx, device_id, loss, acc,
                                            time_cost, eval_cost,
                                            time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
# Updated variant of train_process_thor built against the refactored THOR API
# (gradients_mean / all_reduce_fusion_config and the simplified THOR optimizer
# signature); note that it shadows the definition above if both are kept in
# the same module.
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
    os.environ['RANK_ID'] = str(device_id - 4)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[85, 160])
        init()

    # network
    net = resnet50_thor(thor_config.class_num)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=thor_config.label_smooth_factor,
                              num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset_thor(dataset_path=dataset_path,
                                  do_train=True,
                                  repeat_num=1,
                                  batch_size=thor_config.batch_size)
    step_size = dataset.get_dataset_size()
    eval_interval = thor_config.eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=1,
                                  batch_size=thor_config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = get_thor_lr(0, 0.05803, 4.04839, 53, 5004, decay_epochs=39)
    damping = get_thor_damping(0, 0.02714, 0.50036, 70, 5004)

    # optimizer
    split_indices = [26, 53]
    opt = THOR(net,
               Tensor(lr),
               Tensor(damping),
               thor_config.momentum,
               thor_config.weight_decay,
               thor_config.loss_scale,
               thor_config.batch_size,
               split_indices=split_indices)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    # model
    model = THOR_Model(net,
                       loss_fn=loss,
                       optimizer=opt,
                       loss_scale_manager=loss_scale,
                       amp_level="O2",
                       keep_batchnorm_fp32=False,
                       metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size,
                                                    device_num=device_num)},
                       eval_network=dist_eval_network,
                       frequency=thor_config.frequency)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    acc = 0.0
    time_cost = 0.0
    print("run_start", device_id)
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(eval_interval, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, "
              "total_cost {:.2f} ms".format(epoch_idx, device_id, loss, acc,
                                            time_cost, eval_cost,
                                            time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
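
# --- End-to-end driver sketch (assumption, not from the original file) ---
# The THOR functions map RANK_ID to device_id - 4 and read a second HCCL
# config, which suggests an 8-device test that trains plain ResNet50 on
# devices 0-3 while THOR runs on devices 4-7. A hedged driver for that
# layout is sketched below; the function name and epoch counts are
# illustrative. (Process/Queue are imported in the launcher sketch above.)
def run_resnet_and_thor_8p(epoch_size=2, enable_hccl=True):
    """Hypothetical 8p driver: ResNet50 on devices 0-3, THOR on devices 4-7."""
    q, q_thor = Queue(), Queue()
    procs = [Process(target=train_process,
                     args=(q, device_id, epoch_size, 4, enable_hccl))
             for device_id in range(4)]
    procs += [Process(target=train_process_thor,
                      args=(q_thor, device_id, epoch_size, 4, enable_hccl))
              for device_id in range(4, 8)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    return ([q.get() for _ in range(4)],        # ResNet50 per-device results
            [q_thor.get() for _ in range(4)])   # THOR per-device results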