Example #1
def train():
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(rank_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)

    lrs = dynamic_lr(config.BASE_LR, config.TRAIN_TOTAL_ITER, config.WARMUP_STEP, config.WARMUP_RATIO)
    opt = nn.SGD(params=net.trainable_params(), learning_rate=lrs, momentum=0.99, weight_decay=5e-4)

    # wrap the network with loss and training cells
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net, opt, reduce_flag=True, mean=True, degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # set checkpoint saving parameters
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet", config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))

    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM, ds, dataset_sink_mode=True, callbacks=[time_cb, loss_cb, ckpoint_cb])
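
The pattern above folds the loss into the network with WithLossCell/TrainOneStepCell, so Model is built without loss_fn or optimizer. A minimal, self-contained sketch of that pattern, using a toy network and random data (none of these names come from the original script):

import numpy as np
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.train.model import Model

# toy network, loss and optimizer
net = nn.Dense(16, 4)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.SGD(net.trainable_params(), learning_rate=0.01)

# fuse loss into the network, then attach the optimizer
net_with_loss = nn.WithLossCell(net, loss)
train_net = nn.TrainOneStepCell(net_with_loss, opt)

# random data wrapped as a MindSpore dataset
x = np.random.randn(32, 16).astype(np.float32)
y = np.random.randint(0, 4, (32,)).astype(np.int32)
dataset = ds.NumpySlicesDataset((x, y), column_names=["data", "label"], shuffle=False).batch(8)

# Model receives the already-wrapped cell, so no loss_fn/optimizer here
model = Model(train_net)
model.train(1, dataset, dataset_sink_mode=False)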
Example #2
def test_pynative_resnet50():
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

    batch_size = 32
    num_classes = 10
    loss_scale = 128
    total_step = 50
    net = resnet50(batch_size, num_classes)
    optimizer = Momentum(learning_rate=0.01,
                         momentum=0.9,
                         params=filter(lambda x: x.requires_grad,
                                       net.get_parameters()))
    data_set = create_dataset(repeat_num=1,
                              training=True,
                              batch_size=batch_size,
                              num_samples=total_step * batch_size)

    # define callbacks
    time_cb = MyTimeMonitor(data_size=data_set.get_dataset_size())
    loss_cb = LossMonitor()
    cb = [time_cb, loss_cb]

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss_scale = FixedLossScaleManager(loss_scale=loss_scale,
                                       drop_overflow_update=False)
    model = Model(net,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'},
                  amp_level="O2",
                  keep_batchnorm_fp32=False)

    # train model
    model.train(1,
                data_set,
                callbacks=cb,
                sink_size=data_set.get_dataset_size(),
                dataset_sink_mode=True)

    assert time_cb.good_step() > 10
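
FixedLossScaleManager with drop_overflow_update=False keeps a constant scale and leaves the un-scaling to the optimizer, which is why the same value is usually passed to both. A minimal sketch of that pairing with a toy network (amp_level and other device-specific options omitted; all names here are illustrative):

import numpy as np
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import FixedLossScaleManager

scale = 128.0
net = nn.Dense(16, 4)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
# the optimizer divides gradients by the same scale the manager applies to the loss
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9, loss_scale=scale)
manager = FixedLossScaleManager(loss_scale=scale, drop_overflow_update=False)

x = np.random.randn(32, 16).astype(np.float32)
y = np.random.randint(0, 4, (32,)).astype(np.int32)
dataset = ds.NumpySlicesDataset((x, y), column_names=["data", "label"]).batch(8)

model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=manager)
model.train(1, dataset, dataset_sink_mode=False)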
Example #3
def do_eval(dataset=None, network=None, use_crf="", num_class=41, assessment_method="accuracy", data_file="",
            load_checkpoint_path="", vocab_file="", label_file="", tag_to_index=None, batch_size=1):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
    net_for_pretraining = network(bert_net_cfg, batch_size, False, num_class,
                                  use_crf=(use_crf.lower() == "true"), tag_to_index=tag_to_index)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    if assessment_method == "clue_benchmark":
        from src.cluener_evaluation import submit
        submit(model=model, path=data_file, vocab_file=vocab_file, use_crf=use_crf,
               label_file=label_file, tag_to_index=tag_to_index)
    else:
        if assessment_method == "accuracy":
            callback = Accuracy()
        elif assessment_method == "f1":
            callback = F1((use_crf.lower() == "true"), num_class)
        elif assessment_method == "spanf1":
            callback = SpanF1((use_crf.lower() == "true"), tag_to_index)
        elif assessment_method == "mcc":
            callback = MCC()
        elif assessment_method == "spearman_correlation":
            callback = Spearman_Correlation()
        else:
            raise ValueError("Assessment method not supported, support: [accuracy, f1, mcc, spearman_correlation]")

        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        for data in dataset.create_dict_iterator(num_epochs=1):
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, token_type_id, label_ids = input_data
            logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
            callback.update(logits, label_ids)
        print("==============================================================")
        eval_result_print(assessment_method, callback)
        print("==============================================================")
Example #4
def test_optimizer_cpu():
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    network = Net()
    lr = 0.01
    momentum = 0.9
    micro_batches = 2
    loss = nn.SoftmaxCrossEntropyWithLogits()
    factory = DPOptimizerClassFactory(micro_batches)
    factory.set_mechanisms('Gaussian', norm_bound=1.5, initial_noise_multiplier=5.0)
    net_opt = factory.create('SGD')(params=network.trainable_params(), learning_rate=lr,
                                    momentum=momentum)
    _ = Model(network, loss_fn=loss, optimizer=net_opt, metrics=None)
Example #5
def get_tensor_from_training(
        indices,
        ckpt_file="/tmp/pycharm_project_589/summary_dir-202010191622/weights/-1_350.ckpt",
        node_name="conv1.weight",
        data_type="gradient"):
    context.set_context(reserve_class_name_in_scope=False)
    net = resnet50(batch_size, num_classes)
    load_checkpoint(ckpt_file, net=net)
    ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})
    dataset = create_dataset(indices)

    data_inception_callback = DataInterceptionCallback(node_name=node_name,
                                                       data_type=data_type)
    model.train(1,
                dataset,
                callbacks=[LossMonitor(), data_inception_callback],
                dataset_sink_mode=False)
    return data_inception_callback.result
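
DataInterceptionCallback is project-specific; the generic mechanism it relies on is the Callback protocol, where run_context.original_args() exposes per-step state. A minimal sketch of a callback that records the network output of every step (the class itself is illustrative, not from the original project):

from mindspore.train.callback import Callback

class CaptureOutputs(Callback):
    """Collect the network output (the loss, for a TrainOneStep-style cell) at every step."""

    def __init__(self):
        super(CaptureOutputs, self).__init__()
        self.result = []

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        self.result.append(cb_params.net_outputs)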
Example #6
def bn_common(parallel_mode, train_flag, strategy_loss=None):
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=8)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 8

    predict = Tensor(np.ones([32, 512]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = bn_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss.softmax_cross_entropy.shard(strategy_loss)
    opt = Momentum(net.trainable_params(), learning_rate, momentum, 0.0001,
                   1024 * rank_size)

    if not train_flag:
        net = WithLossCell(net, loss)
        net.set_train()

    if parallel_mode == ParallelMode.DATA_PARALLEL:
        context.set_auto_parallel_context(parameter_broadcast=True)
    model = Model(net, loss, opt)
    if train_flag:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    else:
        model._predict(predict, label)
Example #7
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path=""):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    epoch_num = dataset.get_repeat_count()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(network.trainable_params(),
                                             decay_steps=steps_per_epoch * epoch_num,
                                             learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.learning_rate,
                                             end_learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.end_learning_rate,
                                             power=optimizer_cfg.AdamWeightDecayDynamicLR.power,
                                             warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                             weight_decay=optimizer_cfg.AdamWeightDecayDynamicLR.weight_decay,
                                             eps=optimizer_cfg.AdamWeightDecayDynamicLR.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        optimizer = Lamb(network.trainable_params(), decay_steps=steps_per_epoch * epoch_num,
                         start_learning_rate=optimizer_cfg.Lamb.start_learning_rate,
                         end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                         power=optimizer_cfg.Lamb.power, weight_decay=optimizer_cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                         decay_filter=optimizer_cfg.Lamb.decay_filter)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecayDynamicLR, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier", directory=save_checkpoint_path, config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
    model.train(epoch_num, dataset, callbacks=callbacks)
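
BertFinetuneCell plays the same role as the generic nn.TrainOneStepWithLossScaleCell: it bundles the loss network, the optimizer and a DynamicLossScaleUpdateCell that adjusts the scale when overflow is detected. A minimal sketch of that generic wiring with a toy network (intended for a GPU/Ascend target, where loss-scale overflow detection is supported; all names are illustrative):

import numpy as np
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.train.model import Model

net = nn.Dense(16, 4)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# dynamic loss scale: grows every scale_window clean steps, shrinks on overflow
update_cell = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**12, scale_factor=2, scale_window=1000)
net_with_loss = nn.WithLossCell(net, loss)
train_cell = nn.TrainOneStepWithLossScaleCell(net_with_loss, opt, update_cell)

x = np.random.randn(32, 16).astype(np.float32)
y = np.random.randint(0, 4, (32,)).astype(np.int32)
dataset = ds.NumpySlicesDataset((x, y), column_names=["data", "label"]).batch(8)

model = Model(train_cell)
model.train(1, dataset, dataset_sink_mode=False)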
Example #8
def test_sit_auto_mix_precision_model_o0():
    input_data = np.random.randn(32, 3, 224, 224).astype(np.float32)
    dataset1 = FakeData(size=32,
                        batch_size=32,
                        image_size=(3, 224, 224),
                        num_classes=10,
                        fakedata_mode=FakeDataInitMode.OnesInit)
    dataset1.set_label_data_type(np.float16)
    # graph mode
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(save_graphs=True, save_graphs_path='./test_amp_o0')
    net = Net(3, 10)
    net.to_float(dtype.float16)
    opt = nn.Momentum(params=net.trainable_params(),
                      learning_rate=0.001,
                      momentum=0.0009)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    model = Model(net, loss, opt, amp_level="O0")
    model.train(1, dataset1, dataset_sink_mode=False)
    contend = read_validateir_file('./test_amp_o0')
    castnum = re.findall("Cast", contend)
    assert len(castnum) == 17
    model.predict(Tensor(input_data))
    contend = read_validateir_file('./test_amp_o0')
    castnum = re.findall("Cast", contend)
    assert len(castnum) == 11
Example #9
File: eval.py  Project: yrpang/mindspore
def main():
    """Main entrance for training"""
    args = parser.parse_args()
    print(sys.argv)

    #context.set_context(mode=context.GRAPH_MODE)
    context.set_context(mode=context.PYNATIVE_MODE)

    if args.GPU:
        context.set_context(device_target='GPU', device_id=args.device_id)

    # parse model argument
    assert args.model.startswith(
        "hournas"), "Only HourNAS models are supported."
    net = nasbenchnet()
    cfg = edict({
        'image_height': args.image_size,
        'image_width': args.image_size,
    })
    cfg.batch_size = args.batch_size
    val_data_url = args.data_path
    val_dataset = create_dataset_cifar10(val_data_url,
                                         repeat_num=1,
                                         training=False,
                                         cifar_cfg=cfg)
    loss = LabelSmoothingCrossEntropy(smooth_factor=args.smoothing,
                                      num_classes=args.num_classes)
    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {
        'Validation-Loss': Loss(),
        'Top1-Acc': Top1CategoricalAccuracy(),
        'Top5-Acc': Top5CategoricalAccuracy()
    }
    ckpt = load_checkpoint(args.ckpt)
    load_param_into_net(net, ckpt)
    net.set_train(False)
    model = Model(net, loss, metrics=eval_metrics)
    metrics = model.eval(val_dataset, dataset_sink_mode=False)
    print(metrics)
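
Model.eval runs the whole metric pass in one call once metrics are registered on the Model. A minimal sketch with a toy network and the built-in Accuracy metric (dataset, shapes and metric names are placeholders):

import numpy as np
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.nn.metrics import Accuracy
from mindspore.train.model import Model

net = nn.Dense(8, 3)
net.set_train(False)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
model = Model(net, loss_fn=loss, metrics={"acc": Accuracy()})

x = np.random.randn(30, 8).astype(np.float32)
y = np.random.randint(0, 3, (30,)).astype(np.int32)
val_dataset = ds.NumpySlicesDataset((x, y), column_names=["feature", "label"]).batch(10)

print(model.eval(val_dataset, dataset_sink_mode=False))   # e.g. {'acc': 0.33}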
Example #10
def do_eval(dataset=None,
            network=None,
            use_crf="",
            num_class=41,
            assessment_method="accuracy",
            data_file="",
            load_checkpoint_path="",
            vocab_file="",
            label_file="",
            tag_to_index=None,
            batch_size=1):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model is missing; the evaluation task must load a finetuned model!")
    net_for_pretraining = network(ernie_net_cfg,
                                  batch_size,
                                  False,
                                  num_class,
                                  use_crf=(use_crf.lower() == "true"),
                                  tag_to_index=tag_to_index)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    callback = SpanF1((use_crf.lower() == "true"), tag_to_index)

    columns_list = ["input_ids", "input_mask", "token_type_id", "label_ids"]
    for data in dataset.create_dict_iterator(num_epochs=1):
        input_data = []
        for i in columns_list:
            input_data.append(data[i])
        input_ids, input_mask, token_type_id, label_ids = input_data
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)
    print("==============================================================")
    eval_result_print(assessment_method, callback)
    print("==============================================================")
Example #11
def do_eval(dataset=None,
            network=None,
            num_class=2,
            assessment_method="accuracy",
            load_checkpoint_path=""):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model is missing; the evaluation task must load a finetuned model!")
    net_for_pretraining = network(bert_net_cfg, False, num_class)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    if assessment_method == "accuracy":
        callback = Accuracy()
    elif assessment_method == "f1":
        callback = F1(False, num_class)
    elif assessment_method == "mcc":
        callback = MCC()
    elif assessment_method == "spearman_correlation":
        callback = Spearman_Correlation()
    else:
        raise ValueError(
            "Assessment method not supported, support: [accuracy, f1, mcc, spearman_correlation]"
        )

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, token_type_id, label_ids = input_data
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)
    print("==============================================================")
    eval_result_print(assessment_method, callback)
    print("==============================================================")
Example #12
def bert_predict():
    '''
    Predict function
    '''
    devid = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)
    dataset = get_enwiki_512_dataset(bert_net_cfg.batch_size, 1)
    net_for_pretraining = BertPretrainEva(bert_net_cfg)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset, net_for_pretraining
Example #13
def do_eval(dataset=None, load_checkpoint_path="", eval_batch_size=1):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model is missing; the evaluation task must load a finetuned model!")
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator(num_epochs=1):
        input_data = []
        for i in columns_list:
            input_data.append(data[i])
        input_ids, input_mask, segment_ids, unique_ids = input_data
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()

        for i in range(eval_batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))
    return output
Example #14
def run_fasttext_infer():
    """run infer with FastText"""
    dataset = load_infer_dataset(batch_size=config.batch_size, datafile=args.data_path, bucket=config.test_buckets)
    fasttext_model = FastText(config.vocab_size, config.embedding_dims, config.num_class)

    parameter_dict = load_checkpoint(args.model_ckpt)
    load_param_into_net(fasttext_model, parameter_dict=parameter_dict)

    ft_infer = FastTextInferCell(fasttext_model)

    model = Model(ft_infer)

    predictions = []
    target_sens = []

    for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        target_sens.append(batch['label_idx'])
        src_tokens = Tensor(batch['src_tokens'], mstype.int32)
        src_tokens_length = Tensor(batch['src_tokens_length'], mstype.int32)
        predicted_idx = model.predict(src_tokens, src_tokens_length)
        predictions.append(predicted_idx.asnumpy())

    from sklearn.metrics import accuracy_score, classification_report
    target_sens = np.array(target_sens).flatten()
    merge_target_sens = []
    for target_sen in target_sens:
        merge_target_sens.extend(target_sen)
    target_sens = merge_target_sens
    predictions = np.array(predictions).flatten()
    merge_predictions = []
    for prediction in predictions:
        merge_predictions.extend(prediction)
    predictions = merge_predictions
    acc = accuracy_score(target_sens, predictions)

    result_report = classification_report(target_sens, predictions, target_names=target_label1)
    print("********Accuracy: ", acc)
    print(result_report)
Example #15
def eval_():
    # set args
    dev = "GPU"
    compute_type = str(args_opt.dtype).lower()
    ckpt_dir = str(args_opt.ckpt_path)
    total_batch = int(args_opt.batch_size)
    # init context
    if args_opt.mode == "GRAPH":
        mode = context.GRAPH_MODE
    else:
        mode = context.PYNATIVE_MODE
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=False,
                             repeat_num=1,
                             batch_size=total_batch,
                             target=dev,
                             dtype=compute_type)
    # define net
    net = resnet(class_num=1001, dtype=compute_type)
    # load checkpoint
    param_dict = load_checkpoint(ckpt_dir)
    load_param_into_net(net, param_dict)
    net.set_train(False)
    # define loss, model
    loss = CrossEntropySmooth(sparse=True,
                              reduction='mean',
                              smooth_factor=0.1,
                              num_classes=1001)
    # define model
    model = Model(net,
                  loss_fn=loss,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})
    # eval model
    print("========START EVAL RESNET50 ON GPU ========")
    res = model.eval(dataset)
    print("result:", res, "ckpt=", ckpt_dir)
Example #16
def train_process(q, device_id, epoch_size, num_classes, device_num,
                  batch_size, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    context.set_context(enable_hccl=enable_hccl)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    context.set_context(mode=context.GRAPH_MODE)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    dataset = create_dataset(epoch_size,
                             training=True,
                             batch_size=batch_size,
                             rank_id=device_id,
                             rank_size=device_num,
                             enable_hccl=enable_hccl)
    batch_num = dataset.get_dataset_size()
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    q.put(loss_cb.get_loss())
Example #17
    def epoch_end(self, run_context):
        """evaluate the model and ema-model at the end of each epoch"""
        cb_params = run_context.original_args()
        cur_epoch = cb_params.cur_epoch_num + self._start_epoch - 1

        save_ckpt = (cur_epoch % self.save_epoch == 0)
        load_nparray_into_net(self.ema_network, self.shadow)
        model = Model(self.network,
                      loss_fn=self.loss_fn,
                      metrics=self.eval_metrics)
        model_ema = Model(self.ema_network,
                          loss_fn=self.loss_fn,
                          metrics=self.eval_metrics)
        acc = model.eval(self.eval_dataset,
                         dataset_sink_mode=self.dataset_sink_mode)
        ema_acc = model_ema.eval(self.eval_dataset,
                                 dataset_sink_mode=self.dataset_sink_mode)
        print("Model Accuracy:", acc)
        print("EMA-Model Accuracy:", ema_acc)

        output = [{
            "name": k,
            "data": Tensor(v)
        } for k, v in self.shadow.items()]
        self.ema_accuracy[cur_epoch] = ema_acc["Top1-Acc"]
        if self.best_ema_accuracy < ema_acc["Top1-Acc"]:
            self.best_ema_accuracy = ema_acc["Top1-Acc"]
            self.best_ema_epoch = cur_epoch
            save_checkpoint(output, "ema_best.ckpt")

        if self.best_accuracy < acc["Top1-Acc"]:
            self.best_accuracy = acc["Top1-Acc"]
            self.best_epoch = cur_epoch

        print("Best Model Accuracy: %s, at epoch %s" %
              (self.best_accuracy, self.best_epoch))
        print("Best EMA-Model Accuracy: %s, at epoch %s" %
              (self.best_ema_accuracy, self.best_ema_epoch))

        if save_ckpt:
            # Save the ema_model checkpoints
            ckpt = "{}-{}.ckpt".format("ema", cur_epoch)
            save_checkpoint(output, ckpt)
            save_checkpoint(output, "ema_last.ckpt")

            # Save the model checkpoints
            save_checkpoint(cb_params.train_network, "last.ckpt")

        print("Top 10 EMA-Model Accuracies: ")
        count = 0
        for epoch in sorted(self.ema_accuracy,
                            key=self.ema_accuracy.get,
                            reverse=True):
            if count == 10:
                break
            print("epoch: %s, Top-1: %s)" % (epoch, self.ema_accuracy[epoch]))
            count += 1
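
The epoch_end above belongs to a custom Callback that rebuilds a Model and evaluates it after every epoch. A minimal, self-contained sketch of that idea, without the EMA bookkeeping (class and attribute names are illustrative):

from mindspore.train.callback import Callback
from mindspore.train.model import Model

class EvalPerEpoch(Callback):
    """Evaluate the network on a held-out dataset at the end of every epoch."""

    def __init__(self, network, loss_fn, eval_dataset):
        super(EvalPerEpoch, self).__init__()
        self.model = Model(network, loss_fn=loss_fn, metrics={"acc"})
        self.eval_dataset = eval_dataset
        self.best_acc = 0.0
        self.best_epoch = 0

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        acc = self.model.eval(self.eval_dataset, dataset_sink_mode=False)["acc"]
        if acc > self.best_acc:
            self.best_acc, self.best_epoch = acc, cb_params.cur_epoch_num
        print("epoch %s: acc %s (best %s at epoch %s)"
              % (cb_params.cur_epoch_num, acc, self.best_acc, self.best_epoch))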
Example #18
def bert_predict(Evaluation):
    '''
    prediction function
    '''
    devid = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)
    dataset = get_dataset(bert_net_cfg.batch_size, 1)
    if cfg.use_crf:
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=True,
                                         tag_to_index=tag_to_index, dropout_prob=0.0)
    else:
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset
Example #19
def train(args_opt, config):
    if args_opt.run_distribute:
        init()
        context.set_auto_parallel_context(parallel_mode="data_parallel")

    ds = dataset_creator(args_opt.run_distribute)

    net = CNNCTC_Model(config.NUM_CLASS, config.HIDDEN_SIZE,
                       config.FINAL_FEATURE_WIDTH)
    net.set_train(True)

    if config.CKPT_PATH != '':
        param_dict = load_checkpoint(config.CKPT_PATH)
        load_param_into_net(net, param_dict)
        print('parameters loaded!')
    else:
        print('train from scratch...')

    criterion = ctc_loss()
    opt = mindspore.nn.RMSProp(params=net.trainable_params(),
                               centered=True,
                               learning_rate=config.LR_PARA,
                               momentum=config.MOMENTUM,
                               loss_scale=config.LOSS_SCALE)

    net = WithLossCell(net, criterion)
    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        config.LOSS_SCALE, False)
    model = Model(net,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager,
                  amp_level="O2")

    callback = LossCallBack()
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.SAVE_CKPT_PER_N_STEP,
        keep_checkpoint_max=config.KEEP_CKPT_MAX_NUM)
    ckpoint_cb = ModelCheckpoint(prefix="CNNCTC",
                                 config=config_ck,
                                 directory=config.SAVE_PATH)

    if args_opt.device_id == 0:
        model.train(config.TRAIN_EPOCHS,
                    ds,
                    callbacks=[callback, ckpoint_cb],
                    dataset_sink_mode=False)
    else:
        model.train(config.TRAIN_EPOCHS,
                    ds,
                    callbacks=[callback],
                    dataset_sink_mode=False)
Example #20
def bert_predict(Evaluation):
    '''
    prediction function
    '''
    dataset = get_dataset(bert_net_cfg.batch_size, 1)
    if cfg.use_crf:
        net_for_pretraining = Evaluation(bert_net_cfg,
                                         False,
                                         num_labels=len(tag_to_index),
                                         use_crf=True,
                                         tag_to_index=tag_to_index,
                                         dropout_prob=0.0)
    else:
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset
Example #21
    epoch_size = config.epoch_size

    # get learning rate
    lr = Tensor(get_lr(global_step=0,
                       lr_init=config.lr_init,
                       lr_end=config.lr_end,
                       lr_max=config.lr_max,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size,
                       steps_per_epoch=step_size))

    if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone":
        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \
            config.weight_decay, config.loss_scale)
        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

        cb = config_ckpoint(config, lr, step_size)
        print("============== Starting Training ==============")
        model.train(epoch_size, dataset, callbacks=cb)
        print("============== End Training ==============")

    else:
        opt = Momentum(filter(lambda x: x.requires_grad, head_net.get_parameters()), lr, config.momentum, config.weight_decay)

        network = WithLossCell(head_net, loss)
        network = TrainOneStepCell(network, opt)
        network.set_train()

        features_path = args_opt.dataset_path + '_features'
        idx_list = list(range(step_size))
Example #22
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform,
                        save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          gradients_mean=True)
    # dataloader
    de_dataset = classification_dataset(args.data_dir,
                                        args.image_size,
                                        args.per_batch_size,
                                        1,
                                        args.rank,
                                        args.group_size,
                                        num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone,
                          num_classes=args.num_classes,
                          platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                        num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    model = Model(network,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager,
                  metrics={'acc'},
                  amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch,
                de_dataset,
                callbacks=callbacks,
                dataset_sink_mode=True)
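
The checkpoint callback above writes a file every ckpt_interval * steps_per_epoch steps into a per-rank directory and keeps at most ckpt_save_max of them. A compact sketch of wiring CheckpointConfig, ModelCheckpoint and the standard monitors together (values and paths are placeholders):

from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor, TimeMonitor

steps_per_epoch = 100                                                    # placeholder value
ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,   # one file per epoch
                               keep_checkpoint_max=5)                   # rotate old files out
ckpt_cb = ModelCheckpoint(prefix="example", directory="./ckpt_0", config=ckpt_config)
callbacks = [TimeMonitor(data_size=steps_per_epoch), LossMonitor(), ckpt_cb]
# pass the list to Model.train, e.g. model.train(epochs, dataset, callbacks=callbacks)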
Example #23
    elif args.lr_scheduler == 'step':
        lr = lr_steps(0, lr_init=args.lr_init, lr_max=args.lr_max, warmup_epochs=args.warmup_epochs,
                      total_epochs=args.max_epoch, steps_per_epoch=batch_num)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    if args.dataset == "cifar10":
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
        model = Model(network, loss_fn=loss, optimizer=opt, metrics={'acc'},
                      amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
Example #24
                  lr_max=cfg.lr_init,
                  total_epochs=cfg.epoch_size,
                  steps_per_epoch=batch_num)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   Tensor(lr),
                   cfg.momentum,
                   weight_decay=cfg.weight_decay)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True,
                                            reduction='mean',
                                            is_grad=False)

    if device_target == "Ascend":
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=False,
                      loss_scale_manager=None)
        ckpt_save_dir = "./"
    else:  # GPU
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=True,
                      loss_scale_manager=None)
        ckpt_save_dir = "./ckpt_" + str(get_rank()) + "/"

    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5,
Example #25
def test_train():
    '''
    finetune function
    '''
    target = args_opt.device_target
    if target == "Ascend":
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=devid)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
    #BertCLSTrain for classification
    #BertNERTrain for sequence labeling
    if cfg.task == 'NER':
        if cfg.use_crf:
            netwithloss = BertNER(bert_net_cfg,
                                  True,
                                  num_labels=len(tag_to_index),
                                  use_crf=True,
                                  tag_to_index=tag_to_index,
                                  dropout_prob=0.1)
        else:
            netwithloss = BertNER(bert_net_cfg,
                                  True,
                                  num_labels=cfg.num_labels,
                                  dropout_prob=0.1)
    elif cfg.task == 'SQUAD':
        netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)
    else:
        netwithloss = BertCLS(bert_net_cfg,
                              True,
                              num_labels=cfg.num_labels,
                              dropout_prob=0.1)
    if cfg.task == 'SQUAD':
        dataset = get_squad_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    else:
        dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    if cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=steps_per_epoch * cfg.epoch_num,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps)
    elif cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=steps_per_epoch * cfg.epoch_num,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         weight_decay=cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * cfg.epoch_num *
                                          0.1),
                         decay_filter=cfg.Lamb.decay_filter)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported.")
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir,
                                 config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    if cfg.task == 'SQUAD':
        netwithgrads = BertSquadCell(netwithloss,
                                     optimizer=optimizer,
                                     scale_update_cell=update_cell)
    else:
        netwithgrads = BertFinetuneCell(netwithloss,
                                        optimizer=optimizer,
                                        scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb])
Example #26
        loss = CrossEntropySmooth(sparse=True,
                                  reduction="mean",
                                  smooth_factor=cfg.label_smooth_factor,
                                  num_classes=cfg.num_classes)

        if cfg.is_dynamic_loss_scale:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(
                cfg.loss_scale, drop_overflow_update=False)

    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  metrics={'acc'},
                  amp_level="O3",
                  loss_scale_manager=loss_scale_manager)

    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    time_cb = TimeMonitor(data_size=batch_num)
    ckpt_save_dir = "./ckpt_" + str(rank) + "/"
    ckpoint_cb = ModelCheckpoint(prefix="train_tinydarknet_" +
                                 args_opt.dataset_name,
                                 directory=ckpt_save_dir,
                                 config=config_ck)
    loss_cb = LossMonitor()
    model.train(cfg.epoch_size,
                dataset,
                callbacks=[time_cb, ckpoint_cb, loss_cb])
Example #27
def test_mobilenetv2_quant():
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    config = config_ascend_quant
    print("training configure: {}".format(config))

    epoch_size = config.epoch_size

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             config=config,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # convert fusion network to quantization aware network
    quantizer = QuantizationAwareTraining(bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)

    # get learning rate
    lr = Tensor(
        get_lr(global_step=config.start_epoch * step_size,
               lr_init=0,
               lr_end=0,
               lr_max=config.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=epoch_size + config.start_epoch,
               steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(
        filter(lambda x: x.requires_grad, network.get_parameters()), lr,
        config.momentum, config.weight_decay)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr.asnumpy(),
                      step_threshold=config.step_threshold)
    callback = [monitor]
    model.train(epoch_size,
                dataset,
                callbacks=callback,
                dataset_sink_mode=False)
    print("============== End Training ==============")

    expect_avg_step_loss = 2.32
    avg_step_loss = np.mean(np.array(monitor.losses))

    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
Example #28
def test_resnet50_quant():
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    config = config_quant
    print("training configure: {}".format(config))
    epoch_size = config.epoch_size

    # define network
    net = resnet50_quant(class_num=config.class_num)
    net.set_train(True)

    # define loss
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor,
                        num_classes=config.class_num)
    #loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    # define dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             config=config,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # convert fusion network to quantization aware network
    net = quant.convert_quant_network(net,
                                      bn_fold=True,
                                      per_channel=[True, False],
                                      symmetric=[True, False])

    # get learning rate
    lr = Tensor(
        get_lr(lr_init=config.lr_init,
               lr_end=0.0,
               lr_max=config.lr_max,
               warmup_epochs=config.warmup_epochs,
               total_epochs=config.epoch_size,
               steps_per_epoch=step_size,
               lr_decay_mode='cosine'))

    # define optimization
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   config.momentum, config.weight_decay, config.loss_scale)

    # define model
    #model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
    model = Model(net, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr.asnumpy(),
                      step_threshold=config.step_threshold)

    callbacks = [monitor]
    model.train(epoch_size,
                dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
    print("============== End Training ==============")

    expect_avg_step_loss = 2.40
    avg_step_loss = np.mean(np.array(monitor.losses))

    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
Example #29
def run_general_distill():
    """
    run general distill
    """
    parser = argparse.ArgumentParser(description='tinybert general distill')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="3",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--save_ckpt_step",
                        type=int,
                        default=100,
                        help="Enable data sink, default is true.")
    parser.add_argument("--max_ckpt_num",
                        type=int,
                        default=1,
                        help="Enable data sink, default is true.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_ckpt_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_teacher_ckpt_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")

    save_ckpt_dir = os.path.join(
        args_opt.save_ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    if not os.path.exists(save_ckpt_dir):
        os.makedirs(save_ckpt_dir)

    if args_opt.distribute == "true":
        D.init('hccl')
        device_num = args_opt.device_num
        rank = args_opt.device_id % device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
    else:
        rank = 0
        device_num = 1

    netwithloss = BertNetworkWithLoss_gd(
        teacher_config=bert_teacher_net_cfg,
        teacher_ckpt=args_opt.load_teacher_ckpt_path,
        student_config=bert_student_net_cfg,
        is_training=True,
        use_one_hot_embeddings=False)

    dataset = create_tinybert_dataset('gd', bert_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.data_dir, args_opt.schema_dir)

    dataset_size = dataset.get_dataset_size()
    print('dataset size: ', dataset_size)
    if args_opt.enable_data_sink == "true":
        repeat_count = args_opt.epoch_size * dataset_size // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.epoch_size
        time_monitor_steps = dataset_size

    lr_schedule = BertLearningRate(
        learning_rate=common_cfg.AdamWeightDecay.learning_rate,
        end_learning_rate=common_cfg.AdamWeightDecay.end_learning_rate,
        warmup_steps=int(dataset_size * args_opt.epoch_size / 10),
        decay_steps=int(dataset_size * args_opt.epoch_size),
        power=common_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(common_cfg.AdamWeightDecay.decay_filter,
                               params))
    other_params = list(
        filter(lambda x: not common_cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{
        'params': decay_params,
        'weight_decay': common_cfg.AdamWeightDecay.weight_decay
    }, {
        'params': other_params,
        'weight_decay': 0.0
    }, {
        'order_params': params
    }]

    optimizer = AdamWeightDecay(group_params,
                                learning_rate=lr_schedule,
                                eps=common_cfg.AdamWeightDecay.eps)

    callback = [
        TimeMonitor(time_monitor_steps),
        LossCallBack(),
        ModelSaveCkpt(netwithloss.bert, args_opt.save_ckpt_step,
                      args_opt.max_ckpt_num, save_ckpt_dir)
    ]

    update_cell = DynamicLossScaleUpdateCell(
        loss_scale_value=common_cfg.loss_scale_value,
        scale_factor=common_cfg.scale_factor,
        scale_window=common_cfg.scale_window)

    netwithgrads = BertTrainWithLossScaleCell(netwithloss,
                                              optimizer=optimizer,
                                              scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count,
                dataset,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example #30
def train_on_ascend():
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        context.set_auto_parallel_context(
            device_num=rank_size,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    # load pre trained ckpt
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

    # get learning rate
    lr = Tensor(
        get_lr(global_step=config.start_epoch * step_size,
               lr_init=0,
               lr_end=0,
               lr_max=config.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=epoch_size + config.start_epoch,
               steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(
        filter(lambda x: x.requires_grad, network.get_parameters()), lr,
        config.momentum, config.weight_decay)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs *
                step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")