Example #1
def main():
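    # BERT GLUE fine-tuning entry point: set the GPU count and log dir, initialize
    # the nodes, then optionally train for num_epochs (with per-epoch train/eval
    # runs and an optional last snapshot) and/or run a standalone evaluation pass.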
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
Example #2
def main():
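    # Training entry point: run TrainNet for epoch_size iterations per epoch,
    # validate with InferenceNet when val_data_dir is set, and save a snapshot
    # at the end of every epoch.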
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        snapshot.save('epoch_{}'.format(epoch))
Example #3
def main():
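    # BERT GLUE fine-tuning loop without a Summary writer: fine-tune for
    # num_epochs, evaluating on the train and validation sets after each epoch,
    # and optionally save a final snapshot.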
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    #if args.save_and_break:
    #    print("save model just after init and exit")
    #    snapshot.save("initial_snapshot")
    #    import sys
    #    sys.exit()

    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size,
                        keys=['loss'])

        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step,
                                                             epoch=epoch))
            #if 1: #step % args.loss_print_every_n_iter == 0:

        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Example #4
def main():
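    # SQuAD fine-tuning/evaluation: optionally train SquadFinetuneJob for
    # num_epochs, then run SquadDevJob to collect RawResult entries and write
    # the prediction JSON via gen_eval_predict_json.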
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])

            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)
Example #5
def main():
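    # Training loop with periodic checkpointing: epochs are 1-indexed, a snapshot
    # is saved every save_epoch_interval epochs, and an optional "epoch_last"
    # snapshot is saved at the end.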
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir,
                        args.save_init)

    print(" {} iter per epoch...".format(epoch_size))

    for epoch in range(1, args.num_epochs + 1):
        metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))

    if args.save_last:
        snapshot.save("epoch_{}".format("last"))
Example #6
def main():
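    # BERT pre-training: run PretrainJob for iter_num steps, reporting total,
    # MLM and NSP losses, with a snapshot every model_save_every_n_iter steps.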
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        #PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Example #7
def main():
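    # BERT pre-training with gradient accumulation: the effective batch size
    # reported to Metric is batch_size * num_accumulation_steps.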
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)

    print("num_accumulation_steps:", args.num_accumulation_steps)
    metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )

    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Example #8
def main():
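    # GLUE fine-tuning with best-model tracking: keep the snapshot with the best
    # dev accuracy (or Matthews correlation), optionally strip optimizer params
    # for serving, and support a standalone evaluation mode.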
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))
                #if 1: #step % args.loss_print_every_n_iter == 0:

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #print('Best result:', result)

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
Example #9
def getdirsize(model_dir):
    # Sum checkpoint file sizes under model_dir, skipping optimizer state
    # directories (paths ending in '-v' or '-m').
    size = 0
    for root, dirs, files in os.walk(model_dir):
        for name in files:
            if str(root[-2:]) == '-v' or str(root[-2:]) == '-m':
                pass
            else:
                tmp = os.path.getsize(os.path.join(root, name))
                size += tmp
        # size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    return size


def main():
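    # Training entry point that also records the on-disk model size (via
    # getdirsize) in the Summary before the usual train/validation loop.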
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                        summary=summary, save_summary_steps=epoch_size,
                        batch_size=train_batch_size, loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))
        # flow.tensorrt.write_int8_calibration("./int8_calibration") # mkdir int8_calibration
        if args.val_data_dir:
            metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                            save_summary_steps=num_val_steps, batch_size=val_batch_size)