Example #1
def GetInterfaceBlobValue(op_name):
    flow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    job_name = sess.JobName4InterfaceOpName(op_name)

    def AsyncGetInterfaceBlobValue(Yield):
        def build(builder):
            blob_object = GetEagerInterfaceBlob(op_name).blob_object
            lbi = lbi_util.LogicalBlobId()
            lbi.set_op_name(op_name)
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.set_blob_name(op_attribute.output_bns[0])
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = oneflow_api.EagerMirroredBlob(
                    lbi, blob_object, blob_register, job_name
                )
            else:
                remote_blob = oneflow_api.EagerConsistentBlob(
                    lbi, blob_object, blob_register, job_name
                )
            value = remote_blob.numpy()
            Yield(value)

        oneflow_api.deprecated.LogicalRun(build)

    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
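
A minimal usage sketch for the getter above; "v1" is a hypothetical interface op assumed to already exist in the default session's job:

value = GetInterfaceBlobValue("v1")  # returns a numpy.ndarray copy of the blob
print(value.shape, value.dtype)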
Example #2
def FeedValueToInterfaceBlob(op_name, ndarray):
    flow.sync_default_session()

    def AsyncFeedValueToInterfaceBlob(Yield):
        def build(builder):
            blob_object = builder.MakeLazyRefBlobObject(op_name)
            if blob_object.op_arg_blob_attr.is_tensor_list:
                # for a tensor list, `ndarray` is a sequence of numpy arrays
                # sharing a single dtype
                input_blob_def = input_blob_def_util.MirroredTensorListDef(
                    [x.shape for x in ndarray],
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(
                        ndarray[0].dtype),
                )
            elif blob_object.op_arg_parallel_attr.is_mirrored():
                input_blob_def = input_blob_def_util.MirroredTensorDef(
                    ndarray.shape,
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(
                        ndarray.dtype),
                )
            else:
                input_blob_def = input_blob_def_util.FixedTensorDef(
                    ndarray.shape,
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(
                        ndarray.dtype),
                )
            push_util.FeedValueToEagerBlob(blob_object, input_blob_def,
                                           ndarray)
            Yield()

        vm_util.LogicalRun(build)

    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
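
A hedged usage sketch for the feeder above; the op name "v1" and the shape (2, 3) are assumptions and must match an interface op already defined in the current job:

import numpy as np

FeedValueToInterfaceBlob("v1", np.ones((2, 3), dtype=np.float32))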
Example #3
def GetEagerInterfaceBlob(op_name):
    flow.sync_default_session()

    sess = session_ctx.GetDefaultSession()

    def CreateBlob():
        job_name = sess.JobName4InterfaceOpName(op_name)

        def Build(builder, Yield):
            blob_object = _GetInterfaceBlobObject(builder, op_name)
            lbi = lbi_util.LogicalBlobId()
            lbi.set_op_name(op_name)
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.set_blob_name(op_attribute.output_bns[0])
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = oneflow_api.EagerMirroredBlob(
                    lbi, blob_object, blob_register, job_name
                )
            else:
                remote_blob = oneflow_api.EagerConsistentBlob(
                    lbi, blob_object, blob_register, job_name
                )

            Yield(remote_blob)

        def AsyncGetInterfaceBlob(Yield):
            oneflow_api.deprecated.LogicalRun(lambda builder: Build(builder, Yield))

        blob = async_util.Await(1, AsyncGetInterfaceBlob)[0]
        return blob

    return sess.FindOrCreateLazyBlob(op_name, CreateBlob)
Example #4
def sync_default_session_if_normal():
    # TODO merge with same function in framework/check_point_v2.py
    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
        flow.sync_default_session()
    else:
        # do nothing
        pass
Example #5
def GetInterfaceBlobValue(op_name):
    flow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    job_name = sess.JobName4InterfaceOpName(op_name)

    def AsyncGetInterfaceBlobValue(Yield):
        def build(builder):
            blob_object = builder.MakeLazyRefBlobObject(op_name)
            lbi = logical_blob_id_util.LogicalBlobId()
            lbi.op_name = op_name
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.blob_name = op_attribute.output_bns[0]
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = remote_blob_util.EagerMirroredBlob(
                    lbi, blob_object, job_name)
            else:
                remote_blob = remote_blob_util.EagerConsistentBlob(
                    lbi, blob_object, job_name)
            if blob_object.op_arg_blob_attr.is_tensor_list:
                value = remote_blob.numpy_list()
            else:
                value = remote_blob.numpy()

            Yield(value)

        vm_util.LogicalRun(build)

    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
Example #6
def sync_default_session_if_normal():
    # TODO merge with same function in experimental/interface_op_read_and_write.py
    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
        oneflow.sync_default_session()
    else:
        # do nothing
        pass
Example #7
def Init() -> None:
    oneflow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    for op_name, var_blob in GetAllVariables().items():
        var_conf = sess.OpAttribute4InterfaceOpName(
            op_name).op_conf.variable_conf
        if not (var_conf.HasField("initializer")
                or var_conf.HasField("initialize_with_snapshot")):
            continue
        if var_conf.HasField("initialize_with_snapshot"):
            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
            if initialize_with_snapshot_conf.HasField("key"):
                snapshot_key = initialize_with_snapshot_conf.key
            else:
                snapshot_key = op_name
            var_dir = os.path.dirname(
                os.path.join(
                    initialize_with_snapshot_conf.path,
                    snapshot_key,
                ))
            LoadVariables({op_name: GetCheckpoint(var_dir)})
            continue

        scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
        init_by_initializer_conf(var_blob, var_conf.initializer, False,
                                 scope_symbol_id, var_conf.random_seed)

    oneflow_api.eager.Sync()
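
A minimal call sketch for Init, assuming the jobs (and hence the variable ops) have already been defined in the default session; the override via LoadVariables (Example #15) is optional, and the name "v1" and its shape are hypothetical:

import numpy as np

Init()  # run every variable's initializer, or restore it from a snapshot
LoadVariables({"v1": np.zeros((2, 3), dtype=np.float32)})  # optional override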
Example #8
def _TestSaveCorrectness(test_case, model_getter, dtype, legacy_api):
    """
    Save weights by new model io, load weights by legacy model io,
    and check the equality.
    """
    with tempfile.TemporaryDirectory() as save_dir:
        refresh_session()
        flow.config.enable_legacy_model_io(False)

        large1 = get_checkpoint_ready_model(model_getter, dtype)

        if legacy_api:
            check_point = flow.train.CheckPoint()
            check_point.save(save_dir)
        else:
            flow.checkpoint.save(save_dir)
        res1 = large1()

        refresh_session()
        flow.config.enable_legacy_model_io(True)

        large2 = get_checkpoint_ready_model(model_getter, dtype)

        check_point = flow.train.CheckPoint()
        check_point.load(save_dir)
        flow.sync_default_session()

        res2 = large2()
        test_case.assertTrue(np.array_equal(res1, res2))
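
A hedged invocation sketch: `get_large_model` stands in for a real model getter compatible with `get_checkpoint_ready_model`, and both values of `legacy_api` exercise the two save paths:

class TestCheckpointSave(flow.unittest.TestCase):
    def test_save(test_case):
        # get_large_model is a hypothetical helper returning a model function
        _TestSaveCorrectness(test_case, get_large_model, flow.float, legacy_api=False)
        _TestSaveCorrectness(test_case, get_large_model, flow.float, legacy_api=True)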
Example #9
def FeedValueToInterfaceBlob(op_name, ndarray):
    flow.sync_default_session()

    def AsyncFeedValueToInterfaceBlob(Yield):
        def build(builder):
            blob_object = GetEagerInterfaceBlob(op_name).blob_object
            FeedValueToInterfaceBlobObject(blob_object, ndarray)
            Yield()

        oneflow_api.deprecated.LogicalRun(build)

    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
Example #10
def main():
    InitNodes(args)

    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    # open the log file
    log_file = open(
        "./log/log_" + args.model + "_" + args.data_type + "_" +
        args.log_type + ".txt", "w")
    if not args.before_result_dir:
        args.before_result_dir = "./log/before"
    if not args.after_result_dir:
        args.after_result_dir = "./log/after"

    for epoch in range(args.num_epochs):
        # configure the metric callback used during training
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        # training: epoch_size iterations per epoch
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            # configure the metric callback used during validation
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            # testing
            for i in range(num_val_steps):
                InferenceNet().async_get(
                    metric.metric_cb(epoch, i, args=args, log_file=log_file))
        if epoch % args.model_save_every_n_epoch == 0:
            snapshot.save('epoch_{}'.format(epoch))
            flow.sync_default_session()
    # save the last snapshot and the model weights
    snapshot.save('last')
    flow.sync_default_session()
    weights_profile_path = os.path.join(args.model_save_dir,
                                        "weights_profile_path")
    modelWeight.save(weights_profile_path)
Example #11
def GetAllVariables() -> Dict[str, oneflow_api.EagerConsistentBlob]:
    """
    Get all variables of all jobs as a dict.
    """
    oneflow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    interface_ops = sess.interface_ops
    variables = {}
    for op in interface_ops:
        op_attr = sess.OpAttribute4InterfaceOpName(op)
        if op_attr.op_conf.WhichOneof("op_type") != "variable_conf":
            continue
        variables[op] = interface_op_read_and_write.GetEagerInterfaceBlob(op)
    return variables
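
A short sketch that uses the dict returned above to report every variable in the default session:

for name, blob in GetAllVariables().items():
    print(name, list(blob.shape), blob.dtype)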
Example #12
def Init() -> None:
    oneflow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    for op_name, var_blob in GetAllVariables().items():
        var_conf = sess.OpAttribute4InterfaceOpName(
            op_name).op_conf.variable_conf
        if not (var_conf.HasField("initializer")
                or var_conf.HasField("initialize_with_snapshot")):
            continue
        if var_conf.HasField("initialize_with_snapshot"):
            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
            if initialize_with_snapshot_conf.HasField("key"):
                snapshot_key = initialize_with_snapshot_conf.key
            else:
                snapshot_key = op_name
            var_dir = os.path.dirname(
                os.path.join(
                    initialize_with_snapshot_conf.path,
                    snapshot_key,
                ))
            LoadVariables({op_name: GetCheckpoint(var_dir)})
            continue
        g = initializer_util.GetInitializer(var_conf.initializer,
                                            var_conf.random_seed,
                                            var_blob.shape)

        def GenerateValueAndAssign(var_blob, start_nd_idx, stop_nd_idx):
            np_dtype = np.dtype(
                dtype_util.convert_oneflow_dtype_to_numpy_dtype(
                    var_blob.dtype))
            length = _ElemCnt(np.array(stop_nd_idx) - np.array(start_nd_idx))
            vals = (np.array(g(length)).astype(np_dtype).reshape(
                np.array(stop_nd_idx) - np.array(start_nd_idx)))

            slice_value_blob = _GetCpu0VariableBlobFromNumpy(
                vals, var_blob.dtype)
            _LogicalSliceAssign(
                var_blob,
                slice_value_blob,
                start_nd_idx,
                stop_nd_idx,
            )

        # run GenerateValueAndAssign on every slice; the return values are ignored
        for _ in _ForEachSlice(var_blob, GenerateValueAndAssign):
            pass
    oneflow_api.eager.Sync()
Example #13
def FeedValueToInterfaceBlobObject(blob_object, ndarray):
    flow.sync_default_session()

    def build(builder):
        if blob_object.op_arg_parallel_attr.is_mirrored():
            input_blob_def = input_blob_def_util.MirroredTensorDef(
                ndarray.shape,
                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
            )
        else:
            input_blob_def = input_blob_def_util.FixedTensorDef(
                ndarray.shape,
                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
            )
        push_util.FeedValueToEagerBlob(blob_object, input_blob_def, ndarray)

    oneflow_api.deprecated.LogicalRun(build)
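
A usage sketch mirroring Example #9: fetch the blob object of a hypothetical interface op "v1" with GetEagerInterfaceBlob (Example #3), then push a numpy value into it:

import numpy as np

blob_object = GetEagerInterfaceBlob("v1").blob_object  # "v1" is an assumption
FeedValueToInterfaceBlobObject(blob_object, np.zeros((2, 3), dtype=np.float32))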
Example #14
def SaveVarDict(
    path: str,
    var_dict: Optional[Dict[str, Union[FileBackendVariableBlob,
                                       EagerBlobTrait]]] = None,
) -> None:
    """
    Save `var_dict` to `path`
    """
    oneflow.sync_default_session()

    if var_dict is None:
        var_dict = GetAllVariables()

    def IsFileOrNonEmptyDir(path):
        if os.path.isfile(path):
            return True
        if os.path.isdir(path) and len(os.listdir(path)) != 0:
            return True
        return False

    assert not IsFileOrNonEmptyDir(
        path), "Non-empty directory {} already exists!".format(path)
    os.makedirs(path, exist_ok=True)
    for name, var in var_dict.items():
        meta_info = variable_meta_info_pb.VariableMetaInfo()
        meta_info.shape.dim[:] = var.shape
        meta_info.data_type = oneflow_api.deprecated.GetProtoDtype4OfDtype(
            var.dtype)
        var_dir = os.path.join(path, name)
        param_path = os.path.join(var_dir, DATA_FILENAME)
        os.makedirs(os.path.dirname(param_path))
        with open(param_path, "wb") as f:
            for _, _, slice_data in _ReadSlice(var):
                f.write(slice_data.tobytes())
        with open(os.path.join(var_dir, META_INFO_FILENAME), "w") as f:
            f.write(text_format.MessageToString(meta_info))
    # write an empty file 'snapshot_done', indicating that
    # the save process finished normally
    with open(os.path.join(path, "snapshot_done"), "w"):
        pass
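
A minimal usage sketch; the target path is arbitrary but, per the assertion above, must not be an existing file or a non-empty directory:

SaveVarDict("./my_snapshot")  # persists every variable of the default session
# each variable gets its own subdirectory holding a data file (DATA_FILENAME)
# and a text meta-info file; an empty 'snapshot_done' marker is written last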
Example #15
def LoadVariables(
    value_dict: Dict[str, ValueContainer],
    ignore_mismatch: bool = True,
):
    """
    Load value in `value_dict` into oneflow variables.
    For example, if `value_dict` is {'x', np.ones(x_shape)},
    the value of variable "x" will all ones.
    If `ignore_mismatch` is False, an exception will be raised when
    there is a name in `value_dict` not belonging to any variable.
    """
    oneflow.sync_default_session()

    all_vars = GetAllVariables()
    for name, value in value_dict.items():
        if name in all_vars:
            var_blob = interface_op_read_and_write.GetEagerInterfaceBlob(name)
            _FeedValueToVariable(var_blob, value)
        else:
            if not ignore_mismatch:
                raise RuntimeError('"{}" is not a variable name'.format(name))
    oneflow_api.eager.Sync()
Example #16
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #print('Best result:', result)

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
Example #17
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')

        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)
        print('Loading model...')
        check_point.load(args.total_model)
        #     # check_point.load(args.teacher_model)
        #     # check_point.load(args.student_model)
        #
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True

                # if task_name in corr_tasks and result['corr'] > best_dev_acc:
                #     best_dev_acc = result['corr']
                #     save_model = True

                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
                    print('Best result:', result)

                if save_model:
                    if os.path.exists(args.model_save_dir):
                        import shutil
                        shutil.rmtree(args.model_save_dir)
                    if not os.path.exists(args.model_save_dir):
                        os.makedirs(args.model_save_dir)
                    snapshot_save_path = os.path.join(args.model_save_dir)
                    print("Saving best model to {}".format(snapshot_save_path))
                    check_point.save(snapshot_save_path)
                    flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')