def GetInterfaceBlobValue(op_name):
    flow.sync_default_session()
    sess = session_ctx.GetDefaultSession()
    job_name = sess.JobName4InterfaceOpName(op_name)

    def AsyncGetInterfaceBlobValue(Yield):
        def build(builder):
            blob_object = GetEagerInterfaceBlob(op_name).blob_object
            lbi = lbi_util.LogicalBlobId()
            lbi.set_op_name(op_name)
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.set_blob_name(op_attribute.output_bns[0])
            if not isinstance(lbi, lbi_util.LogicalBlobId):
                cfg_lbi = lbi_util.LogicalBlobId()
                cfg_lbi.set_op_name(lbi.op_name)
                cfg_lbi.set_blob_name(lbi.blob_name)
                lbi = cfg_lbi
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = oneflow_api.EagerMirroredBlob(
                    lbi, blob_object, blob_register, job_name
                )
            else:
                remote_blob = oneflow_api.EagerConsistentBlob(
                    lbi, blob_object, blob_register, job_name
                )
            value = remote_blob.numpy()
            Yield(value)

        oneflow_api.deprecated.LogicalRun(build)

    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
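# A minimal usage sketch for GetInterfaceBlobValue. "v1" is a hypothetical op
# name; it must refer to an interface op (e.g. a variable) in an already-built
# and initialized default session.
def _example_read_interface_blob():
    value = GetInterfaceBlobValue("v1")  # returns the blob's data as a numpy array
    print(value.shape, value.dtype)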
def FeedValueToInterfaceBlob(op_name, ndarray):
    flow.sync_default_session()

    def AsyncFeedValueToInterfaceBlob(Yield):
        def build(builder):
            blob_object = builder.MakeLazyRefBlobObject(op_name)
            if blob_object.op_arg_blob_attr.is_tensor_list:
                input_blob_def = input_blob_def_util.MirroredTensorListDef(
                    [x.shape for x in ndarray],
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
                )
            elif blob_object.op_arg_parallel_attr.is_mirrored():
                input_blob_def = input_blob_def_util.MirroredTensorDef(
                    ndarray.shape,
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
                )
            else:
                input_blob_def = input_blob_def_util.FixedTensorDef(
                    ndarray.shape,
                    dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
                )
            push_util.FeedValueToEagerBlob(blob_object, input_blob_def, ndarray)
            Yield()

        vm_util.LogicalRun(build)

    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
def GetEagerInterfaceBlob(op_name):
    flow.sync_default_session()
    sess = session_ctx.GetDefaultSession()

    def CreateBlob():
        job_name = sess.JobName4InterfaceOpName(op_name)

        def Build(builder, Yield):
            blob_object = _GetInterfaceBlobObject(builder, op_name)
            lbi = lbi_util.LogicalBlobId()
            lbi.set_op_name(op_name)
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.set_blob_name(op_attribute.output_bns[0])
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = oneflow_api.EagerMirroredBlob(
                    lbi, blob_object, blob_register, job_name
                )
            else:
                remote_blob = oneflow_api.EagerConsistentBlob(
                    lbi, blob_object, blob_register, job_name
                )
            Yield(remote_blob)

        def AsyncGetInterfaceBlob(Yield):
            oneflow_api.deprecated.LogicalRun(lambda builder: Build(builder, Yield))

        blob = async_util.Await(1, AsyncGetInterfaceBlob)[0]
        return blob

    return sess.FindOrCreateLazyBlob(op_name, CreateBlob)
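# Note: GetEagerInterfaceBlob routes creation through sess.FindOrCreateLazyBlob,
# so repeated lookups of the same interface op should reuse one remote blob
# rather than rebuilding it. A hypothetical sketch, assuming the session caches
# the blob under its op name:
def _example_get_eager_blob():
    blob_a = GetEagerInterfaceBlob("v1")  # built on first access
    blob_b = GetEagerInterfaceBlob("v1")  # expected to come from the session cache
    assert blob_a is blob_b  # assumption: FindOrCreateLazyBlob returns the cached blob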
def sync_default_session_if_normal():
    # TODO: merge with the same function in framework/check_point_v2.py
    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
        flow.sync_default_session()
    else:
        # do nothing
        pass
def GetInterfaceBlobValue(op_name):
    flow.sync_default_session()
    sess = session_ctx.GetDefaultSession()
    job_name = sess.JobName4InterfaceOpName(op_name)

    def AsyncGetInterfaceBlobValue(Yield):
        def build(builder):
            blob_object = builder.MakeLazyRefBlobObject(op_name)
            lbi = logical_blob_id_util.LogicalBlobId()
            lbi.op_name = op_name
            op_attribute = sess.OpAttribute4InterfaceOpName(op_name)
            assert len(op_attribute.output_bns) == 1
            lbi.blob_name = op_attribute.output_bns[0]
            if blob_object.op_arg_parallel_attr.is_mirrored():
                remote_blob = remote_blob_util.EagerMirroredBlob(
                    lbi, blob_object, job_name
                )
            else:
                remote_blob = remote_blob_util.EagerConsistentBlob(
                    lbi, blob_object, job_name
                )
            if blob_object.op_arg_blob_attr.is_tensor_list:
                value = remote_blob.numpy_list()
            else:
                value = remote_blob.numpy()
            Yield(value)

        vm_util.LogicalRun(build)

    return async_util.Await(1, AsyncGetInterfaceBlobValue)[0]
def sync_default_session_if_normal():
    # TODO: merge with the same function in experimental/interface_op_read_and_write.py
    if rt_mode.CurrentMode() == rt_mode.NORMAL_MODE:
        oneflow.sync_default_session()
    else:
        # do nothing
        pass
def Init() -> None:
    oneflow.sync_default_session()
    sess = session_ctx.GetDefaultSession()
    for op_name, var_blob in GetAllVariables().items():
        var_conf = sess.OpAttribute4InterfaceOpName(op_name).op_conf.variable_conf
        if not (
            var_conf.HasField("initializer")
            or var_conf.HasField("initialize_with_snapshot")
        ):
            continue
        if var_conf.HasField("initialize_with_snapshot"):
            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
            # use the explicit snapshot key when one is set; otherwise fall
            # back to the op name (an unset proto string field would be empty)
            if initialize_with_snapshot_conf.HasField("key"):
                snapshot_key = initialize_with_snapshot_conf.key
            else:
                snapshot_key = op_name
            var_dir = os.path.dirname(
                os.path.join(initialize_with_snapshot_conf.path, snapshot_key)
            )
            LoadVariables({op_name: GetCheckpoint(var_dir)})
            continue
        scope_symbol_id = _GetScopeSymbolIdFromEagerBlob(var_blob)
        init_by_initializer_conf(
            var_blob, var_conf.initializer, False, scope_symbol_id, var_conf.random_seed
        )
    oneflow_api.eager.Sync()
def _TestSaveCorrectness(test_case, model_getter, dtype, legacy_api):
    """
    Save weights with the new model io, load them with the legacy model io,
    and check that the results are equal.
    """
    with tempfile.TemporaryDirectory() as save_dir:
        refresh_session()
        flow.config.enable_legacy_model_io(False)
        large1 = get_checkpoint_ready_model(model_getter, dtype)
        if legacy_api:
            check_point = flow.train.CheckPoint()
            check_point.save(save_dir)
        else:
            flow.checkpoint.save(save_dir)
        res1 = large1()

        refresh_session()
        flow.config.enable_legacy_model_io(True)
        large2 = get_checkpoint_ready_model(model_getter, dtype)
        check_point = flow.train.CheckPoint()
        check_point.load(save_dir)
        flow.sync_default_session()
        res2 = large2()

        test_case.assertTrue(np.array_equal(res1, res2))
def FeedValueToInterfaceBlob(op_name, ndarray):
    flow.sync_default_session()

    def AsyncFeedValueToInterfaceBlob(Yield):
        def build(builder):
            blob_object = GetEagerInterfaceBlob(op_name).blob_object
            FeedValueToInterfaceBlobObject(blob_object, ndarray)
            Yield()

        oneflow_api.deprecated.LogicalRun(build)

    async_util.Await(1, AsyncFeedValueToInterfaceBlob)
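# Usage sketch for FeedValueToInterfaceBlob: the numpy array's shape and dtype
# must match the interface op's blob ("v1" is again a hypothetical op name in
# an already-built default session).
def _example_feed_interface_blob():
    import numpy as np

    new_value = np.ones((2, 3), dtype=np.float32)
    FeedValueToInterfaceBlob("v1", new_value)
    assert np.array_equal(GetInterfaceBlobValue("v1"), new_value)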
def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    # open the log file
    log_file = open(
        "./log/log_" + args.model + "_" + args.data_type + "_" + args.log_type + ".txt",
        "w",
    )
    if not args.before_result_dir:
        args.before_result_dir = "./log/before"
    if not args.after_result_dir:
        args.after_result_dir = "./log/after"

    for epoch in range(args.num_epochs):
        # configure the callback used during training
        metric = Metric(
            desc='train',
            calculate_batches=args.loss_print_every_n_iter,
            summary=summary,
            save_summary_steps=epoch_size,
            batch_size=train_batch_size,
            loss_key='loss',
        )
        # training (epoch_size iterations per epoch)
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            # configure the callback used during validation
            metric = Metric(
                desc='validation',
                calculate_batches=num_val_steps,
                summary=summary,
                save_summary_steps=num_val_steps,
                batch_size=val_batch_size,
            )
            # testing
            for i in range(num_val_steps):
                InferenceNet().async_get(
                    metric.metric_cb(epoch, i, args=args, log_file=log_file)
                )
        if epoch % args.model_save_every_n_epoch == 0:
            snapshot.save('epoch_{}'.format(epoch))
            flow.sync_default_session()

    # save the last snapshot and the model weights
    snapshot.save('last')
    flow.sync_default_session()
    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
def GetAllVariables() -> Dict[str, oneflow_api.EagerConsistentBlob]:
    """
    Get all variables of all jobs as a dict.
    """
    oneflow.sync_default_session()

    sess = session_ctx.GetDefaultSession()
    interface_ops = sess.interface_ops
    variables = {}
    for op in interface_ops:
        op_attr = sess.OpAttribute4InterfaceOpName(op)
        if op_attr.op_conf.WhichOneof("op_type") != "variable_conf":
            continue
        variables[op] = interface_op_read_and_write.GetEagerInterfaceBlob(op)
    return variables
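# Usage sketch for GetAllVariables: iterate every variable op registered in
# the default session and print its name and shape.
def _example_list_variables():
    for name, blob in GetAllVariables().items():
        print(name, blob.shape)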
def Init() -> None:
    oneflow.sync_default_session()
    sess = session_ctx.GetDefaultSession()
    for op_name, var_blob in GetAllVariables().items():
        var_conf = sess.OpAttribute4InterfaceOpName(op_name).op_conf.variable_conf
        if not (
            var_conf.HasField("initializer")
            or var_conf.HasField("initialize_with_snapshot")
        ):
            continue
        if var_conf.HasField("initialize_with_snapshot"):
            initialize_with_snapshot_conf = var_conf.initialize_with_snapshot
            # use the explicit snapshot key when one is set; otherwise fall
            # back to the op name (an unset proto string field would be empty)
            if initialize_with_snapshot_conf.HasField("key"):
                snapshot_key = initialize_with_snapshot_conf.key
            else:
                snapshot_key = op_name
            var_dir = os.path.dirname(
                os.path.join(initialize_with_snapshot_conf.path, snapshot_key)
            )
            LoadVariables({op_name: GetCheckpoint(var_dir)})
            continue
        g = initializer_util.GetInitializer(
            var_conf.initializer, var_conf.random_seed, var_blob.shape
        )

        def GenerateValueAndAssign(var_blob, start_nd_idx, stop_nd_idx):
            np_dtype = np.dtype(
                dtype_util.convert_oneflow_dtype_to_numpy_dtype(var_blob.dtype)
            )
            length = _ElemCnt(np.array(stop_nd_idx) - np.array(start_nd_idx))
            vals = (
                np.array(g(length))
                .astype(np_dtype)
                .reshape(np.array(stop_nd_idx) - np.array(start_nd_idx))
            )
            slice_value_blob = _GetCpu0VariableBlobFromNumpy(vals, var_blob.dtype)
            _LogicalSliceAssign(var_blob, slice_value_blob, start_nd_idx, stop_nd_idx)

        # we just want to run GenerateValueAndAssign on every slice,
        # without caring about the return value
        for _ in _ForEachSlice(var_blob, GenerateValueAndAssign):
            pass
    oneflow_api.eager.Sync()
def FeedValueToInterfaceBlobObject(blob_object, ndarray):
    flow.sync_default_session()

    def build(builder):
        if blob_object.op_arg_parallel_attr.is_mirrored():
            input_blob_def = input_blob_def_util.MirroredTensorDef(
                ndarray.shape,
                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
            )
        else:
            input_blob_def = input_blob_def_util.FixedTensorDef(
                ndarray.shape,
                dtype=dtype_util.convert_numpy_dtype_to_oneflow_dtype(ndarray.dtype),
            )
        push_util.FeedValueToEagerBlob(blob_object, input_blob_def, ndarray)

    oneflow_api.deprecated.LogicalRun(build)
def SaveVarDict(
    path: str,
    var_dict: Optional[Dict[str, Union[FileBackendVariableBlob, EagerBlobTrait]]] = None,
) -> None:
    """
    Save `var_dict` to `path`.
    """
    oneflow.sync_default_session()

    if var_dict is None:
        var_dict = GetAllVariables()

    def IsFileOrNonEmptyDir(path):
        if os.path.isfile(path):
            return True
        if os.path.isdir(path) and len(os.listdir(path)) != 0:
            return True
        return False

    assert not IsFileOrNonEmptyDir(
        path
    ), "Non-empty directory {} already exists!".format(path)
    os.makedirs(path, exist_ok=True)
    for name, var in var_dict.items():
        meta_info = variable_meta_info_pb.VariableMetaInfo()
        meta_info.shape.dim[:] = var.shape
        meta_info.data_type = oneflow_api.deprecated.GetProtoDtype4OfDtype(var.dtype)
        var_dir = os.path.join(path, name)
        param_path = os.path.join(var_dir, DATA_FILENAME)
        os.makedirs(os.path.dirname(param_path))
        with open(param_path, "wb") as f:
            for _, _, slice in _ReadSlice(var):
                f.write(slice.tobytes())
        with open(os.path.join(var_dir, META_INFO_FILENAME), "w") as f:
            f.write(text_format.MessageToString(meta_info))
    # write an empty file named 'snapshot_done', indicating that
    # the save process finished normally
    with open(os.path.join(path, "snapshot_done"), "w"):
        pass
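# Usage sketch for SaveVarDict. "./my_model" is a hypothetical path; it must
# not already exist as a file or non-empty directory. The call writes one
# sub-directory per variable, each holding DATA_FILENAME and
# META_INFO_FILENAME, plus a top-level 'snapshot_done' marker file.
def _example_save_all_variables():
    SaveVarDict("./my_model")  # var_dict=None saves every variable in the session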
def LoadVariables(
    value_dict: Dict[str, ValueContainer],
    ignore_mismatch: bool = True,
):
    """
    Load the values in `value_dict` into oneflow variables.
    For example, if `value_dict` is {'x': np.ones(x_shape)}, the value of
    the variable "x" will be all ones.
    If `ignore_mismatch` is False, an exception will be raised when a name
    in `value_dict` does not belong to any variable.
    """
    oneflow.sync_default_session()

    all_vars = GetAllVariables()
    for name, value in value_dict.items():
        if name in all_vars:
            var_blob = interface_op_read_and_write.GetEagerInterfaceBlob(name)
            _FeedValueToVariable(var_blob, value)
        else:
            if not ignore_mismatch:
                raise RuntimeError('"{}" is not a variable name'.format(name))
    oneflow_api.eager.Sync()
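# Usage sketch for LoadVariables, assuming a variable named "x" with shape
# x_shape exists in the session; unknown names are silently skipped unless
# ignore_mismatch=False.
def _example_load_variables():
    import numpy as np

    x_shape = (2, 3)  # hypothetical shape of the variable "x"
    LoadVariables({"x": np.ones(x_shape, dtype=np.float32)})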
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(
                desc='finetune',
                print_steps=args.loss_print_every_n_iter,
                summary=summary,
                batch_size=batch_size,
                keys=['loss'],
            )

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
                # if step % args.loss_print_every_n_iter == 0:

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #     print('Best result:', result)

            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()

        print('Best result:', best_result)
        print(
            "Saving best model to "
            + os.path.join(args.model_save_dir, 'snapshot_best')
        )

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()
    summary = Summary(args.log_dir, args)

    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')
        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)

        print('Loading model...')
        check_point.load(args.total_model)
        # check_point.load(args.teacher_model)
        # check_point.load(args.student_model)

        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(
                desc='finetune',
                print_steps=args.loss_print_every_n_iter,
                summary=summary,
                batch_size=batch_size,
                keys=['loss'],
            )

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True
                # if task_name in corr_tasks and result['corr'] > best_dev_acc:
                #     best_dev_acc = result['corr']
                #     save_model = True
                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
                print('Best result:', result)

            if save_model:
                if os.path.exists(args.model_save_dir):
                    import shutil
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove the tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')