def convert_func(cfg, model_path, out_path, image_size):
    model_module = get_model(cfg.network, dropout=0.0, num_features=cfg.embedding_size).to("cuda")
    model_module.eval()
    print(model_module)

    model_graph = ModelGraph(model_module)
    model_graph._compile(flow.randn(1, 3, image_size, image_size).to("cuda"))

    with tempfile.TemporaryDirectory() as tmpdirname:
        new_parameters = dict()
        parameters = flow.load(model_path)
        for key, value in parameters.items():
            if "num_batches_tracked" not in key:
                if key == "fc.weight":
                    continue
                new_key = key.replace("backbone.", "")
                new_parameters[new_key] = value
        model_module.load_state_dict(new_parameters)
        flow.save(model_module.state_dict(), tmpdirname)
        convert_to_onnx_and_check(
            model_graph,
            flow_weight_dir=tmpdirname,
            onnx_model_path="./",
            print_outlier=True,
        )
def verify_math(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs=flow.tensor(np.random.rand(100, 1), dtype=flow.float32),
    device="llvm",
):
    """verify_math"""
    if device == "cuda":
        model.to(device)
        inputs = inputs.to(device)

    graph = OneFlowGraph(model)
    graph._compile(inputs)

    mkdir(MODEL_HOME)
    flow.save(model.state_dict(), MODEL_HOME)

    out_flow = get_oneflow_output(graph, inputs)
    out_tvm = get_tvm_output(graph, MODEL_HOME, inputs, target=device)
    rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
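# Usage sketch (illustrative, not part of the test helpers above): any
# single-input oneflow module can be checked against TVM this way. `FlowLog`
# is a hypothetical example module; `flow` (oneflow), `np` (numpy), and the
# helpers above are assumed to be in scope.
class FlowLog(flow.nn.Module):
    def forward(self, x):
        return flow.log(x)

verify_math(FlowLog(), name="log", device="llvm")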
def verify_concat(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs1=flow.tensor(np.random.randn(2, 5, 5, 4), dtype=flow.float32),
    inputs2=flow.tensor(np.random.randn(2, 5, 5, 2), dtype=flow.float32),
    inputs3=flow.tensor(np.random.randn(2, 5, 5, 3), dtype=flow.float32),
    device="llvm",
):
    """verify_concat"""
    if device == "cuda":
        model.to(device)
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)
        inputs3 = inputs3.to(device)

    graph = OneFlowGraphV2(model)
    graph._compile(inputs1, inputs2, inputs3)

    mkdir(MODEL_HOME)
    flow.save(model.state_dict(), MODEL_HOME)

    out_flow = get_oneflow_concat_output(graph, inputs1, inputs2, inputs3)
    out_tvm = get_tvm_concat_output(graph, MODEL_HOME, inputs1, inputs2, inputs3, target=device)
    rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    sgd = flow.optim.SGD(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "momentum": momentum,
                "weight_decay": weight_decay,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        sgd.step()
        sgd.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        # test state_dict/load_state_dict
        if i == reload_state_step:
            state_dict = sgd.state_dict()
            sgd = flow.optim.SGD([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            sgd.load_state_dict(state_dict)
    return x
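# Context sketch: `train_by_oneflow` here (and the Adagrad/RMSprop/LAMB/Adam
# variants below) is a closure inside a parametrized test and reads free
# variables from the enclosing scope. Illustrative values only, not the
# actual test parameters:
device = "cpu"
init_value = np.random.randn(10).astype(np.float32)
random_grad_seq = [np.random.randn(10).astype(np.float32) for _ in range(10)]
train_iters = 10
reload_state_step = 5
save_load_by_pickle = True
learning_rate, momentum, weight_decay = 0.1, 0.9, 1e-4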
def convert_weights(
    model_type: str,
    from_model: str,
    from_path: str,
    config_path: str,
    to_model: str,
    dump_path: str,
):
    model_class = MODEL_CLASSES[to_model][model_type]
    config = ConfigBase(config_path)
    model = model_class(config)
    load_weights_fct = LOAD_WEIGHTS_MAPS[to_model][model_type][from_model]
    if to_model == "tf":
        # TF models build their variables lazily; run a dummy forward pass
        # before loading weights.
        input_ids = tf.ones([3, 4], dtype=tf.int32)
        model(input_ids)
    load_weights_fct(model, config, from_path)

    if to_model == "pt":
        torch.save(model.state_dict(), dump_path)
    elif to_model == "tf":
        model.save_weights(dump_path)
    elif to_model == "ms":
        mindspore.save_checkpoint(model, dump_path)
    elif to_model == "of":
        flow.save(model.state_dict(), dump_path)
    elif to_model == "pd":
        paddle.save(model.state_dict(), dump_path)
    print("Save {} model to {}".format(to_model, dump_path))
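# Usage sketch (illustrative): convert PyTorch weights to a OneFlow
# checkpoint. The "bert" key and the file names are hypothetical; the real
# keys depend on how MODEL_CLASSES / LOAD_WEIGHTS_MAPS are populated.
convert_weights(
    model_type="bert",
    from_model="pt",
    from_path="bert-base-uncased.bin",
    config_path="bert_config.json",
    to_model="of",
    dump_path="bert_oneflow_model",
)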
def verify_matmul(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs1=flow.tensor(np.random.randn(2, 5), dtype=flow.float32),
    inputs2=flow.tensor(np.random.randn(5, 2), dtype=flow.float32),
    device="llvm",
):
    if device == "cuda":
        model.to(device)
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)

    graph = OneFlowGraph_v3(model)
    graph._compile(inputs1, inputs2)

    mkdir(MODEL_HOME)
    flow.save(model.state_dict(), MODEL_HOME)

    out_flow = get_oneflow_elementwise_output(graph, inputs1, inputs2)
    out_tvm = get_tvm_elementwise_output(graph, MODEL_HOME, inputs1, inputs2, target=device)
    rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
def save_checkpoint(self, params, name):
    flow.save(params, os.path.join(name, "params.tar"))
    flow.save(self.frontend.state_dict(), os.path.join(name, "frontend.pt"))
    flow.save(self.encoder.state_dict(), os.path.join(name, "encoder.pt"))
    flow.save(self.decoder.state_dict(), os.path.join(name, "decoder.pt"))
    if self.ctc_weight > 0.0:
        flow.save(self.assistor.state_dict(), os.path.join(name, "ctc.pt"))
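# A matching load routine (sketch mirroring the directory layout above;
# `load_checkpoint` is not in the original code):
def load_checkpoint(self, name):
    params = flow.load(os.path.join(name, "params.tar"))
    self.frontend.load_state_dict(flow.load(os.path.join(name, "frontend.pt")))
    self.encoder.load_state_dict(flow.load(os.path.join(name, "encoder.pt")))
    self.decoder.load_state_dict(flow.load(os.path.join(name, "decoder.pt")))
    if self.ctc_weight > 0.0:
        self.assistor.load_state_dict(flow.load(os.path.join(name, "ctc.pt")))
    return params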
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    adagrad = flow.optim.Adagrad(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "eps": eps,
                "weight_decay": weight_decay,
            }
        ],
        lr_decay=lr_decay,
        initial_accumulator_value=initial_accumulator_value,
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(grad, requires_grad=False, device=flow.device(device))
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adagrad.step()
        adagrad.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = adagrad.state_dict()
            adagrad = flow.optim.Adagrad([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            adagrad.load_state_dict(state_dict)
    return x
def save_checkpoint(state, is_best, task_id, filename="checkpoints/"):
    del_file(filename + str(int(task_id) - 1))
    flow.save(state["state_dict"], filename + task_id)
    if is_best:
        file_path = "checkpoints/model_best"
        del_file(file_path)
        shutil.copytree(filename + task_id, file_path)
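# `del_file` is not shown in this snippet; a plausible minimal implementation
# (remove a checkpoint directory if it exists) would be:
def del_file(path):
    if os.path.exists(path):
        shutil.rmtree(path)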
def save_model(subdir):
    if not args.model_save_dir:
        return
    save_path = os.path.join(args.model_save_dir, subdir)
    if rank == 0:
        print(f"Saving model to {save_path}")
    state_dict = deepfm_module.state_dict()
    flow.save(state_dict, save_path, global_dst_rank=0)
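# Loading mirrors the save (sketch): a checkpoint written with
# `global_dst_rank=0` is read back with the matching `global_src_rank`:
state_dict = flow.load(save_path, global_src_rank=0)
deepfm_module.load_state_dict(state_dict)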
def train(opt):
    with open(opt.label_dict, "r") as f:
        lab_dict = json.load(f)

    cnn = simple_CNN(opt.num_speakers)
    cnn.to("cuda")
    cost = nn.CrossEntropyLoss()
    cost.to("cuda")
    optimizer = optim.RMSprop(cnn.parameters(), lr=opt.lr, alpha=opt.alpha, eps=opt.eps)

    output_folder = opt.output_path
    N_batches = opt.N_batches
    N_epoches = opt.N_epoches

    for epoch in range(N_epoches):
        cnn.train()
        loss_sum = 0
        err_sum = 0
        for i in range(N_batches):
            inp, lab = create_batches_rnd(
                lab_dict,
                batch_size=opt.batch_size,
                wlen=opt.wlen,
                fact_amp=opt.fact_amp,
                train=True,
            )
            inp = inp.unsqueeze(1)
            lab -= 1  # shift labels to be 0-indexed for CrossEntropyLoss
            pout = cnn(inp)
            pred = flow.argmax(pout, dim=1)
            loss = cost(pout, lab.long())
            err = np.mean(pred.numpy() != lab.long().numpy())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_sum = loss_sum + loss.detach()
            err_sum = err_sum + err
        loss_tot = loss_sum / N_batches
        err_tot = err_sum / N_batches
        if epoch % 10 == 0:
            print("epoch %i, loss_tr=%f err_tr=%f" % (epoch, loss_tot.numpy(), err_tot))
            flow.save(cnn.state_dict(), os.path.join(output_folder, "CNN_model"))
def save(self, subdir):
    if self.args.checkpoint_save_path is None:
        return
    save_path = os.path.join(self.args.checkpoint_save_path, subdir)
    print_rank_0(f"Saving model to {save_path}")
    state_dict = self.model.state_dict()
    flow.save(state_dict, save_path, consistent_dst_rank=0)
def save(self, subdir):
    if self.save_path is None:
        return
    save_path = os.path.join(self.save_path, subdir)
    state_dict = self.model.state_dict()
    flow.save(state_dict, save_path, consistent_dst_rank=0)
    if self.rank in [-1, 0]:
        print(f"Saving model to {save_path}")
def test_save_and_load_consistent_from_nested_dict(test_case):
    class CustomModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.param = flow.nn.Parameter(flow.randn(3, 32, 3, 3))

        def forward(self):
            return self.param

    m1 = CustomModule()
    m1 = m1.to_consistent(flow.placement("cuda", {0: range(2)}), flow.sbp.broadcast)
    m2 = CustomModule()
    m2 = m2.to_consistent(flow.placement("cuda", {0: range(2)}), flow.sbp.broadcast)
    res1 = m1() + m2()

    state_dict1 = m1.state_dict()
    state_dict2 = m2.state_dict()
    state_dict = {"m1": state_dict1, "m2": state_dict2}

    with tempfile.TemporaryDirectory() as f:
        # Saving a consistent state dict without consistent_dst_rank should fail.
        with test_case.assertRaises(Exception):
            flow.save(state_dict, f)
        consistent_src_dst_rank = 0
        flow.save(state_dict, f, consistent_dst_rank=consistent_src_dst_rank)
        rank = flow.env.get_rank()
        if rank != consistent_src_dst_rank:
            test_case.assertEqual(len(os.listdir(f)), 0)

        m1 = CustomModule()
        m1 = m1.to_consistent(flow.placement("cuda", {0: range(2)}), flow.sbp.broadcast)
        m2 = CustomModule()
        m2 = m2.to_consistent(flow.placement("cuda", {0: range(2)}), flow.sbp.broadcast)

        # Loading without consistent_src_rank should also fail.
        with test_case.assertRaises(Exception):
            loaded_state_dict = flow.load(f)
            m1.load_state_dict(loaded_state_dict["m1"])

        loaded_state_dict = flow.load(f, consistent_src_rank=consistent_src_dst_rank)
        test_case.assertEqual(len(loaded_state_dict), 2)
        m1.load_state_dict(loaded_state_dict["m1"])
        m2.load_state_dict(loaded_state_dict["m2"])
        res2 = m1() + m2()

        test_case.assertTrue(
            np.array_equal(
                res1.to_consistent(sbp=flow.sbp.broadcast).to_local().numpy(),
                res2.to_consistent(sbp=flow.sbp.broadcast).to_local().numpy(),
            )
        )
def main(args):
    x_train_dir, y_train_dir = get_datadir_path(args, split="train")
    if not os.path.exists(args.save_checkpoint_path):
        os.mkdir(args.save_checkpoint_path)

    train_dataset = Dataset(
        x_train_dir,
        y_train_dir,
        augmentation=get_training_augmentation(),
    )
    batch_size = args.train_batch_size
    train_loader = flow_data.DataLoader(train_dataset, batch_size, shuffle=True)

    net = UNet(n_channels=3, n_classes=1)
    net.to("cuda")
    lr = args.learning_rate
    optimizer = flow.optim.RMSprop(net.parameters(), lr, weight_decay=args.weight_decay)
    criterion = nn.BCELoss()

    epoch = args.epochs
    num_steps = len(train_loader)
    for i in range(epoch):
        net.train()
        epoch_loss = 0
        for step, data in enumerate(train_loader):
            images, labels = data
            images = images.permute(0, 3, 1, 2)  # NHWC -> NCHW
            images = images / 255.0
            images = images.to("cuda", dtype=flow.float32)
            labels = labels.to("cuda", dtype=flow.float32)

            pred = net(images)
            loss = criterion(pred, labels)
            epoch_loss += loss.numpy()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            lr = optimizer.param_groups[0]["lr"]
            print("Train:[%d/%d][%d/%d] Training Loss: %.4f Lr: %.6f"
                  % ((i + 1), args.epochs, step, num_steps, loss.numpy(), lr))

        filename = "UNetmodel_Epoch_" + str(i)
        save_checkpoint_path = args.save_checkpoint_path
        flow.save(net.state_dict(), os.path.join(save_checkpoint_path, filename))
        print("save net successfully!")
def save(self, epoch, file_path="checkpoints/"):
    """
    Save the current model to file_path.

    :param epoch: current epoch number
    :param file_path: model output path; the final path is file_path + "epoch%d" % epoch
    :return: final output path
    """
    output_path = file_path + "epoch%d" % epoch
    flow.save(self.model.state_dict(), output_path)
    print("EP:%d Model Saved on:" % epoch, output_path)
    return output_path
def save(self, subdir):
    if self.save_path is None or self.save_path == "":
        return
    save_path = os.path.join(self.save_path, subdir)
    if self.rank == 0:
        print(f"Saving model to {save_path}")
    state_dict = self.wdl_module.state_dict()
    if self.is_global:
        flow.save(state_dict, save_path, global_dst_rank=0)
    elif self.rank == 0:
        flow.save(state_dict, save_path)
def __call__(self, global_step, epoch, backbone, is_consistent=False):
    if global_step > 100 and backbone is not None:
        path_module = os.path.join(self.output, "epoch_%d" % epoch)
        if is_consistent:
            flow.save(backbone.state_dict(), path_module, consistent_dst_rank=0)
        elif self.rank == 0:
            flow.save(backbone.state_dict(), path_module)
        logging.info("oneflow Model Saved in '{}'".format(path_module))
def save(self, subdir):
    if self.save_path is None:
        return
    save_path = os.path.join(self.save_path, subdir)
    self.logger.print(f"Saving model to {save_path}", print_ranks=[0])
    state_dict = self.model.state_dict()
    if self.is_global:
        flow.save(state_dict, save_path, global_dst_rank=0)
    elif self.rank == 0:
        flow.save(state_dict, save_path)
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    rmsprop = flow.optim.RMSprop(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "alpha": alpha,
                "eps": eps,
                "weight_decay": weight_decay,
                "momentum": momentum,
                "centered": centered,
                "clip_grad_max_norm": clip_grad_max_norm,
                "clip_grad_norm_type": clip_grad_norm_type,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        rmsprop.clip_grad()
        rmsprop.step()
        rmsprop.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = rmsprop.state_dict()
            rmsprop = flow.optim.RMSprop([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            rmsprop.load_state_dict(state_dict)
    return x
def train(
    model,
    device,
    train_data,
    dev_data,
    loss_func,
    optimizer,
    epochs,
    train_batch_size,
    eval_batch_size,
    save_path,
):
    global_acc = float("-inf")
    for i in range(epochs):
        x_batch, y_batch = batch_loader(train_data[0], train_data[1], train_batch_size)
        model.train()
        model.training = True
        training_loss = 0
        all_res, all_ground_truths = [], []
        total_correct = 0
        for idx, (data, label) in enumerate(tqdm(zip(x_batch, y_batch), total=len(x_batch))):
            data = data.to(device)
            label = label.to(device)
            logits = model(data)
            res = flow.argmax(logits, dim=1)
            total_correct += (res.numpy() == label.numpy()).sum()
            all_res.append(res)
            all_ground_truths.append(label)
            # One-hot encode the labels for the loss function.
            label = flow.tensor(np.eye(2)[label.numpy()], dtype=flow.float32).to(device)
            loss = loss_func(logits, label)
            training_loss += loss.numpy()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        all_ground_truths = flow.cat(all_ground_truths)
        train_acc = total_correct / len(all_ground_truths.numpy())
        acc = _eval(model, dev_data, device, eval_batch_size)
        if acc > global_acc:
            global_acc = acc
            if os.path.exists(save_path):
                shutil.rmtree(save_path)
            flow.save(model.state_dict(), save_path)
        print(
            f"[Epoch{i}] training loss: {training_loss/(idx+1)} "
            f"training accuracy: {train_acc} evaluation accuracy: {acc}"
        )
def save_checkpoint(
    state: dict,
    experiment_dir: Path,
    is_best: bool = False,
    filename="checkpoint",
):
    file_path: Path = experiment_dir / filename
    with safe_delete(file_path):
        flow.save(state, str(file_path))
    _logger.info("save checkpoint: %s", file_path)
    if is_best:
        best_file_path = experiment_dir / "model_best"
        with safe_delete(best_file_path):
            shutil.copytree(file_path, best_file_path)
        _logger.info("save best checkpoint: %s", best_file_path)
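# `safe_delete` is not defined in this snippet; a plausible minimal sketch is
# a context manager that clears the target path before it is rewritten
# (`os` and `shutil` assumed imported):
from contextlib import contextmanager

@contextmanager
def safe_delete(path):
    if os.path.exists(path):
        shutil.rmtree(path)
    yield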
def main():
    print("Generating data...", end="")
    voc_size = args.vocab_sz
    inp = np.arange(2, voc_size, 2)
    tgt = np.arange(3, voc_size, 2)
    data_x, data_y = get_numbers(inp, tgt)
    train_len = int(len(data_x) * 0.9)
    train_x, val_x = data_x[:train_len], data_x[train_len:]
    train_y, val_y = data_y[:train_len], data_y[train_len:]
    print("Done")

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    criterion = to_cuda(nn.CrossEntropyLoss())
    optimizer = flow.optim.Adam(model.parameters(), lr=args.lr)
    print("Done")

    print("Training...")
    min_loss = 100
    for i in range(1, args.n_epochs + 1):
        epoch_loss = train(model, criterion, optimizer, train_x, train_y)
        epoch_loss_val = validation(model, criterion, val_x, val_y)
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss < min_loss:
            min_loss = epoch_loss  # track the best loss so far
            if not os.path.exists(args.save_dir):
                os.mkdir(args.save_dir)
            else:
                shutil.rmtree(args.save_dir)
                assert not os.path.exists(args.save_dir)
                os.mkdir(args.save_dir)
            flow.save(model.state_dict(), args.save_dir)
        if i % 3 == 2:
            print(test(model, test_times=10))
def train_by_oneflow():
    x = flow.nn.Parameter(flow.Tensor(init_value, device=flow.device(device)))
    optim_kwargs = {
        "params": [x],
        "lr": learning_rate,
        "betas": betas,
        "eps": eps,
        "weight_decay": weight_decay,
        "adam_w_mode": adam_w_mode,
        "do_bias_correction": do_bias_correction,
    }
    if clip_grad_max_norm != -1:
        optim_kwargs["clip_grad_max_norm"] = clip_grad_max_norm
        optim_kwargs["clip_grad_norm_type"] = clip_grad_norm_type
    lamb = flow.optim.LAMB([optim_kwargs])

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        if clip_grad_max_norm != -1:
            lamb.clip_grad()
        lamb.step()
        lamb.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = lamb.state_dict()
            lamb = flow.optim.LAMB([optim_kwargs])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            lamb.load_state_dict(state_dict)
    return x
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    adam = flow.optim.Adam(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "betas": betas,
                "eps": eps,
                "weight_decay": weight_decay,
                "clip_grad_max_norm": clip_grad_max_norm,
                "clip_grad_norm_type": clip_grad_norm_type,
            }
        ],
        do_bias_correction=do_bias_correction,
        amsgrad=amsgrad,
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adam.clip_grad()
        adam.step()
        adam.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = adam.state_dict()
            adam = flow.optim.Adam(
                [{"params": [x]}],
                do_bias_correction=do_bias_correction,
            )
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            adam.load_state_dict(state_dict)
    return x
def convert_pt_checkpoint_to_of(
    model,
    pt_checkpoint_path="gpt2-pytorch_model.bin",
    of_checkpoint_path="gpt2_oneflow_model",
):
    import torch

    parameters = torch.load(pt_checkpoint_path)
    new_parameters = {}
    # The attention bias/masked_bias buffers of the 12 GPT-2 blocks are not
    # trainable weights; skip them.
    keys_to_ignore = [
        f"transformer.h.{i}.attn.{name}"
        for i in range(12)
        for name in ("bias", "masked_bias")
    ]
    for key, value in parameters.items():
        if key in keys_to_ignore:
            continue
        if "num_batches_tracked" not in key:
            new_parameters[key] = value.detach().cpu().numpy()
    model.load_state_dict(new_parameters, strict=False)
    # model.tie_embeddings()
    flow.save(model.state_dict(), of_checkpoint_path)
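# Usage sketch (illustrative paths; assumes a OneFlow GPT-2 implementation
# whose parameter names match the PyTorch checkpoint; `GPT2LMHeadModel` and
# `config` are hypothetical names):
model = GPT2LMHeadModel(config)
convert_pt_checkpoint_to_of(
    model,
    pt_checkpoint_path="gpt2-pytorch_model.bin",
    of_checkpoint_path="gpt2_oneflow_model",
)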
def save_pretrained(self, save_directory: str):
    """
    Save a model file to a directory.

    Arguments:
        save_directory: directory to which to save.
    """
    assert os.path.isdir(
        save_directory
    ), "Saving path should be a directory where the model can be saved"
    # Only save the model itself if we are using distributed training
    model_to_save = self.module if hasattr(self, "module") else self
    # If we save using the predefined names,
    # we can load using `from_pretrained`
    # output_model_file = os.path.join(save_directory, self.weights_name)
    flow.save(model_to_save.state_dict(), save_directory)
    logger.info("Model weights saved in {}".format(save_directory))
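# The loading counterpart is symmetric (sketch; how `from_pretrained`
# reconstructs the model before restoring weights varies by implementation):
state_dict = flow.load(save_directory)
model.load_state_dict(state_dict)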
def test_save_and_load(self):
    placement_arg = {
        "placement": flow.placement("cuda", ranks=[0]),
        "sbp": flow.sbp.broadcast,
    }
    graph = InferGraph(placement_arg)
    image_placeholder = flow.empty(
        (1, 3, 224, 224),
        dtype=flow.float32,
        placement=flow.placement("cpu", ranks=[0]),
        sbp=flow.sbp.broadcast,
    )
    graph._compile(image_placeholder)

    saved_path = os.path.join("saved_model", graph.name)
    if not os.path.exists(saved_path):
        os.makedirs(saved_path)
    flow.save(graph, saved_path)

    saved_ir_path = os.path.join(saved_path, "model.mlir")
    serialized_job = oneflow._oneflow_internal.nn.graph.LoadSerializedJobFromIR(saved_ir_path)
    job = job_pb.Job()
    job.ParseFromString(serialized_job)

    op_list = [op for op in job.net.op]
    op_list_ = [op for op in graph._forward_job_proto.net.op]

    def sort_by_op_name(op):
        return op.name

    op_list.sort(key=sort_by_op_name)
    op_list_.sort(key=sort_by_op_name)
    for op, op_ in zip(op_list, op_list_):
        # TODO: convert loc in MLIR
        op_.ClearField("loc")
        self.assertTrue(op == op_, {"op": op, "op_": op_})
def train(self, n_iterations):
    start = time.time()
    for iteration in range(n_iterations):
        if iteration >= self.config["annealing_iters"]:
            lambda_kl = self.config["lambda"]["lambda_kl"]
        else:
            lambda_kl = (self.config["lambda"]["lambda_kl"] * (iteration + 1)
                         / self.config["annealing_iters"])
        data = next(self.train_iter)
        meta = self.ae_step(data, lambda_kl)

        if iteration % self.args.summary_steps == 0:
            print(
                "Iter {0} | loss_kl {1:.3f} | "
                "loss_rec {2:.3f} | loss {3:.3f}".format(
                    iteration,
                    meta["loss_kl"],
                    meta["loss_rec"],
                    meta["loss"],
                ),
                flush=True,
            )

        if (iteration + 1) % self.args.save_steps == 0 or iteration + 1 == n_iterations:
            file_path = os.path.join(
                self.args.store_model_path, "iteration%d.pth.tar" % (iteration + 1))
            flow.save(self.model.state_dict(), file_path)
            print("Saving checkpoint model to %s" % file_path)
            # Prune checkpoints more than 24999 iterations older than the
            # current one.
            for dirs in os.listdir(self.args.store_model_path):
                dir_name = os.path.join(self.args.store_model_path, dirs)
                dir = dir_name.split("/")[-1]
                dir = re.findall(r"\d+", dir)
                if dir == []:
                    dir = 100000000
                else:
                    dir = int(dir[0])
                if (iteration + 1) - dir >= 24999:
                    shutil.rmtree(dir_name)
    print("Train Time {0:.2f}s".format(time.time() - start))
def test_warmup_scheduler_save_and_load(test_case):
    param = flow.nn.Parameter(flow.ones(3, 4))

    optimizer = flow.optim.SGD([param])
    cosine_scheduler = flow.optim.lr_scheduler.CosineAnnealingLR(optimizer, 100)
    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        cosine_scheduler,
        warmup_factor=0.1,
        warmup_iters=5,
        warmup_method="linear",
    )
    for _ in range(random.randint(1, 10)):
        lr_scheduler.step()

    # save
    with tempfile.TemporaryDirectory() as save_dir:
        flow.save(lr_scheduler.state_dict(), save_dir)
        state_dict = flow.load(save_dir)

    # load into a fresh optimizer/scheduler pair
    param2 = flow.nn.Parameter(flow.ones(3, 4))
    optimizer2 = flow.optim.SGD([param2])
    cosine_scheduler2 = flow.optim.lr_scheduler.CosineAnnealingLR(optimizer2, 50)
    lr_scheduler2 = flow.optim.lr_scheduler.WarmUpLR(
        cosine_scheduler2,
        warmup_factor=0.5,
        warmup_iters=10,
        warmup_method="linear",
    )
    lr_scheduler2.load_state_dict(state_dict)

    # compare warmup scheduler state
    for attr in ["warmup_iters", "warmup_factor", "warmup_method", "last_step"]:
        test_case.assertEqual(getattr(lr_scheduler, attr), getattr(lr_scheduler2, attr))
    # compare cosine annealing state
    for attr in ["T_max", "eta_min", "last_step"]:
        test_case.assertEqual(getattr(cosine_scheduler, attr), getattr(cosine_scheduler2, attr))