예제 #1
0
def convert_func(cfg, model_path, out_path, image_size):
    """Load a checkpoint into a backbone and run the ONNX conversion check.

    Args:
        cfg: Config object; ``cfg.network`` and ``cfg.embedding_size`` are
            forwarded to ``get_model``.
        model_path: Checkpoint location handed to ``flow.load``.
        out_path: Not read by this function; the ONNX output location is
            fixed to ``"./"``.
        image_size: Side length of the square random input used to compile
            the graph.
    """
    backbone = get_model(
        cfg.network,
        dropout=0.0,
        num_features=cfg.embedding_size,
    ).to("cuda")
    backbone.eval()
    print(backbone)

    graph = ModelGraph(backbone)
    dummy_input = flow.randn(1, 3, image_size, image_size).to("cuda")
    graph._compile(dummy_input)

    with tempfile.TemporaryDirectory() as weight_dir:
        checkpoint = flow.load(model_path)
        # Skip every key containing "num_batches_tracked" as well as the
        # "fc.weight" entry, and strip the "backbone." prefix from the rest.
        renamed = {
            name.replace("backbone.", ""): tensor
            for name, tensor in checkpoint.items()
            if "num_batches_tracked" not in name and name != "fc.weight"
        }
        backbone.load_state_dict(renamed)
        # The converter reads the weights back from this scratch directory.
        flow.save(backbone.state_dict(), weight_dir)
        convert_to_onnx_and_check(
            graph,
            flow_weight_dir=weight_dir,
            onnx_model_path="./",
            print_outlier=True,
        )
예제 #2
0
def verify_math(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs=flow.tensor(
        np.random.rand(100, 1),
        dtype=flow.float32,
    ),
    device="llvm",
):
    """Compare the OneFlow and TVM outputs of a single-input model.

    Args:
        model: Module wrapped in ``OneFlowGraph`` and whose state dict is
            saved to ``MODEL_HOME``.
        name: Not used by this function.
        rtol: Relative tolerance for ``tvm.testing.assert_allclose``.
        atol: Absolute tolerance for ``tvm.testing.assert_allclose``.
        inputs: Input tensor. The default is created once, when the function
            is defined, and shared by every call that omits it.
        device: TVM target; when ``"cuda"`` the model and inputs are moved
            to that device first.

    Raises:
        AssertionError: Presumably raised by ``assert_shape`` /
            ``assert_allclose`` on mismatch (helpers not shown here).
    """
    if device == "cuda":
        model.to(device)
        inputs = inputs.to(device)

    graph = OneFlowGraph(model)
    graph._compile(inputs)

    mkdir(MODEL_HOME)
    try:
        flow.save(model.state_dict(), MODEL_HOME)

        out_flow = get_oneflow_output(graph, inputs)
        out_tvm = get_tvm_output(graph, MODEL_HOME, inputs, target=device)
    finally:
        # Remove the scratch weights even when saving or conversion fails,
        # so a failed run does not leave MODEL_HOME behind for later calls.
        rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
예제 #3
0
def verify_concat(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs1=flow.tensor(np.random.randn(2, 5, 5, 4), dtype=flow.float32),
    inputs2=flow.tensor(np.random.randn(2, 5, 5, 2), dtype=flow.float32),
    inputs3=flow.tensor(np.random.randn(2, 5, 5, 3), dtype=flow.float32),
    device="llvm",
):
    """Compare the OneFlow and TVM outputs of a three-input model.

    Args:
        model: Module wrapped in ``OneFlowGraphV2`` and whose state dict is
            saved to ``MODEL_HOME``.
        name: Not used by this function.
        rtol: Relative tolerance for ``tvm.testing.assert_allclose``.
        atol: Absolute tolerance for ``tvm.testing.assert_allclose``.
        inputs1: First input tensor.
        inputs2: Second input tensor.
        inputs3: Third input tensor. All three defaults are created once,
            when the function is defined.
        device: TVM target; when ``"cuda"`` the model and all inputs are
            moved to that device first.
    """
    if device == "cuda":
        model.to(device)
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)
        inputs3 = inputs3.to(device)

    graph = OneFlowGraphV2(model)
    graph._compile(inputs1, inputs2, inputs3)

    mkdir(MODEL_HOME)
    try:
        flow.save(model.state_dict(), MODEL_HOME)

        out_flow = get_oneflow_concat_output(graph, inputs1, inputs2, inputs3)
        out_tvm = get_tvm_concat_output(
            graph,
            MODEL_HOME,
            inputs1,
            inputs2,
            inputs3,
            target=device,
        )
    finally:
        # Remove the scratch weights even when saving or conversion fails,
        # so a failed run does not leave MODEL_HOME behind for later calls.
        rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
예제 #4
0
    def train_by_oneflow():
        """Drive an SGD optimizer over a fixed gradient sequence.

        All hyper-parameters (``init_value``, ``device``, ``learning_rate``,
        ``momentum``, ``weight_decay``, ``train_iters``,
        ``random_grad_seq``, ``reload_state_step``, ``save_load_by_pickle``)
        come from the enclosing scope.

        Returns:
            The trained parameter.
        """
        weight = Parameter(flow.Tensor(init_value, device=flow.device(device)))
        param_group = {
            "params": [weight],
            "lr": learning_rate,
            "momentum": momentum,
            "weight_decay": weight_decay,
        }
        optimizer = flow.optim.SGD([param_group])

        def apply_gradient(raw_grad):
            # The gradient of sum(weight * g) w.r.t. weight is g, so the
            # backward pass installs `raw_grad` as the parameter's gradient.
            target = flow.tensor(
                raw_grad,
                dtype=flow.float32,
                requires_grad=False,
                device=flow.device(device),
            )
            flow.sum(weight * target).backward()
            optimizer.step()
            optimizer.zero_grad()

        for step in range(train_iters):
            apply_gradient(random_grad_seq[step])
            if step != reload_state_step:
                continue
            # Exercise state_dict()/load_state_dict() on a fresh optimizer,
            # optionally round-tripping the state through flow.save/load.
            saved_state = optimizer.state_dict()
            optimizer = flow.optim.SGD([weight])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as scratch_dir:
                    flow.save(saved_state, scratch_dir)
                    saved_state = flow.load(scratch_dir)
            optimizer.load_state_dict(saved_state)
        return weight
예제 #5
0
def convert_weights(
    model_type: str,
    from_model: str,
    from_path: str,
    config_path: str,
    to_model: str,
    dump_path: str,
):
    """Load weights from one framework's checkpoint and dump them for another.

    Args:
        model_type: Key selecting the model class and weight loader.
        from_model: Key of the source framework in ``LOAD_WEIGHTS_MAPS``.
        from_path: Source checkpoint location passed to the loader.
        config_path: Path used to build ``ConfigBase``.
        to_model: Target framework key; ``"pt"``, ``"tf"``, ``"ms"``,
            ``"of"`` and ``"pd"`` have a save routine.
        dump_path: Destination passed to the target framework's save call.
    """
    target_class = MODEL_CLASSES[to_model][model_type]
    config = ConfigBase(config_path)
    model = target_class(config)
    loader = LOAD_WEIGHTS_MAPS[to_model][model_type][from_model]
    if to_model == "tf":
        # Call the TF model once on a dummy batch before loading weights
        # (presumably so its variables exist - confirm).
        model(tf.ones([3, 4], dtype=tf.int32))
    loader(model, config, from_path)

    # One save routine per target framework; lambdas keep the framework
    # modules from being touched unless that target is selected.
    savers = {
        "pt": lambda: torch.save(model.state_dict(), dump_path),
        "tf": lambda: model.save_weights(dump_path),
        "ms": lambda: mindspore.save_checkpoint(model, dump_path),
        "of": lambda: flow.save(model.state_dict(), dump_path),
        "pd": lambda: paddle.save(model.state_dict(), dump_path),
    }
    write = savers.get(to_model)
    if write is not None:
        write()
    print("Save {} model to {}".format(to_model, dump_path))
예제 #6
0
def verify_matmul(
    model,
    name="",
    rtol=1e-5,
    atol=1e-5,
    inputs1=flow.tensor(np.random.randn(2, 5), dtype=flow.float32),
    inputs2=flow.tensor(np.random.randn(5, 2), dtype=flow.float32),
    device="llvm",
):
    """Compare the OneFlow and TVM outputs of a two-input model.

    Args:
        model: Module wrapped in ``OneFlowGraph_v3`` and whose state dict is
            saved to ``MODEL_HOME``.
        name: Not used by this function.
        rtol: Relative tolerance for ``tvm.testing.assert_allclose``.
        atol: Absolute tolerance for ``tvm.testing.assert_allclose``.
        inputs1: First input tensor.
        inputs2: Second input tensor. Both defaults are created once, when
            the function is defined.
        device: TVM target; when ``"cuda"`` the model and both inputs are
            moved to that device first.
    """
    if device == "cuda":
        model.to(device)
        inputs1 = inputs1.to(device)
        inputs2 = inputs2.to(device)

    graph = OneFlowGraph_v3(model)
    graph._compile(inputs1, inputs2)

    mkdir(MODEL_HOME)
    try:
        flow.save(model.state_dict(), MODEL_HOME)

        out_flow = get_oneflow_elementwise_output(graph, inputs1, inputs2)
        out_tvm = get_tvm_elementwise_output(
            graph,
            MODEL_HOME,
            inputs1,
            inputs2,
            target=device,
        )
    finally:
        # Remove the scratch weights even when saving or conversion fails,
        # so a failed run does not leave MODEL_HOME behind for later calls.
        rmdir(MODEL_HOME)

    assert_shape(out_flow, out_tvm)
    tvm.testing.assert_allclose(out_flow, out_tvm, rtol=rtol, atol=atol)
예제 #7
0
 def save_checkpoint(self, params, name):
     """Write the params and each sub-module's state dict under ``name``.

     Args:
         params: Object saved as ``params.tar``.
         name: Directory path that every output file is joined onto.
     """
     flow.save(params, os.path.join(name, "params.tar"))
     # (attribute on self, output file name), saved in this order.
     components = (
         ("frontend", "frontend.pt"),
         ("encoder", "encoder.pt"),
         ("decoder", "decoder.pt"),
     )
     for attr_name, file_name in components:
         module = getattr(self, attr_name)
         flow.save(module.state_dict(), os.path.join(name, file_name))
     # The assistor is only saved when the CTC weight is positive.
     if self.ctc_weight > 0.0:
         flow.save(self.assistor.state_dict(), os.path.join(name, "ctc.pt"))
예제 #8
0
    def train_by_oneflow():
        """Drive an Adagrad optimizer over a fixed gradient sequence.

        All hyper-parameters (``init_value``, ``device``, ``learning_rate``,
        ``eps``, ``weight_decay``, ``lr_decay``,
        ``initial_accumulator_value``, ``train_iters``, ``random_grad_seq``,
        ``reload_state_step``, ``save_load_by_pickle``) come from the
        enclosing scope.

        Returns:
            The trained parameter.
        """
        weight = Parameter(flow.Tensor(init_value, device=flow.device(device)))
        param_group = {
            "params": [weight],
            "lr": learning_rate,
            "eps": eps,
            "weight_decay": weight_decay,
        }
        optimizer = flow.optim.Adagrad(
            [param_group],
            lr_decay=lr_decay,
            initial_accumulator_value=initial_accumulator_value,
        )

        def apply_gradient(raw_grad):
            # The gradient of sum(weight * g) w.r.t. weight is g, so the
            # backward pass installs `raw_grad` as the parameter's gradient.
            target = flow.tensor(
                raw_grad, requires_grad=False, device=flow.device(device)
            )
            flow.sum(weight * target).backward()
            optimizer.step()
            optimizer.zero_grad()

        for step in range(train_iters):
            apply_gradient(random_grad_seq[step])
            if step != reload_state_step:
                continue
            # Move the optimizer state onto a freshly built Adagrad,
            # optionally round-tripping it through flow.save/load.
            saved_state = optimizer.state_dict()
            optimizer = flow.optim.Adagrad([weight])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as scratch_dir:
                    flow.save(saved_state, scratch_dir)
                    saved_state = flow.load(scratch_dir)
            optimizer.load_state_dict(saved_state)
        return weight
예제 #9
0
def save_checkpoint(state, is_best, task_id, filename="checkpoints/"):
    """Save the checkpoint for ``task_id`` and drop the previous task's one.

    Args:
        state: Mapping whose ``"state_dict"`` entry is saved.
        is_best: When true, the new checkpoint is also copied to
            ``checkpoints/model_best``.
        task_id: Task identifier as a string of digits; it is both parsed
            with ``int`` and concatenated onto ``filename``.
        filename: Path prefix the task id is appended to.
    """
    previous_path = filename + str(int(task_id) - 1)
    del_file(previous_path)

    current_path = filename + task_id
    flow.save(state["state_dict"], current_path)

    if not is_best:
        return
    best_path = "checkpoints/model_best"
    # Replace any earlier best checkpoint with a copy of the new one.
    del_file(best_path)
    shutil.copytree(current_path, best_path)
예제 #10
0
 def save_model(subdir):
     """Save the DeepFM module's state dict under ``args.model_save_dir``.

     Does nothing when ``args.model_save_dir`` is falsy. ``args``, ``rank``
     and ``deepfm_module`` come from the enclosing scope.

     Args:
         subdir: Sub-directory name joined onto the model save directory.
     """
     if args.model_save_dir:
         save_path = os.path.join(args.model_save_dir, subdir)
         if rank == 0:
             print(f"Saving model to {save_path}")
         # Called on every rank; global_dst_rank=0 is passed to flow.save.
         flow.save(deepfm_module.state_dict(), save_path, global_dst_rank=0)
예제 #11
0
파일: train.py 프로젝트: Oneflow-Inc/models
def train(opt):
    """Train ``simple_CNN`` on randomly drawn batches and save its weights.

    Args:
        opt: Options object providing ``label_dict``, ``num_speakers``,
            ``lr``, ``alpha``, ``eps``, ``output_path``, ``N_batches``,
            ``N_epoches``, ``batch_size``, ``wlen`` and ``fact_amp``.
    """
    with open(opt.label_dict, "r") as label_file:
        lab_dict = json.load(label_file)

    model = simple_CNN(opt.num_speakers)
    model.to("cuda")

    criterion = nn.CrossEntropyLoss()
    criterion.to("cuda")

    optimizer = optim.RMSprop(
        model.parameters(), lr=opt.lr, alpha=opt.alpha, eps=opt.eps
    )

    output_folder = opt.output_path
    batches_per_epoch = opt.N_batches
    num_epochs = opt.N_epoches

    for epoch in range(num_epochs):
        model.train()

        running_loss = 0
        running_err = 0

        for _ in range(batches_per_epoch):
            inp, lab = create_batches_rnd(
                lab_dict,
                batch_size=opt.batch_size,
                wlen=opt.wlen,
                fact_amp=opt.fact_amp,
                train=True,
            )
            inp = inp.unsqueeze(1)
            # Shift labels down by one in place (presumably 1-based labels
            # to 0-based class ids - confirm against create_batches_rnd).
            lab -= 1

            logits = model(inp)
            pred = flow.argmax(logits, dim=1)
            targets = lab.long()
            loss = criterion(logits, targets)
            # Fraction of misclassified samples in this batch.
            err = np.mean(pred.numpy() != targets.numpy())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            running_loss = running_loss + loss.detach()
            running_err = running_err + err

        loss_tot = running_loss / batches_per_epoch
        err_tot = running_err / batches_per_epoch

        # Report the epoch averages every tenth epoch.
        if epoch % 10 == 0:
            print("epoch %i, loss_tr=%f err_tr=%f" %
                  (epoch, loss_tot.numpy(), err_tot))

    flow.save(model.state_dict(), os.path.join(output_folder, "CNN_model"))
예제 #12
0
파일: train.py 프로젝트: Oneflow-Inc/models
    def save(self, subdir):
        """Save the model's state dict under the configured checkpoint path.

        Does nothing when ``self.args.checkpoint_save_path`` is ``None``.

        Args:
            subdir: Sub-directory name joined onto the checkpoint path.
        """
        checkpoint_root = self.args.checkpoint_save_path
        if checkpoint_root is None:
            return

        save_path = os.path.join(checkpoint_root, subdir)
        print_rank_0(f"Saving model to {save_path}")
        flow.save(self.model.state_dict(), save_path, consistent_dst_rank=0)
예제 #13
0
파일: train.py 프로젝트: Oneflow-Inc/models
    def save(self, subdir):
        """Save the model's state dict under ``self.save_path``.

        Does nothing when ``self.save_path`` is ``None``.

        Args:
            subdir: Sub-directory name joined onto ``self.save_path``.
        """
        if self.save_path is not None:
            save_path = os.path.join(self.save_path, subdir)
            flow.save(self.model.state_dict(), save_path, consistent_dst_rank=0)
            # The message is printed after the save, on rank -1 or 0 only.
            if self.rank in (-1, 0):
                print(f"Saving model to {save_path}")
예제 #14
0
    def test_save_and_load_consistent_from_nested_dict(test_case):
        """Round-trip a nested dict of consistent state dicts via save/load."""

        class CustomModule(flow.nn.Module):
            def __init__(self):
                super().__init__()
                self.param = flow.nn.Parameter(flow.randn(3, 32, 3, 3))

            def forward(self):
                return self.param

        def make_broadcast_module():
            # Fresh module converted to a broadcast consistent module.
            module = CustomModule()
            return module.to_consistent(
                flow.placement("cuda", {0: range(2)}), flow.sbp.broadcast
            )

        m1 = make_broadcast_module()
        m2 = make_broadcast_module()
        res1 = m1() + m2()
        state_dict = {"m1": m1.state_dict(), "m2": m2.state_dict()}

        dst_rank = 0
        with tempfile.TemporaryDirectory() as ckpt_dir:
            # Saving without a destination rank is expected to raise.
            with test_case.assertRaises(Exception):
                flow.save(state_dict, ckpt_dir)

            flow.save(state_dict, ckpt_dir, consistent_dst_rank=dst_rank)
            # Ranks other than the destination must not have written files.
            if flow.env.get_rank() != dst_rank:
                test_case.assertEqual(len(os.listdir(ckpt_dir)), 0)

            m1 = make_broadcast_module()
            m2 = make_broadcast_module()

            # Loading without a source rank and applying it is expected to
            # raise somewhere inside this block.
            with test_case.assertRaises(Exception):
                loaded_state_dict = flow.load(ckpt_dir)
                m1.load_state_dict(loaded_state_dict["m1"])

            loaded_state_dict = flow.load(ckpt_dir,
                                          consistent_src_rank=dst_rank)
            test_case.assertEqual(len(loaded_state_dict), 2)
            m1.load_state_dict(loaded_state_dict["m1"])
            m2.load_state_dict(loaded_state_dict["m2"])
            res2 = m1() + m2()

        before = res1.to_consistent(sbp=flow.sbp.broadcast).to_local().numpy()
        after = res2.to_consistent(sbp=flow.sbp.broadcast).to_local().numpy()
        test_case.assertTrue(np.array_equal(before, after))
예제 #15
0
파일: train.py 프로젝트: Oneflow-Inc/models
def main(args):
    """Train a UNet and save its state dict after every epoch.

    Args:
        args: Options object providing ``save_checkpoint_path``,
            ``train_batch_size``, ``learning_rate``, ``weight_decay`` and
            ``epochs`` (plus whatever ``get_datadir_path`` reads).
    """
    x_train_dir, y_train_dir = get_datadir_path(args, split="train")
    if not os.path.exists(args.save_checkpoint_path):
        os.mkdir(args.save_checkpoint_path)

    train_dataset = Dataset(
        x_train_dir,
        y_train_dir,
        augmentation=get_training_augmentation(),
    )
    train_loader = flow_data.DataLoader(
        train_dataset, args.train_batch_size, shuffle=True
    )

    net = UNet(n_channels=3, n_classes=1)
    net.to("cuda")

    optimizer = flow.optim.RMSprop(
        net.parameters(), args.learning_rate, weight_decay=args.weight_decay
    )

    criterion = nn.BCELoss()
    total_epochs = args.epochs
    num_steps = len(train_loader)
    for epoch_idx in range(total_epochs):
        net.train()
        epoch_loss = 0

        for step, (images, labels) in enumerate(train_loader):
            # Reorder axes (0, 3, 1, 2) and divide by 255 before moving the
            # batch to the GPU (presumably NHWC uint8 images - confirm).
            images = images.permute(0, 3, 1, 2) / 255.0
            images = images.to("cuda", dtype=flow.float32)
            labels = labels.to("cuda", dtype=flow.float32)

            pred = net(images)
            loss = criterion(pred, labels)
            epoch_loss += loss.numpy()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            lr = optimizer.param_groups[0]["lr"]
            print("Train:[%d/%d][%d/%d] Training Loss: %.4f Lr: %.6f" %
                  ((epoch_idx + 1), args.epochs, step, num_steps,
                   loss.numpy(), lr))

        # One checkpoint per epoch, named after the zero-based epoch index.
        filename = "UNetmodel_Epoch_" + str(epoch_idx)
        flow.save(net.state_dict(),
                  os.path.join(args.save_checkpoint_path, filename))
        print("save net successfully!")
예제 #16
0
    def save(self, epoch, file_path="checkpoints/"):
        """
        Save the current model's state dict.

        :param epoch: current epoch number
        :param file_path: prefix of the output path; the final path is
            file_path + "epoch%d" % epoch
        :return: the path the state dict was saved to
        """
        epoch_suffix = "epoch%d" % epoch
        output_path = "".join((file_path, epoch_suffix))
        flow.save(self.model.state_dict(), output_path)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
예제 #17
0
 def save(self, subdir):
     """Save the WDL module's state dict under ``self.save_path``.

     Does nothing when ``self.save_path`` is ``None`` or empty.

     Args:
         subdir: Sub-directory name joined onto ``self.save_path``.
     """
     root = self.save_path
     if root is None or root == "":
         return

     save_path = os.path.join(root, subdir)
     if self.rank == 0:
         print(f"Saving model to {save_path}")

     state_dict = self.wdl_module.state_dict()
     if self.is_global:
         # Global mode: every rank calls save with global_dst_rank=0.
         flow.save(state_dict, save_path, global_dst_rank=0)
         return
     # Otherwise only rank 0 writes the checkpoint.
     if self.rank == 0:
         flow.save(state_dict, save_path)
예제 #18
0
    def __call__(self, global_step, epoch, backbone, is_consistent=False):
        """Save the backbone's state dict for ``epoch`` once training is past step 100.

        Args:
            global_step: Current step; nothing happens unless it exceeds 100.
            epoch: Epoch number used in the ``epoch_%d`` directory name.
            backbone: Module to save; nothing happens when it is ``None``.
            is_consistent: When true, every rank calls ``flow.save`` with
                ``consistent_dst_rank=0``; otherwise only rank 0 saves.
        """
        if not (global_step > 100 and backbone is not None):
            return

        path_module = os.path.join(self.output, "epoch_%d" % (epoch))
        if is_consistent:
            flow.save(backbone.state_dict(),
                      path_module,
                      consistent_dst_rank=0)
        elif self.rank == 0:
            flow.save(backbone.state_dict(), path_module)
        # Logged on every rank, whether or not this rank wrote anything.
        logging.info("oneflow Model Saved in '{}'".format(path_module))
예제 #19
0
    def save(self, subdir):
        """Save the model's state dict under ``self.save_path``.

        Does nothing when ``self.save_path`` is ``None``.

        Args:
            subdir: Sub-directory name joined onto ``self.save_path``.
        """
        root = self.save_path
        if root is None:
            return

        save_path = os.path.join(root, subdir)
        self.logger.print(f"Saving model to {save_path}", print_ranks=[0])

        state_dict = self.model.state_dict()
        if self.is_global:
            # Global mode: every rank calls save with global_dst_rank=0.
            flow.save(state_dict, save_path, global_dst_rank=0)
            return
        # Otherwise only rank 0 writes the checkpoint.
        if self.rank == 0:
            flow.save(state_dict, save_path)
예제 #20
0
    def train_by_oneflow():
        """Drive an RMSprop optimizer over a fixed gradient sequence.

        All hyper-parameters (``init_value``, ``device``, ``learning_rate``,
        ``alpha``, ``eps``, ``weight_decay``, ``momentum``, ``centered``,
        ``clip_grad_max_norm``, ``clip_grad_norm_type``, ``train_iters``,
        ``random_grad_seq``, ``reload_state_step``, ``save_load_by_pickle``)
        come from the enclosing scope.

        Returns:
            The trained parameter.
        """
        weight = Parameter(flow.Tensor(init_value, device=flow.device(device)))
        param_group = {
            "params": [weight],
            "lr": learning_rate,
            "alpha": alpha,
            "eps": eps,
            "weight_decay": weight_decay,
            "momentum": momentum,
            "centered": centered,
            "clip_grad_max_norm": clip_grad_max_norm,
            "clip_grad_norm_type": clip_grad_norm_type,
        }
        optimizer = flow.optim.RMSprop([param_group])

        def apply_gradient(raw_grad):
            # The gradient of sum(weight * g) w.r.t. weight is g, so the
            # backward pass installs `raw_grad` as the parameter's gradient.
            target = flow.tensor(
                raw_grad,
                dtype=flow.float32,
                requires_grad=False,
                device=flow.device(device),
            )
            flow.sum(weight * target).backward()
            optimizer.clip_grad()
            optimizer.step()
            optimizer.zero_grad()

        for step in range(train_iters):
            apply_gradient(random_grad_seq[step])
            if step != reload_state_step:
                continue
            # Move the optimizer state onto a freshly built RMSprop,
            # optionally round-tripping it through flow.save/load.
            saved_state = optimizer.state_dict()
            optimizer = flow.optim.RMSprop([weight])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as scratch_dir:
                    flow.save(saved_state, scratch_dir)
                    saved_state = flow.load(scratch_dir)
            optimizer.load_state_dict(saved_state)
        return weight
예제 #21
0
def train(
    model,
    device,
    train_data,
    dev_data,
    loss_func,
    optimizer,
    epochs,
    train_batch_size,
    eval_batch_size,
    save_path,
):
    """Train ``model`` and keep the checkpoint with the best dev accuracy.

    Args:
        model: Module to train.
        device: Device the batches are moved to.
        train_data: Indexable pair; items 0 and 1 are handed to
            ``batch_loader`` together with ``train_batch_size``.
        dev_data: Evaluation data passed to ``_eval``.
        loss_func: Callable applied to ``(logits, one_hot_label)``.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of epochs.
        train_batch_size: Batch size for training.
        eval_batch_size: Batch size passed to ``_eval``.
        save_path: Checkpoint directory; replaced whenever dev accuracy
            improves.
    """
    global_acc = float("-inf")
    for i in range(epochs):
        x_batch, y_batch = batch_loader(train_data[0], train_data[1],
                                        train_batch_size)
        model.train()
        model.training = True
        training_loss = 0
        predictions, all_ground_truths = [], []
        total_correct = 0
        total_wrongs = 0
        progress = tqdm(zip(x_batch, y_batch), total=len(x_batch))
        for idx, (data, label) in enumerate(progress):
            data = data.to(device)
            label = label.to(device)
            logits = model(data)
            res = flow.argmax(logits, dim=1)
            total_correct += (res.numpy() == label.numpy()).sum()
            predictions.append(res)
            all_ground_truths.append(label)
            # Turn the integer labels into two-class one-hot rows.
            one_hot = flow.tensor(np.eye(2)[label.numpy()],
                                  dtype=flow.float32).to(device)
            loss = loss_func(logits, one_hot)
            training_loss += loss.numpy()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        all_ground_truths = flow.cat(all_ground_truths)
        train_acc = total_correct / len(all_ground_truths.numpy())
        acc = _eval(model, dev_data, device, eval_batch_size)
        if acc > global_acc:
            global_acc = acc
            # Drop the previous best checkpoint before writing the new one.
            if os.path.exists(save_path):
                shutil.rmtree(save_path)
            flow.save(model.state_dict(), save_path)
        print(
            f"[Epoch{i}] training loss: {training_loss/(idx+1)}  training accuracy: {train_acc} evaluation accuracy: {acc}"
        )
예제 #22
0
def save_checkpoint(
    state: dict,
    experiment_dir: Path,
    is_best: bool = False,
    filename="checkpoint",
):
    """Save ``state`` under ``experiment_dir`` and optionally mark it as best.

    Args:
        state: Object passed to ``flow.save``.
        experiment_dir: Directory the checkpoint is written into.
        is_best: When true, the checkpoint is also copied to
            ``experiment_dir / "model_best"``.
        filename: Name of the checkpoint inside ``experiment_dir``.
    """
    checkpoint_path: Path = experiment_dir / filename
    with safe_delete(checkpoint_path):
        flow.save(state, str(checkpoint_path))
        _logger.info("save checkpoint: %s", checkpoint_path)

    if not is_best:
        return

    best_path = experiment_dir / "model_best"
    with safe_delete(best_path):
        shutil.copytree(checkpoint_path, best_path)
        _logger.info("save best checkpoint: %s", best_path)
예제 #23
0
def main():
    """Train the transformer and checkpoint it when training loss improves.

    Reads its settings from the module-level ``args``: ``vocab_sz``, the
    model hyper-parameters, ``load_dir``, ``lr``, ``n_epochs`` and
    ``save_dir``.
    """
    print("Generating data...", end="")
    voc_size = args.vocab_sz
    inp = np.arange(2, voc_size, 2)
    tgt = np.arange(3, voc_size, 2)
    data_x, data_y = get_numbers(inp, tgt)
    # First 90% of the samples for training, the rest for validation.
    train_len = int(len(data_x) * 0.9)
    train_x, val_x = data_x[:train_len], data_x[train_len:]
    train_y, val_y = data_y[:train_len], data_y[train_len:]
    print("Done")

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    # "." is the value that means "no checkpoint to resume from".
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    criterion = to_cuda(nn.CrossEntropyLoss())

    optimizer = flow.optim.Adam(model.parameters(), lr=args.lr)
    print("Done")

    print("Training...")

    min_loss = 100
    for i in range(1, args.n_epochs + 1):
        epoch_loss = train(model, criterion, optimizer, train_x, train_y)
        epoch_loss_val = validation(model, criterion, val_x, val_y)
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss < min_loss:
            # Start from an empty directory; rmtree raises if removal fails.
            if os.path.exists(args.save_dir):
                shutil.rmtree(args.save_dir)
            os.mkdir(args.save_dir)
            flow.save(model.state_dict(), args.save_dir)
            # Record the new best loss so later, worse epochs do not
            # overwrite this checkpoint (previously min_loss stayed at 100).
            min_loss = epoch_loss
        # Print a sample evaluation on epochs 2, 5, 8, ...
        if i % 3 == 2:
            print(test(model, test_times=10))
예제 #24
0
    def train_by_oneflow():
        """Drive a LAMB optimizer over a fixed gradient sequence.

        All hyper-parameters (``init_value``, ``device``, ``learning_rate``,
        ``betas``, ``eps``, ``weight_decay``, ``adam_w_mode``,
        ``do_bias_correction``, ``clip_grad_max_norm``,
        ``clip_grad_norm_type``, ``train_iters``, ``random_grad_seq``,
        ``reload_state_step``, ``save_load_by_pickle``) come from the
        enclosing scope.

        Returns:
            The trained parameter.
        """
        weight = flow.nn.Parameter(
            flow.Tensor(init_value, device=flow.device(device)))

        use_clipping = clip_grad_max_norm != -1
        optim_kwargs = {
            "params": [weight],
            "lr": learning_rate,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
            "adam_w_mode": adam_w_mode,
            "do_bias_correction": do_bias_correction,
        }
        # A max norm of -1 means clipping is off, so the clip options are
        # only added to the group otherwise.
        if use_clipping:
            optim_kwargs["clip_grad_max_norm"] = clip_grad_max_norm
            optim_kwargs["clip_grad_norm_type"] = clip_grad_norm_type

        optimizer = flow.optim.LAMB([optim_kwargs])

        def apply_gradient(raw_grad):
            # The gradient of sum(weight * g) w.r.t. weight is g, so the
            # backward pass installs `raw_grad` as the parameter's gradient.
            target = flow.tensor(
                raw_grad,
                dtype=flow.float32,
                requires_grad=False,
                device=flow.device(device),
            )
            flow.sum(weight * target).backward()
            if clip_grad_max_norm != -1:
                optimizer.clip_grad()
            optimizer.step()
            optimizer.zero_grad()

        for step in range(train_iters):
            apply_gradient(random_grad_seq[step])
            if step != reload_state_step:
                continue
            # Rebuild LAMB from the same group dict and restore its state,
            # optionally round-tripping it through flow.save/load.
            saved_state = optimizer.state_dict()
            optimizer = flow.optim.LAMB([optim_kwargs])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as scratch_dir:
                    flow.save(saved_state, scratch_dir)
                    saved_state = flow.load(scratch_dir)
            optimizer.load_state_dict(saved_state)
        return weight
예제 #25
0
    def train_by_oneflow():
        """Drive an Adam optimizer over a fixed gradient sequence.

        All hyper-parameters (``init_value``, ``device``, ``learning_rate``,
        ``betas``, ``eps``, ``weight_decay``, ``clip_grad_max_norm``,
        ``clip_grad_norm_type``, ``do_bias_correction``, ``amsgrad``,
        ``train_iters``, ``random_grad_seq``, ``reload_state_step``,
        ``save_load_by_pickle``) come from the enclosing scope.

        Returns:
            The trained parameter.
        """
        weight = Parameter(flow.Tensor(init_value, device=flow.device(device)))
        param_group = {
            "params": [weight],
            "lr": learning_rate,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
            "clip_grad_max_norm": clip_grad_max_norm,
            "clip_grad_norm_type": clip_grad_norm_type,
        }
        optimizer = flow.optim.Adam(
            [param_group],
            do_bias_correction=do_bias_correction,
            amsgrad=amsgrad,
        )

        def apply_gradient(raw_grad):
            # The gradient of sum(weight * g) w.r.t. weight is g, so the
            # backward pass installs `raw_grad` as the parameter's gradient.
            target = flow.tensor(
                raw_grad,
                dtype=flow.float32,
                requires_grad=False,
                device=flow.device(device),
            )
            flow.sum(weight * target).backward()
            optimizer.clip_grad()
            optimizer.step()
            optimizer.zero_grad()

        for step in range(train_iters):
            apply_gradient(random_grad_seq[step])
            if step != reload_state_step:
                continue
            # The replacement optimizer is built from a bare group (only
            # "params") before the saved state is loaded into it.
            saved_state = optimizer.state_dict()
            optimizer = flow.optim.Adam(
                [{"params": [weight]}],
                do_bias_correction=do_bias_correction,
            )
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as scratch_dir:
                    flow.save(saved_state, scratch_dir)
                    saved_state = flow.load(scratch_dir)
            optimizer.load_state_dict(saved_state)
        return weight
예제 #26
0
def convert_pt_checkpoint_to_of(
    model,
    pt_checkpoint_path="gpt2-pytorch_model.bin",
    of_checkpoint_path="gpt2_oneflow_model",
):
    """Load a PyTorch checkpoint into ``model`` and save it with OneFlow.

    Args:
        model: Target module; receives the converted weights through
            ``load_state_dict(..., strict=False)``.
        pt_checkpoint_path: Path handed to ``torch.load``.
        of_checkpoint_path: Destination handed to ``flow.save``.
    """
    import torch

    checkpoint = torch.load(pt_checkpoint_path)

    # Per-layer attention entries (layers 0-11) that are left out of the
    # converted state dict.
    skipped_keys = [
        "transformer.h.0.attn.bias",
        "transformer.h.0.attn.masked_bias",
        "transformer.h.1.attn.bias",
        "transformer.h.1.attn.masked_bias",
        "transformer.h.2.attn.bias",
        "transformer.h.2.attn.masked_bias",
        "transformer.h.3.attn.bias",
        "transformer.h.3.attn.masked_bias",
        "transformer.h.4.attn.bias",
        "transformer.h.4.attn.masked_bias",
        "transformer.h.5.attn.bias",
        "transformer.h.5.attn.masked_bias",
        "transformer.h.6.attn.bias",
        "transformer.h.6.attn.masked_bias",
        "transformer.h.7.attn.bias",
        "transformer.h.7.attn.masked_bias",
        "transformer.h.8.attn.bias",
        "transformer.h.8.attn.masked_bias",
        "transformer.h.9.attn.bias",
        "transformer.h.9.attn.masked_bias",
        "transformer.h.10.attn.bias",
        "transformer.h.10.attn.masked_bias",
        "transformer.h.11.attn.bias",
        "transformer.h.11.attn.masked_bias",
    ]

    # Everything else (except "num_batches_tracked" entries) is converted
    # to a NumPy array.
    converted = {
        key: tensor.detach().cpu().numpy()
        for key, tensor in checkpoint.items()
        if key not in skipped_keys and "num_batches_tracked" not in key
    }
    model.load_state_dict(converted, strict=False)
    # model.tie_embeddings()
    flow.save(model.state_dict(), of_checkpoint_path)
예제 #27
0
    def save_pretrained(self, save_directory: str):
        """Save the model's state dict into an existing directory.

        Arguments:
            save_directory: existing directory the state dict is saved to.

        Raises:
            AssertionError: if ``save_directory`` is not a directory.
        """
        assert os.path.isdir(
            save_directory
        ), "Saving path should be a directory where the model can be saved"

        # When this object wraps the real model in a `module` attribute
        # (the distributed-training case), save the wrapped model instead.
        model_to_save = getattr(self, "module", self)

        flow.save(model_to_save.state_dict(), save_directory)

        logger.info("Model weights saved in {}".format(save_directory))
예제 #28
0
    def test_save_and_load(self):
        """Save a compiled graph and compare the reloaded job's ops with the original."""
        placement_arg = {
            "placement": flow.placement("cuda", ranks=[0]),
            "sbp": flow.sbp.broadcast,
        }
        graph = InferGraph(placement_arg)
        image_placeholder = flow.empty(
            (1, 3, 224, 224),
            dtype=flow.float32,
            placement=flow.placement("cpu", ranks=[0]),
            sbp=flow.sbp.broadcast,
        )
        graph._compile(image_placeholder)

        saved_path = os.path.join("saved_model", graph.name)
        if not os.path.exists(saved_path):
            os.makedirs(saved_path)
        flow.save(graph, saved_path)

        # Read the job back from the "model.mlir" file in the save directory.
        saved_ir_path = os.path.join(saved_path, "model.mlir")
        serialized_job = oneflow._oneflow_internal.nn.graph.LoadSerializedJobFromIR(
            saved_ir_path)
        job = job_pb.Job()
        job.ParseFromString(serialized_job)

        # Pair the ops by name so the comparison ignores their order.
        loaded_ops = sorted(job.net.op, key=lambda op: op.name)
        original_ops = sorted(graph._forward_job_proto.net.op,
                              key=lambda op: op.name)

        for loaded_op, original_op in zip(loaded_ops, original_ops):
            # TODO: convert loc in MLIR
            original_op.ClearField("loc")
            self.assertTrue(loaded_op == original_op,
                            {"op": loaded_op, "op_": original_op})
예제 #29
0
    def train(self, n_iterations):
        """Run ``n_iterations`` training steps with KL annealing and checkpointing.

        Args:
            n_iterations: Number of steps to run.
        """
        start = time.time()
        for iteration in range(n_iterations):
            # Scale the KL weight linearly up to its configured value over
            # the first `annealing_iters` iterations.
            lambda_kl = self.config["lambda"]["lambda_kl"]
            if not (iteration >= self.config["annealing_iters"]):
                lambda_kl = (lambda_kl * (iteration + 1) /
                             self.config["annealing_iters"])

            data = next(self.train_iter)
            meta = self.ae_step(data, lambda_kl)

            if iteration % self.args.summary_steps == 0:
                print(
                    "Iter {0} | loss_kl {1:.3f} | "
                    "loss_rec {2:.3f} | loss {3:.3f}".format(
                        iteration,
                        meta["loss_kl"],
                        meta["loss_rec"],
                        meta["loss"],
                    ),
                    flush=True,
                )

            completed = iteration + 1
            is_save_step = completed % self.args.save_steps == 0
            if is_save_step or completed == n_iterations:
                file_path = os.path.join(
                    self.args.store_model_path,
                    "iteration%d.pth.tar" % completed)
                flow.save(self.model.state_dict(), file_path)
                print("Saving checkpoint model to %s" % file_path)

                # Prune entries whose first number in the name is at least
                # 24999 iterations behind the one just written.
                for entry in os.listdir(self.args.store_model_path):
                    entry_path = os.path.join(self.args.store_model_path,
                                              entry)
                    digits = re.findall(r"\d+", entry_path.split("/")[-1])
                    # Entries without a number get a large sentinel so the
                    # age check below never removes them.
                    saved_at = int(digits[0]) if digits else 100000000
                    if completed - saved_at >= 24999:
                        shutil.rmtree(entry_path)
        print("Train Time {0:.2f}s".format(time.time() - start))
예제 #30
0
    def test_warmup_scheduler_save_and_load(test_case):
        """Round-trip a WarmUpLR scheduler's state through save/load.

        A warm-up scheduler wrapping a cosine scheduler is stepped a random
        number of times and its state dict is saved and reloaded. The state
        is then loaded into a second, differently configured scheduler pair
        and the relevant attributes of both pairs are compared.
        """
        param = flow.nn.Parameter(flow.ones(3, 4))

        optimizer = flow.optim.SGD([param])
        cosine_scheduler = flow.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, 100)
        lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
            cosine_scheduler,
            warmup_factor=0.1,
            warmup_iters=5,
            warmup_method="linear",
        )
        for _ in range(random.randint(1, 10)):
            lr_scheduler.step()
        # save
        with tempfile.TemporaryDirectory() as save_dir:
            flow.save(lr_scheduler.state_dict(), save_dir)
            state_dict = flow.load(save_dir)

        # load
        # The second pair gets its own parameter and optimizer; previously
        # it was built on `param`/`optimizer`, so it shared the first
        # optimizer and `param2`/`optimizer2` were never used.
        param2 = flow.nn.Parameter(flow.ones(3, 4))
        optimizer2 = flow.optim.SGD([param2])
        cosine_scheduler2 = flow.optim.lr_scheduler.CosineAnnealingLR(
            optimizer2, 50)
        # Deliberately different settings, so equality below can only come
        # from load_state_dict.
        lr_scheduler2 = flow.optim.lr_scheduler.WarmUpLR(
            cosine_scheduler2,
            warmup_factor=0.5,
            warmup_iters=10,
            warmup_method="linear",
        )
        lr_scheduler2.load_state_dict(state_dict)

        # compare warm up scheduler
        for attr in [
                "warmup_iters", "warmup_factor", "warmup_method", "last_step"
        ]:
            test_case.assertEqual(getattr(lr_scheduler, attr),
                                  getattr(lr_scheduler2, attr))
        # compare cosine_annealing_lr
        for attr in ["T_max", "eta_min", "last_step"]:
            test_case.assertEqual(getattr(cosine_scheduler, attr),
                                  getattr(cosine_scheduler2, attr))