Example 1
    def test_torchmodel_constructor(self):
        class TwoInputModel(nn.Module):
            def __init__(self):
                super(TwoInputModel, self).__init__()
                self.dense1 = nn.Linear(2, 2)
                self.dense2 = nn.Linear(3, 1)

            def forward(self, x1, x2):
                x1 = self.dense1(x1)
                x2 = self.dense2(x2)
                return x1, x2

        TorchModel.from_pytorch(TwoInputModel())
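
A minimal usage sketch, not part of the original test. It assumes that forward on a multi-input TorchModel accepts a list of NumPy arrays (one per forward() argument), mirroring the list-based convention used elsewhere in this API; treat that as an assumption rather than documented behavior.

    import numpy as np

    model = TorchModel.from_pytorch(TwoInputModel())
    # One ndarray per forward() argument; shapes match the Linear layers above.
    out = model.forward([np.random.rand(4, 2).astype("float32"),
                         np.random.rand(4, 3).astype("float32")])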
Example 2
 def __init__(self,
              model,
              loss,
              optimizer,
              metrics=None,
              model_dir=None,
              bigdl_type="float"):
     from zoo.pipeline.api.torch import TorchModel, TorchLoss, TorchOptim
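     # Assumed module-level imports, not shown in this snippet:
     #   from torch.optim.optimizer import Optimizer as TorchOptimizer
     #   from zoo.orca.learn.optimizers import Optimizer as OrcaOptimizer, SGD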
     self.loss = loss
     if self.loss is None:
         self.loss = TorchLoss()
     else:
         self.loss = TorchLoss.from_pytorch(loss)
     if optimizer is None:
         from zoo.orca.learn.optimizers.schedule import Default
         optimizer = SGD(learningrate_schedule=Default())
     if isinstance(optimizer, TorchOptimizer):
         optimizer = TorchOptim.from_pytorch(optimizer)
     elif isinstance(optimizer, OrcaOptimizer):
         optimizer = optimizer.get_optimizer()
     else:
         raise ValueError(
             "Only PyTorch optimizer and orca optimizer are supported")
     from zoo.orca.learn.metrics import Metric
     self.metrics = Metric.convert_metrics_list(metrics)
     self.log_dir = None
     self.app_name = None
     self.model_dir = model_dir
     self.model = TorchModel.from_pytorch(model)
     self.estimator = SparkEstimator(self.model,
                                     optimizer,
                                     model_dir,
                                     bigdl_type=bigdl_type)
Example 3
    def test_train_model_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        loss_fn = torch.nn.BCELoss()
        az_model = TorchModel.from_pytorch(torch_model)
        zoo_loss = TorchLoss.from_pytorch(loss_fn)
        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2],
                               [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0],
                                [1], [1], [1]])
        train_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        train_featureset = FeatureSet.pytorch_dataloader(train_loader)
        val_loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
        val_featureset = FeatureSet.pytorch_dataloader(val_loader)

        zooOptimizer = Adam()
        estimator = Estimator(az_model, optim_methods=zooOptimizer)
        estimator.train_minibatch(train_featureset, zoo_loss, end_trigger=MaxEpoch(4),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=val_featureset,
                                  validation_method=[Accuracy()])

        trained_model = az_model.to_pytorch()
Example 4
    def load(self, model_path):
        """
        Load the Estimator state (model and, if present, optimizer) from the provided model_path.
        The model file should be generated by the save method of this estimator, or by
        ``torch.save(state_dict, model_path)``, where `state_dict` can be obtained from
        the ``state_dict()`` method of a PyTorch model.

        :param model_path: path to the saved model.
        :return:
        """

        from zoo.pipeline.api.torch import TorchModel
        import os

        try:
            pytorch_model = self.get_model()
            pytorch_model.load_state_dict(torch.load(model_path))
            self.model = TorchModel.from_pytorch(pytorch_model)
        except Exception as e:
            raise ValueError(
                "Cannot load the PyTorch model. Please check your model path.") from e

        optim_path = self._get_optimizer_path(model_path)
        if os.path.isfile(optim_path):
            try:
                self.optimizer = OptimMethod.load(optim_path)
            except Exception as e:
                raise ValueError(
                    "Cannot load the optimizer. Only `bigdl.optim.optimizer."
                    "OptimMethod` is supported for loading.") from e
        else:
            self.optimizer = None

        self.estimator = SparkEstimator(self.model, self.optimizer,
                                        self.model_dir)
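
A hypothetical usage sketch of load(); the estimator construction, model class, and path below are illustrative assumptions, not part of the original snippet:

    est = Estimator(model=MyNet(), loss=torch.nn.BCELoss(), optimizer=None)  # as in Example 2
    # Restores weights saved either by est.save(...) or by
    # torch.save(MyNet().state_dict(), "/tmp/my_model.pt"):
    est.load("/tmp/my_model.pt")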
Example 5
    def load_orca_checkpoint(self, path, version=None, prefix=None):
        """
        Load an existing checkpoint. To load a specific checkpoint, please provide both `version`
        and `prefix`. If `version` is None, then the latest checkpoint will be loaded.

        :param path: Path to the existing checkpoint (or directory containing Orca checkpoint
               files).
        :param version: checkpoint version, which is the suffix of the model.* file, e.g., for
               the model.4 file, the version is 4. If it is None, then load the latest checkpoint.
        :param prefix: optimMethod prefix, for example 'optimMethod-TorchModelf53bddcc'.
        :return:
        """
        import os
        from bigdl.nn.layer import Model
        from bigdl.optim.optimizer import OptimMethod
        from zoo.orca.learn.utils import find_latest_checkpoint
        from zoo.pipeline.api.torch import TorchModel

        if version is None:
            path, prefix, version = find_latest_checkpoint(path, model_type="pytorch")
            if path is None:
                raise ValueError("Cannot find PyTorch checkpoint, please check your checkpoint"
                                 " path.")
        else:
            assert prefix is not None, "You should provide optimMethod prefix, " \
                                       "for example 'optimMethod-TorchModelf53bddcc'"

        try:
            loaded_model = Model.load(os.path.join(path, "model.{}".format(version)))
            self.model = TorchModel.from_value(loaded_model.value)
            self.optimizer = OptimMethod.load(os.path.join(path, "{}.{}".format(prefix, version)))
        except Exception as e:
            raise ValueError("Cannot load PyTorch checkpoint, please check your checkpoint path "
                             "and checkpoint type.") from e
        self.estimator = SparkEstimator(self.model, self.optimizer, self.model_dir)
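
A hypothetical call sketch; the checkpoint directory below is a placeholder, and the prefix value is taken from the docstring above:

    # Load the latest checkpoint found under the directory:
    est.load_orca_checkpoint("/tmp/orca_ckpt")
    # Or load a specific version, in which case the optimMethod prefix is required:
    est.load_orca_checkpoint("/tmp/orca_ckpt", version=4,
                             prefix="optimMethod-TorchModelf53bddcc")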
Example 6
    def test_torch_net_predict_resnet(self):
        torch.random.manual_seed(1)
        pytorch_model = torchvision.models.resnet18(pretrained=False).eval()
        zoo_model = TorchModel.from_pytorch(pytorch_model)
        zoo_model.evaluate()

        dummy_input = torch.ones(1, 3, 224, 224)
        pytorch_result = pytorch_model(dummy_input).data.numpy()
        zoo_result = zoo_model.forward(dummy_input.numpy())
        print(pytorch_result)
        print(zoo_result)
        assert np.allclose(pytorch_result, zoo_result, rtol=1.e-6, atol=1.e-6)
Example 7
 def __init__(self, model, loss, optimizer, model_dir=None, bigdl_type="float"):
     from zoo.pipeline.api.torch import TorchModel, TorchLoss
     self.loss = loss
     if self.loss is None:
         self.loss = TorchLoss()
     else:
         self.loss = TorchLoss.from_pytorch(loss)
     if optimizer is None:
         from bigdl.optim.optimizer import SGD
         optimizer = SGD()
     self.model = TorchModel.from_pytorch(model)
     self.estimator = SparkEstimator(self.model, optimizer, model_dir, bigdl_type=bigdl_type)
Example 8
    def test_model_save_and_load(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        with tempfile.TemporaryDirectory() as tmp_dir_name:
            path = tmp_dir_name + "/model.obj"
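            # the second positional argument of save is over_write (overwrite an existing file)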
            az_model.save(path, True)
            loaded_model = Model.load(path)
            loaded_torchModel = TorchModel.from_value(loaded_model.value)
            dummy_input = torch.ones(16, 2)
            loaded_torchModel.forward(dummy_input.numpy())
            loaded_torchModel.to_pytorch()
Example 9
    def test_train_model_function_with_bn(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        weights = az_model.get_weights()
        weights[0][0] = 1.0
        az_model.set_weights(weights)

        exported_model = az_model.to_pytorch()
Example 10
    def test_model_with_bn_to_pytorch(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = self.bn1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)
        dummy_input = torch.ones(16, 2)
        zoo_result = az_model.forward(dummy_input.numpy())

        exported_model = az_model.to_pytorch()
        assert len(list(exported_model.named_buffers())) != 0
Example 11
    def test_model_to_pytorch(self):
        class SimpleTorchModel(nn.Module):
            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                x = self.dense1(x)
                x = torch.sigmoid(self.dense2(x))
                return x

        torch_model = SimpleTorchModel()
        az_model = TorchModel.from_pytorch(torch_model)

        weights = az_model.get_weights()
        weights[0][0] = 1.0
        az_model.set_weights(weights)

        exported_model = az_model.to_pytorch()
        p = list(exported_model.parameters())
        assert p[0][0][0] == 1.0
Example 12
def get_featureset(x, y, shuffle=True):
    # NOTE: only the return statement of this helper survived in the original
    # snippet; the signature and the Sample conversion below are reconstructed
    # assumptions (Sample.from_ndarray is from bigdl.util.common).
    sample_rdd = sc.parallelize(
        [Sample.from_ndarray(f.numpy(), l.numpy()) for f, l in zip(x, y)])
    return FeatureSet.sample_rdd(sample_rdd, shuffle=shuffle)


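# 'folds' is assumed to be a scikit-learn KFold (or similar) instance defined earlier in the script.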
for fold_, (trn_, val_) in enumerate(folds.split(train_X)):
    print("fold {}".format(fold_ + 1))

    x_train = Variable(torch.Tensor(train_X[trn_.astype(int)]))
    y_train = Variable(torch.Tensor(train_Y[trn_.astype(int), np.newaxis]))

    x_valid = Variable(torch.Tensor(train_X[val_.astype(int)]))
    y_valid = Variable(torch.Tensor(train_Y[val_.astype(int), np.newaxis]))

    model = MLP(x_train.shape[1], 512, classes, dropout=0.4)
    loss_fn = torch.nn.BCELoss()
    zooOptimizer = Adam(learningrate=learning_rate)
    zooModel = TorchModel.from_pytorch(model)
    zooLoss = TorchLoss.from_pytorch(loss_fn)

    train_featureSet = get_featureset(x_train, y_train, shuffle=True)
    val_featureSet = get_featureset(x_valid, y_valid, shuffle=False)

    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    estimator.train(train_featureSet,
                    zooLoss,
                    end_trigger=MaxEpoch(epochs),
                    checkpoint_trigger=EveryEpoch(),
                    validation_set=val_featureSet,
                    validation_method=[Accuracy()],
                    batch_size=batch_size)

Example 13
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder to store the MNIST data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=256,
        metavar='N',
        help='input batch size for training per executor (default: 256)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='for saving the current model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir,
                       train=True,
                       download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size,
        shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir,
                       train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size,
        shuffle=False)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executor=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                spark_conf={
                                    "spark.rpc.message.maxSize":
                                    "1024",
                                    "spark.task.maxFailures":
                                    "1",
                                    "spark.driver.extraJavaOptions":
                                    "-Dbigdl.failure.retryTimes=1"
                                })

    model = Net()
    model.train()
    criterion = nn.NLLLoss()

    adam = Adam(args.lr)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=adam)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
Example 14
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('-a',
                        '--arch',
                        metavar='ARCH',
                        default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                        ' (default: resnet18)')
    parser.add_argument('--epochs',
                        default=90,
                        type=int,
                        metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch',
                        default=0,
                        type=int,
                        metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=256,
        type=int,
        metavar='N',
        help='mini-batch size (default: 256), this is the total '
        'batch size of all GPUs on the current node when '
        'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr',
                        '--learning-rate',
                        default=0.1,
                        type=float,
                        metavar='LR',
                        help='initial learning rate',
                        dest='lr')
    parser.add_argument('--momentum',
                        default=0.9,
                        type=float,
                        metavar='M',
                        help='momentum')
    parser.add_argument('--wd',
                        '--weight-decay',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p',
                        '--print-freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e',
                        '--evaluate',
                        dest='evaluate',
                        action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained',
                        dest='pretrained',
                        action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size',
                        default=-1,
                        type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank',
                        default=-1,
                        type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='seed for initializing training.')
    parser.add_argument('--cores',
                        default=4,
                        type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes',
                        default=1,
                        type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory',
                        default='20g',
                        type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory',
                        default='20g',
                        type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores',
                        default=1,
                        type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()
    # sc = init_nncontext()
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores,
                                 conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            # conda_name: the name of the created conda-env
            conda_name=detect_python_location().split("/")[-3],
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={
                "spark.rpc.message.maxSize": "1024",
                "spark.task.maxFailures": "1",
                "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
            })

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    model = torchvision.models.resnet50()
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir,
                             transforms.Compose([
                                 transforms.Resize(256),
                                 transforms.CenterCrop(224),
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
        batch_size=args.batch_size,
        shuffle=False)

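    # 1281167 is the number of images in the ImageNet-1k training set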
    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)
    zooOptimizer = SGD(args.lr,
                       momentum=args.momentum,
                       dampening=0.0,
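                       # 'leaningrate_schedule' (sic) is the parameter's actual spelling in BigDL's SGD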
                       leaningrate_schedule=step,
                       weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet,
                              zooCriterion,
                              end_trigger=MaxEpoch(90),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=test_featureSet,
                              validation_method=[Accuracy(),
                                                 Top5Accuracy()])
Example 15
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example')
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='N',
                        help='the folder to store the CIFAR10 data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=128,
        metavar='N',
        help='input batch size for training per executor (default: 128)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=135,
                        metavar='N',
                        help='number of epochs to train (default: 135)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--lrd',
                        type=float,
                        default=0.0,
                        metavar='LRD',
                        help='learning rate decay (default: 0.0)')
    parser.add_argument('--wd',
                        type=float,
                        default=5e-4,
                        metavar='WD',
                        help='weight decay (default: 5e-4)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        metavar='momentum',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dampening',
                        type=float,
                        default=0.0,
                        metavar='dampening',
                        help='dampening (default: 0.0)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='for saving the current model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Prepare and preprocess the data
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),  # zero-pad 4 pixels on each side, then randomly crop back to 32x32
        transforms.RandomHorizontalFlip(),  # flip the image horizontally with probability 0.5
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),  # per-channel (R, G, B) mean and std used for normalization
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    train_set = datasets.CIFAR10(args.dir,
                                 train=True,
                                 download=True,
                                 transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=2)

    test_set = datasets.CIFAR10(args.dir,
                                train=False,
                                transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              num_workers=2)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME are provided.
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})
    else:
        num_executors = 2
        num_cores_per_executor = 4
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        zoo_conda_name = os.environ.get(
            'ZOO_CONDA_NAME')  # The name of the created conda-env
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executor=num_executors,
                                executor_cores=num_cores_per_executor,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                spark_conf={
                                    "spark.rpc.message.maxSize":
                                    "1024",
                                    "spark.task.maxFailures":
                                    "1",
                                    "spark.driver.extraJavaOptions":
                                    "-Dbigdl.failure.retryTimes=1"
                                })

    model = ResNet18()
    model.train()
    criterion = nn.CrossEntropyLoss()

    optimizer = SGD(learningrate=args.lr,
                    learningrate_decay=args.lrd,
                    weightdecay=args.wd,
                    momentum=args.momentum,
                    dampening=args.dampening)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=optimizer)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])
Example 16
                                })  # tail of an init_spark_on_yarn(...) call; the beginning of this snippet was truncated

    iter_per_epoch = len(cifar100_training_loader)
    warmup_delta = args.lr / (iter_per_epoch * args.warm)
    # iteration_per_epoch = int(math.ceil(float(len(cifar100_training_loader)) / args.b))
    zoo_lrSchedule = SequentialSchedule(iter_per_epoch)
    zoo_lrSchedule.add(Warmup(warmup_delta), iter_per_epoch * args.warm)
    zoo_lrSchedule.add(
        MultiStep(
            [iter_per_epoch * 60, iter_per_epoch * 120, iter_per_epoch * 160],
            0.2), iter_per_epoch * 200)
    zoo_optim = SGD(learningrate=0.0,
                    learningrate_decay=0.0,
                    weightdecay=5e-4,
                    momentum=0.9,
                    dampening=0.0,
                    nesterov=False,
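                    # 'leaningrate_schedule' (sic), as spelled in BigDL's API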
                    leaningrate_schedule=zoo_lrSchedule)

    zoo_model = TorchModel.from_pytorch(net)
    zoo_loss = TorchLoss.from_pytorch(loss_function)
    zoo_estimator = Estimator(zoo_model, optim_methods=zoo_optim)
    train_featureset = FeatureSet.pytorch_dataloader(cifar100_training_loader)
    test_featureset = FeatureSet.pytorch_dataloader(cifar100_test_loader)
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_loss,
                                  end_trigger=MaxEpoch(settings.EPOCH),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])