def test_bigdl_pytorch_estimator_dataloader_creator(self):
        """Fit and evaluate a small BatchNorm model fed by a DataLoader creator.

        The estimator is built with ``backend="bigdl"`` and receives a
        zero-argument callable (not a DataLoader instance) for training,
        validation and evaluation data. The test finally checks that
        ``get_model()`` hands back an ``nn.Module``.
        """
        class SimpleModel(nn.Module):
            """Linear(2, 4) -> BatchNorm1d(4) -> Linear(4, 1) -> sigmoid."""

            def __init__(self):
                super(SimpleModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                normalized = self.bn1(self.dense1(x))
                return torch.sigmoid(self.dense2(normalized))

        def get_dataloader():
            """Return a new DataLoader over six labelled points, two per batch."""
            dataset = TensorDataset(
                torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]]),
                torch.Tensor([[0], [0], [0], [1], [1], [1]]),
            )
            return torch.utils.data.DataLoader(dataset, batch_size=2)

        estimator = Estimator.from_torch(
            model=SimpleModel(),
            loss=nn.BCELoss(),
            optimizer=Adam(),
            backend="bigdl",
        )

        estimator.fit(
            data=get_dataloader,
            epochs=2,
            validation_data=get_dataloader,
            validation_methods=[Accuracy()],
            checkpoint_trigger=EveryEpoch(),
        )
        estimator.evaluate(data=get_dataloader, validation_methods=[Accuracy()])

        trained = estimator.get_model()
        assert isinstance(trained, nn.Module)
示例#2
0
    def test_tf_optimizer_metrics(self):
        """Train with one optimizer per variable scope and check what moved.

        Variables under "dense/" are paired with ``Adam(1e-3)`` and those
        under "dense_1/" with ``SGD(0.0)``. After one epoch the first two
        weight arrays must differ from their initial values, while the last
        two must be unchanged.
        """
        x_data = np.random.randn(20, 10)
        y_data = np.random.randint(0, 10, size=[20])

        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays(
                (x_data, y_data),
                batch_size=4,
                val_tensors=(x_data, y_data),
            )
            feature_tensor, label_tensor = dataset.tensors

            # Layer creation order matters: the first dense layer is scoped
            # "dense", the second "dense_1".
            hidden = tf.layers.dense(feature_tensor, 8)
            logits = tf.layers.dense(hidden, 10)
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                logits=logits, labels=label_tensor)
            loss = tf.reduce_mean(cross_entropy)

            optim_by_scope = {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)}
            optimizer = TFOptimizer.from_loss(
                loss,
                optim_by_scope,
                val_outputs=[logits],
                val_labels=[label_tensor],
                val_method=Accuracy(),
                metrics={"loss": loss},
            )

            before = optimizer.tf_model.training_helper_layer.get_weights()
            optimizer.optimize(end_trigger=MaxEpoch(1))
            after = optimizer.tf_model.training_helper_layer.get_weights()

            # Weights and bias under "dense/" should be updated.
            for idx in (0, 1):
                assert not np.allclose(before[idx], after[idx])
            # Weights and bias under "dense_1" should be unchanged.
            for idx in (2, 3):
                assert np.allclose(before[idx], after[idx])

            optimizer.sess.close()
示例#3
0
    def test_train_model_with_bn(self):
        """Train a BatchNorm model through ``Estimator.train_minibatch``.

        The PyTorch model and loss are wrapped with ``TorchModel`` /
        ``TorchLoss``, fed from DataLoader-backed feature sets for four
        epochs, and finally converted back with ``to_pytorch()``.
        """
        class SimpleTorchModel(nn.Module):
            """Linear(2, 4) -> BatchNorm1d(4) -> Linear(4, 1) -> sigmoid."""

            def __init__(self):
                super(SimpleTorchModel, self).__init__()
                self.dense1 = nn.Linear(2, 4)
                self.bn1 = torch.nn.BatchNorm1d(4)
                self.dense2 = nn.Linear(4, 1)

            def forward(self, x):
                normalized = self.bn1(self.dense1(x))
                return torch.sigmoid(self.dense2(normalized))

        az_model = TorchModel.from_pytorch(SimpleTorchModel())
        zoo_loss = TorchLoss.from_pytorch(torch.nn.BCELoss())

        inputs = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
        targets = torch.Tensor([[0], [0], [0], [1], [1], [1]])

        def make_featureset():
            """Wrap the six samples in a batch-size-2 DataLoader feature set."""
            loader = DataLoader(TensorDataset(inputs, targets), batch_size=2)
            return FeatureSet.pytorch_dataloader(loader)

        train_featureset = make_featureset()
        val_featureset = make_featureset()

        estimator = Estimator(az_model, optim_methods=Adam())
        estimator.train_minibatch(
            train_featureset,
            zoo_loss,
            end_trigger=MaxEpoch(4),
            checkpoint_trigger=EveryEpoch(),
            validation_set=val_featureset,
            validation_method=[Accuracy()],
        )

        # Only checks that the conversion back to PyTorch does not raise.
        trained_model = az_model.to_pytorch()
示例#4
0
    def test_control_inputs(self):
        """Run one epoch on a graph whose dropout is driven by a placeholder.

        The boolean ``is_training`` placeholder is passed to
        ``TFOptimizer.from_loss`` through ``tensor_with_value`` with the pair
        ``(True, False)``, presumably the training and validation values
        (confirm against the TFOptimizer docs).
        """
        x_data = np.random.randn(20, 10)
        y_data = np.random.randint(0, 10, size=[20])

        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays(
                (x_data, y_data),
                batch_size=4,
                val_tensors=(x_data, y_data),
            )
            is_training = tf.placeholder(dtype=tf.bool, shape=())
            feature_tensor, label_tensor = dataset.tensors

            hidden = tf.layers.dense(feature_tensor, 8)
            dropped = tf.layers.dropout(hidden, training=is_training)
            logits = tf.layers.dense(dropped, 10)
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                logits=logits, labels=label_tensor)
            loss = tf.reduce_mean(cross_entropy)

            optimizer = TFOptimizer.from_loss(
                loss,
                Adam(),
                val_outputs=[logits],
                val_labels=[label_tensor],
                val_method=Accuracy(),
                tensor_with_value={is_training: (True, False)},
                metrics={"loss": loss},
            )
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
    def test_bigdl_pytorch_estimator_shard(self):
        """Fit and evaluate a linear model on data shards read from ncf.csv.

        The CSV is read with the pandas backend, each shard is reshaped into
        an ``{"x": [...], "y": ...}`` dict, and the result is used for
        training, validation and evaluation on the "bigdl" backend.
        """
        class SimpleModel(nn.Module):
            """Linear(2, 2) followed by log-softmax over dim 1."""

            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                return F.log_softmax(self.fc(x), dim=1)

        def loss_func(input, target):
            """Cross-entropy against the flattened, integer-cast target."""
            flat_target = target.flatten().long()
            return nn.CrossEntropyLoss().forward(input, flat_target)

        def transform(df):
            """Map a DataFrame to features (user, item) and labels."""
            return {
                "x": [df['user'].to_numpy(), df['item'].to_numpy()],
                "y": df['label'].to_numpy(),
            }

        model = SimpleModel()

        OrcaContext.pandas_read_backend = "pandas"
        csv_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(csv_path).transform_shard(transform)

        estimator = Estimator.from_torch(
            model=model,
            loss=loss_func,
            optimizer=SGD(),
            backend="bigdl",
        )
        estimator.fit(
            data=data_shard,
            epochs=4,
            batch_size=2,
            validation_data=data_shard,
            validation_methods=[Accuracy()],
            checkpoint_trigger=EveryEpoch(),
        )
        estimator.evaluate(data_shard, validation_methods=[Accuracy()], batch_size=2)
示例#6
0
    def test_checkpoint(self):
        """Train for one epoch, reload the saved checkpoint, and train again.

        A temporary ``model_dir`` is handed to ``TFOptimizer.from_loss``.
        After the first epoch the directory tree is searched for files named
        ``optimMethod-TFParkTraining.<version>``; the first directory that
        contains any is used as the checkpoint path and its highest version
        is loaded before a second epoch is run.

        The session is closed and the temporary directory removed even when
        training or an assertion fails.
        """
        import re
        import shutil

        # Raw string so that "\." reaches the regex engine unchanged; the
        # captured group is the checkpoint version number.
        ckpt_pattern = re.compile(r"^optimMethod-TFParkTraining\.([0-9]+)$")

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            feature_tensor, label_tensor = dataset.tensors
            hidden = tf.layers.dense(feature_tensor, 8)
            output = tf.layers.dense(hidden, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
            model_dir = tempfile.mkdtemp()
            optimizer = None
            try:
                optimizer = TFOptimizer.from_loss(loss, Adam(),
                                                  val_outputs=[output],
                                                  val_labels=[label_tensor],
                                                  val_method=Accuracy(),
                                                  metrics={"loss": loss},
                                                  model_dir=model_dir)
                optimizer.optimize(end_trigger=MaxEpoch(1))

                ckpt_path = None
                versions = []
                for root, _dirs, files in os.walk(model_dir, topdown=True):
                    matches = (ckpt_pattern.match(name) for name in files)
                    found = [int(m.group(1)) for m in matches if m is not None]
                    if found:
                        # Stop at the first directory holding checkpoints.
                        ckpt_path = root
                        versions = found
                        break

                assert ckpt_path is not None, "Cannot find checkpoint file"

                optimizer.load_checkpoint(ckpt_path, max(versions))
                optimizer.optimize(end_trigger=MaxEpoch(1))
            finally:
                try:
                    # optimizer stays None if from_loss itself raised.
                    if optimizer is not None:
                        optimizer.sess.close()
                finally:
                    shutil.rmtree(model_dir)
示例#7
0
    def test_tf_optimizer_metrics(self):
        """Run one training epoch with validation and an extra "loss" metric.

        A single dense layer is trained with ``Adam(1e-3)``; the test passes
        if optimization completes without raising.
        """
        x_data = np.random.randn(20, 10)
        y_data = np.random.randint(0, 10, size=[20])

        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays(
                (x_data, y_data),
                batch_size=4,
                val_tensors=(x_data, y_data),
            )
            feature_tensor, label_tensor = dataset.tensors
            logits = tf.layers.dense(feature_tensor, 10)
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                logits=logits, labels=label_tensor)
            loss = tf.reduce_mean(cross_entropy)

            optimizer = TFOptimizer.from_loss(
                loss,
                Adam(1e-3),
                val_outputs=[logits],
                val_labels=[label_tensor],
                val_method=Accuracy(),
                metrics={"loss": loss},
            )
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
        .withColumn("label", getLabel(col('name'))).cache()
    (trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])

    # run training and evaluation
    featureTransformer = ChainedPreprocessing(
        [RowToImageFeature(), ImageCenterCrop(224, 224),
         ImageChannelNormalize(123.0, 117.0, 104.0, 255.0, 255.0, 255.0),
         ImageMatToTensor(), ImageFeatureToTensor()])

    classifier = NNClassifier(torchnet, torchcriterion, featureTransformer) \
        .setLearningRate(0.001) \
        .setBatchSize(8) \
        .setMaxEpoch(2) \
        .setFeaturesCol("image") \
        .setCachingSample(False) \
        .setValidation(EveryEpoch(), validationDF, [Accuracy()], 8)

    catdogModel = classifier.fit(trainingDF)

    shift = udf(lambda p: p - 1, DoubleType())
    predictionDF = catdogModel.transform(validationDF) \
        .withColumn("prediction", shift(col('prediction'))).cache()
    predictionDF.sample(False, 0.1).show()

    correct = predictionDF.filter("label=prediction").count()
    overall = predictionDF.count()
    accuracy = correct * 1.0 / overall

    # expecting: accuracy > 96%
    print("Validation accuracy = %g " % accuracy)
示例#9
0
    model = MLP(x_train.shape[1], 512, classes, dropout=0.4)
    loss_fn = torch.nn.BCELoss()
    zooOptimizer = Adam(learningrate=learning_rate)
    zooModel = TorchModel.from_pytorch(model)
    zooLoss = TorchLoss.from_pytorch(loss_fn)

    train_featureSet = get_featureset(x_train, y_train, shuffle=True)
    val_featureSet = get_featureset(x_valid, y_valid, shuffle=False)

    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    estimator.train(train_featureSet,
                    zooLoss,
                    end_trigger=MaxEpoch(epochs),
                    checkpoint_trigger=EveryEpoch(),
                    validation_set=val_featureSet,
                    validation_method=[Accuracy()],
                    batch_size=batch_size)


# Predict
def get_rdd(x, y, shuffle=False):
    """Turn paired feature/label tensors into an RDD of ``Sample`` objects.

    Both inputs are converted to NumPy via ``.data.numpy()`` and split along
    the first axis into one piece per row; each piece is squeezed before
    being wrapped in a ``Sample``. The samples are distributed with the
    module-level ``sc``.

    ``shuffle`` is accepted but not used by this function.
    """
    feature_rows = np.split(x.data.numpy(), x.shape[0])
    label_rows = np.split(y.data.numpy(), y.shape[0])

    samples = []
    for idx, feature_row in enumerate(feature_rows):
        label_row = label_rows[idx]
        samples.append(
            Sample.from_ndarray(np.squeeze(feature_row), np.squeeze(label_row)))

    return sc.parallelize(samples)

示例#10
0
def main():
    """Train and evaluate LeNet on MNIST through the "bigdl" Estimator backend.

    Parses command-line options, builds the MNIST train/test DataLoaders,
    starts Spark (local when ``HADOOP_CONF_DIR`` is unset, otherwise on YARN
    using ``ZOO_CONDA_NAME``), then fits and evaluates the model.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder store mnist data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor(default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    def mnist_transform():
        """Tensor conversion followed by the MNIST mean/std normalization."""
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, )),
        ])

    train_set = datasets.MNIST(args.dir, train=True, download=True,
                               transform=mnist_transform())
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_set = datasets.MNIST(args.dir, train=False,
                              transform=mnist_transform())
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False)

    # init on yarn when HADOOP_CONF_DIR and ZOO_CONDA_NAME is provided.
    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
    if hadoop_conf_dir is not None:
        # The name of the created conda-env
        zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')
        yarn_conf = {
            "spark.rpc.message.maxSize": "1024",
            "spark.task.maxFailures": "1",
            "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1",
        }
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                                conda_name=zoo_conda_name,
                                num_executors=2,
                                executor_cores=4,
                                executor_memory="2g",
                                driver_memory="10g",
                                driver_cores=1,
                                conf=yarn_conf)
    else:
        sc = init_spark_on_local(cores=1, conf={"spark.driver.memory": "20g"})

    model = LeNet()
    model.train()

    zoo_estimator = Estimator.from_torch(model=model,
                                         optimizer=Adam(args.lr),
                                         loss=nn.NLLLoss(),
                                         backend="bigdl")

    from bigdl.optim.optimizer import EveryEpoch
    accuracy_methods = [Accuracy()]
    zoo_estimator.fit(data=train_loader,
                      epochs=args.epochs,
                      validation_data=test_loader,
                      validation_methods=accuracy_methods,
                      checkpoint_trigger=EveryEpoch())
    zoo_estimator.evaluate(data=test_loader, validation_methods=[Accuracy()])
示例#11
0
        model = TextClassifier(CLASS_NUM, token_length,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
                          criterion=SparseCategoricalCrossEntropy(),
                          end_trigger=MaxEpoch(int(options.nb_epoch)),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=float(
                              options.learning_rate),
                                               learningrate_decay=0.001))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Accuracy()])

    log_dir = options.log_dir
    app_name = 'adam-' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_summary = TrainSummary(log_dir=log_dir, app_name=app_name)
    train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
    val_summary = ValidationSummary(log_dir=log_dir, app_name=app_name)
    optimizer.set_train_summary(train_summary)
    optimizer.set_val_summary(val_summary)

    optimizer.optimize()

    # Predict for probability distributions
    results = model.predict(val_rdd)
    results.take(5)
    # Predict for labels
示例#12
0
def main():
    """Train ResNet-50 on an ImageNet-style folder with the zoo Estimator.

    Parses command-line options, starts Spark (local when
    ``HADOOP_CONF_DIR`` is unset, otherwise on YARN), builds the train/val
    DataLoaders from ``<data>/train`` and ``<data>/val``, and trains with SGD
    and a step learning-rate schedule, validating every epoch with top-1 and
    top-5 accuracy.

    Training runs for ``--epochs`` epochs (default 90).

    NOTE(review): the options --arch, --start-epoch, --print-freq, --resume,
    --evaluate, --pretrained, --world-size, --rank and --seed are parsed but
    never read in this function; in particular the model is always
    ``resnet50`` regardless of ``--arch``.
    """
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('-a',
                        '--arch',
                        metavar='ARCH',
                        default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                        ' (default: resnet18)')
    parser.add_argument('--epochs',
                        default=90,
                        type=int,
                        metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch',
                        default=0,
                        type=int,
                        metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=256,
        type=int,
        metavar='N',
        help='mini-batch size (default: 256), this is the total '
        'batch size of all GPUs on the current node when '
        'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr',
                        '--learning-rate',
                        default=0.1,
                        type=float,
                        metavar='LR',
                        help='initial learning rate',
                        dest='lr')
    parser.add_argument('--momentum',
                        default=0.9,
                        type=float,
                        metavar='M',
                        help='momentum')
    parser.add_argument('--wd',
                        '--weight-decay',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p',
                        '--print-freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e',
                        '--evaluate',
                        dest='evaluate',
                        action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained',
                        dest='pretrained',
                        action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size',
                        default=-1,
                        type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank',
                        default=-1,
                        type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='seed for initializing training. ')
    parser.add_argument('--cores',
                        default=4,
                        type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes',
                        default=1,
                        type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory',
                        default='20g',
                        type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory',
                        default='20g',
                        type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores',
                        default=1,
                        type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()

    # sc = init_nncontext()
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores,
                                 conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        # Both thread-count variables are set to the cores per executor.
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_python_location().split("/")
            [-3],  # The name of the created conda-env
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={
                "spark.rpc.message.maxSize": "1024",
                "spark.task.maxFailures": "1",
                "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
            })

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    model = torchvision.models.resnet50()
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False)

    # 1281167 is hard-coded as the number of training samples; presumably the
    # ImageNet-1k training set size (TODO confirm). The learning rate is
    # scaled by 0.1 every 30 epochs' worth of iterations.
    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)
    zooOptimizer = SGD(args.lr,
                       momentum=args.momentum,
                       dampening=0.0,
                       leaningrate_schedule=step,
                       weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    # Honor --epochs instead of a fixed 90 (the flag's default is still 90).
    estimator.train_minibatch(train_featureSet,
                              zooCriterion,
                              end_trigger=MaxEpoch(args.epochs),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=test_featureSet,
                              validation_method=[Accuracy(),
                                                 Top5Accuracy()])
示例#13
0
def main():
    """Train an MNIST CNN on YARN with the zoo Estimator, one epoch at a time.

    Parses command-line options, builds the MNIST DataLoaders, starts Spark
    on YARN (``HADOOP_CONF_DIR`` and ``ZOO_CONDA_NAME`` are read from the
    environment), wraps the model and loss for zoo, and for every epoch calls
    ``train`` followed by a minibatch evaluation on the test set. The model's
    ``state_dict`` is saved to ``mnist_cnn.pt`` when ``--save-model`` is set.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Extra DataLoader options are only supplied when CUDA is in use.
    if use_cuda:
        kwargs = {'num_workers': 1, 'pin_memory': True}
    else:
        kwargs = {}

    def mnist_transform():
        """Tensor conversion followed by the MNIST mean/std normalization."""
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, )),
        ])

    train_set = datasets.MNIST('../data', train=True, download=True,
                               transform=mnist_transform())
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_set = datasets.MNIST('../data', train=False,
                              transform=mnist_transform())
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net().to(device)
    # Constructed as in the upstream PyTorch example; the Estimator below is
    # given a separate optim method.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
    # The name of the created conda-env
    conda_name = os.environ["ZOO_CONDA_NAME"]
    sc = init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                            conda_name=conda_name,
                            num_executor=4,
                            executor_cores=1,
                            executor_memory="10g",
                            driver_memory="10g",
                            driver_cores=1,
                            spark_conf={"spark.rpc.message.maxSize": "1024"})

    model.train()
    optim_method = Adam()
    zooModel = TorchNet.from_pytorch(model, [64, 1, 28, 28])

    def lossFunc(input, target):
        """Negative log-likelihood against the flattened, integer-cast target."""
        flat_target = target.flatten().long()
        return nn.NLLLoss().forward(input, flat_target)

    zooCriterion = TorchCriterion.from_pytorch(lossFunc, [1, 2],
                                               torch.LongTensor([1]))
    # zooCriterion = SparseCategoricalCrossEntropy(zero_based_label=True)
    estimator = Estimator(zooModel, optim_methods=optim_method)

    # Collect every test batch as single-element lists of NumPy arrays.
    v_input = []
    v_target = []
    for batch_data, batch_target in test_loader:
        v_input.append([batch_data.numpy()])
        v_target.append([batch_target.numpy()])

    test_featureset = FeatureSet.minibatch(v_input, v_target)

    for epoch in range(1, args.epochs + 1):
        train(args, estimator, zooCriterion, train_loader, epoch)
        # test(args, estimator, zooCriterion, test_featureset)
        validation_methods = [Loss(zooCriterion), Accuracy()]
        estimator.evaluate_minibatch(test_featureset, validation_methods)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
示例#14
0
文件: main.py 项目: GavinGu07/cifar10
def _parse_cifar10_args():
    """Parse the command-line options of the CIFAR10 training example.

    Returns:
        argparse.Namespace carrying ``dir``, ``batch_size``,
        ``test_batch_size``, ``epochs``, ``lr``, ``lrd``, ``wd``,
        ``momentum``, ``dampening``, ``seed`` and ``save_model``.
    """
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Example')
    # metavar is 'DIR' (not the numeric placeholder 'N') because this option
    # takes a filesystem path.
    parser.add_argument('--dir',
                        default='/tmp/data',
                        metavar='DIR',
                        help='the folder store cifar10 data')
    parser.add_argument(
        '--batch-size',
        type=int,
        default=128,
        metavar='N',
        help='input batch size for training per executor(default: 128)')
    parser.add_argument(
        '--test-batch-size',
        type=int,
        default=1000,
        metavar='N',
        help='input batch size for testing per executor(default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=135,
                        metavar='N',
                        help='number of epochs to train (default: 135)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--lrd',
                        type=float,
                        default=0.0,
                        metavar='LRD',
                        help='learning rate decay(default: 0.0)')
    parser.add_argument('--wd',
                        type=float,
                        default=5e-4,
                        metavar='WD',
                        help='weight decay(default: 5e-4)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        metavar='momentum',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dampening',
                        type=float,
                        default=0.0,
                        metavar='dampening',
                        help='dampening (default: 0.0)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    # NOTE: --save-model is accepted for CLI compatibility, but main() below
    # never reads args.save_model.
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    return parser.parse_args()


def _build_cifar10_loaders(args):
    """Create the CIFAR10 training and test ``DataLoader`` objects.

    Args:
        args: parsed options; ``dir``, ``batch_size`` and ``test_batch_size``
            are read.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    # Per-channel (R, G, B) statistics passed to Normalize for both splits.
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2023, 0.1994, 0.2010)

    # Training data is augmented before normalization.
    transform_train = transforms.Compose([
        # Pad 4 pixels on every side, then take a random 32x32 crop.
        transforms.RandomCrop(32, padding=4),
        # Randomly flip the image horizontally.
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])
    # Test data is only converted and normalized (no augmentation).
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    train_set = datasets.CIFAR10(args.dir,
                                 train=True,
                                 download=True,
                                 transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=2)

    # No download flag here: the training-set constructor above already ran
    # with download=True against the same directory.
    test_set = datasets.CIFAR10(args.dir,
                                train=False,
                                transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              num_workers=2)
    return train_loader, test_loader


def _init_cifar10_spark():
    """Initialize Spark on YARN when fully configured, otherwise locally.

    YARN mode is used only when BOTH ``HADOOP_CONF_DIR`` and
    ``ZOO_CONDA_NAME`` are set to non-empty values. Previously only
    ``HADOOP_CONF_DIR`` was checked, so a missing ``ZOO_CONDA_NAME`` passed
    ``conda_name=None`` to ``init_spark_on_yarn``.

    Returns:
        Whatever ``init_spark_on_local`` / ``init_spark_on_yarn`` returns
        (presumably the SparkContext -- confirm against the zoo API).
    """
    hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
    # The name of the created conda-env.
    zoo_conda_name = os.environ.get('ZOO_CONDA_NAME')

    if not (hadoop_conf_dir and zoo_conda_name):
        return init_spark_on_local(cores=1,
                                   conf={"spark.driver.memory": "20g"})

    num_executors = 2
    num_cores_per_executor = 4
    return init_spark_on_yarn(hadoop_conf=hadoop_conf_dir,
                              conda_name=zoo_conda_name,
                              num_executor=num_executors,
                              executor_cores=num_cores_per_executor,
                              executor_memory="2g",
                              driver_memory="10g",
                              driver_cores=1,
                              spark_conf={
                                  "spark.rpc.message.maxSize":
                                  "1024",
                                  "spark.task.maxFailures":
                                  "1",
                                  "spark.driver.extraJavaOptions":
                                  "-Dbigdl.failure.retryTimes=1"
                              })


def main():
    """Train ResNet18 on CIFAR10 through the zoo ``Estimator``.

    Steps: parse the command line, seed torch, build the CIFAR10 data
    loaders, initialize Spark (YARN or local), wrap the PyTorch model and
    loss for zoo, and run ``train_minibatch`` for ``--epochs`` epochs with
    accuracy validation on the test set.
    """
    # Training settings
    args = _parse_cifar10_args()

    torch.manual_seed(args.seed)

    train_loader, test_loader = _build_cifar10_loaders(args)

    # Called for its side effect; the returned context is not used here.
    _init_cifar10_spark()

    model = ResNet18()
    model.train()
    criterion = nn.CrossEntropyLoss()

    # Positional order: lr, lr decay, weight decay, momentum, dampening.
    optimizer = SGD(args.lr, args.lrd, args.wd, args.momentum, args.dampening)
    zoo_model = TorchModel.from_pytorch(model)
    zoo_criterion = TorchLoss.from_pytorch(criterion)
    zoo_estimator = Estimator(zoo_model, optim_methods=optimizer)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    test_featureset = FeatureSet.pytorch_dataloader(test_loader)
    from bigdl.optim.optimizer import MaxEpoch, EveryEpoch
    zoo_estimator.train_minibatch(train_featureset,
                                  zoo_criterion,
                                  end_trigger=MaxEpoch(args.epochs),
                                  checkpoint_trigger=EveryEpoch(),
                                  validation_set=test_featureset,
                                  validation_method=[Accuracy()])