Example #1
    def __init__(self,
                 model,
                 loss,
                 optimizer,
                 metrics=None,
                 model_dir=None,
                 bigdl_type="float"):
        from zoo.pipeline.api.torch import TorchModel, TorchLoss, TorchOptim
        self.loss = loss
        if self.loss is None:
            self.loss = TorchLoss()
        else:
            self.loss = TorchLoss.from_pytorch(loss)
        if optimizer is None:
            from zoo.orca.learn.optimizers import SGD
            from zoo.orca.learn.optimizers.schedule import Default
            optimizer = SGD(learningrate_schedule=Default())
        # TorchOptimizer (torch.optim.Optimizer) and OrcaOptimizer (the Orca
        # optimizer base class) are assumed to be imported at module level.
        if isinstance(optimizer, TorchOptimizer):
            optimizer = TorchOptim.from_pytorch(optimizer)
        elif isinstance(optimizer, OrcaOptimizer):
            optimizer = optimizer.get_optimizer()
        else:
            raise ValueError(
                "Only PyTorch optimizers and Orca optimizers are supported")
        from zoo.orca.learn.metrics import Metric
        self.metrics = Metric.convert_metrics_list(metrics)
        self.log_dir = None
        self.app_name = None
        self.model_dir = model_dir
        self.model = TorchModel.from_pytorch(model)
        # SparkEstimator is likewise assumed to be imported at module level.
        self.estimator = SparkEstimator(self.model,
                                        optimizer,
                                        model_dir,
                                        bigdl_type=bigdl_type)
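The two isinstance branches above correspond to the two kinds of optimizer a caller may pass. A minimal sketch of both, assuming the Analytics Zoo Orca API used in the tests below (the model and learning rate are illustrative):

    import torch
    import torch.nn as nn
    from zoo.orca.learn.optimizers import SGD
    from zoo.orca.learn.optimizers.schedule import Default

    net = nn.Linear(5, 2)
    # PyTorch branch: wrapped via TorchOptim.from_pytorch
    torch_opt = torch.optim.SGD(net.parameters(), lr=0.01)
    # Orca branch: unwrapped via get_optimizer()
    orca_opt = SGD(learningrate_schedule=Default())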
Example #2
    def test_bigdl_pytorch_estimator_dataframe_predict(self):
        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        class IdentityNet(nn.Module):
            def __init__(self):
                super().__init__()
                # needed so the optimizer does not raise on an empty parameter list
                self.fc1 = nn.Linear(5, 5)

            def forward(self, input_):
                return input_

        model = IdentityNet()
        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            result = estimator.predict(df, feature_cols=["feature"])
            expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
            assert result.selectExpr(expr).first()["error"] == 0
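Because IdentityNet returns its input unchanged, every prediction should equal its feature vector; the selectExpr counts mismatching rows (to_array is assumed to be a helper registered by the test session). An equivalent driver-side check on this small result, as a sketch:

    rows = result.collect()
    assert all(list(r["feature"]) == list(r["prediction"]) for r in rows)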
Example #3
    def test_bigdl_pytorch_estimator_shard(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(2, 2)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input, target.flatten().long())

        def transform(df):
            result = {
                "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
                "y": df['label'].to_numpy()
            }
            return result

        def transform_del_y(d):
            result = {"x": d["x"]}
            return result

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        data_shard = read_csv(file_path)
        data_shard = data_shard.transform_shard(transform)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(model=model, loss=loss_func,
                                             metrics=[Accuracy()],
                                             optimizer=SGD(learningrate_schedule=Default()),
                                             model_dir=temp_dir_name)
            estimator.fit(data=data_shard, epochs=4, batch_size=2, validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard, batch_size=2)
            est2 = Estimator.from_torch(model=model, loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load(temp_dir_name, loss=loss_func)
            est2.fit(data=data_shard, epochs=8, batch_size=2, validation_data=data_shard,
                     checkpoint_trigger=EveryEpoch())
            est2.evaluate(data_shard, batch_size=2)
            pred_result = est2.predict(data_shard)
            pred_c = pred_result.collect()
            assert isinstance(pred_result, SparkXShards)
            pred_shard = data_shard.transform_shard(transform_del_y)
            pred_result2 = est2.predict(pred_shard)
            pred_c_2 = pred_result2.collect()
            assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
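For reference, after transform each element of data_shard is a dict of numpy arrays, which is the format fit, evaluate, and predict consume in this test; a standalone illustration of one element (values are made up):

    import numpy as np
    shard_element = {"x": np.array([[0.0, 1.0], [1.0, 0.0]], dtype=np.float32),
                     "y": np.array([1, 0])}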
Example #4
    def test_bigdl_pytorch_estimator_pandas_dataframe(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(1, 10)

            def forward(self, x):
                x = torch.unsqueeze(x, dim=1)
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        model = SimpleModel()

        OrcaContext.pandas_read_backend = "pandas"
        file_path = os.path.join(resource_path,
                                 "orca/learn/simple_feature_label.csv")
        data_shard = read_csv(file_path)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=data_shard,
                          epochs=1,
                          batch_size=4,
                          feature_cols=['feature'],
                          label_cols=['label'],
                          validation_data=data_shard,
                          checkpoint_trigger=EveryEpoch())
            estimator.evaluate(data_shard,
                               batch_size=4,
                               feature_cols=['feature'],
                               label_cols=['label'])
            est2 = Estimator.from_torch(model=model,
                                        loss=loss_func,
                                        metrics=[Accuracy()],
                                        optimizer=None)
            est2.load_orca_checkpoint(temp_dir_name)
            est2.predict(data_shard, batch_size=4, feature_cols=['feature'])
Example #5
    def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
        class SimpleModel(nn.Module):
            def __init__(self):
                super(SimpleModel, self).__init__()
                self.fc = nn.Linear(5, 5)

            def forward(self, x):
                x = self.fc(x)
                return F.log_softmax(x, dim=1)

        model = SimpleModel()

        def loss_func(input, target):
            return nn.CrossEntropyLoss().forward(input,
                                                 target.flatten().long())

        rdd = self.sc.range(0, 100)
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        with tempfile.TemporaryDirectory() as temp_dir_name:
            estimator = Estimator.from_torch(
                model=model,
                loss=loss_func,
                metrics=[Accuracy()],
                optimizer=SGD(learningrate_schedule=Default()),
                model_dir=temp_dir_name)
            estimator.fit(data=df,
                          epochs=4,
                          batch_size=2,
                          validation_data=df,
                          checkpoint_trigger=EveryEpoch(),
                          feature_cols=["feature"],
                          label_cols=["label"])
            eval_result = estimator.evaluate(df,
                                             batch_size=2,
                                             feature_cols=["feature"],
                                             label_cols=["label"])
            assert isinstance(eval_result, dict)
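The evaluate call returns a dict mapping metric names to their values; the exact key names depend on the BigDL backend (an assumption here), so a generic way to inspect the result:

    for name, value in eval_result.items():
        print(name, value)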