def __init__(self, model, loss, optimizer, metrics=None, model_dir=None, bigdl_type="float"):
    """Wrap a PyTorch model, loss and optimizer into a BigDL Spark estimator.

    :param model: a ``torch.nn.Module`` to be trained.
    :param loss: a PyTorch loss callable, or None for a TorchLoss placeholder
        (presumably meaning the model computes its own loss — TODO confirm).
    :param optimizer: a PyTorch optimizer, an orca optimizer, or None
        (defaults to SGD with the Default learning-rate schedule).
    :param metrics: optional metric or list of metrics; converted via
        ``Metric.convert_metrics_list``.
    :param model_dir: optional directory passed to the Spark estimator
        (used for checkpoints/summaries).
    :param bigdl_type: numeric type string for the BigDL backend.
    :raises ValueError: if ``optimizer`` is neither a PyTorch nor an orca
        optimizer.
    """
    from zoo.pipeline.api.torch import TorchModel, TorchLoss, TorchOptim

    # Fix: the original assigned `self.loss = loss` and then unconditionally
    # overwrote it in both branches below — the dead store is removed.
    if loss is None:
        self.loss = TorchLoss()
    else:
        self.loss = TorchLoss.from_pytorch(loss)

    if optimizer is None:
        from zoo.orca.learn.optimizers.schedule import Default
        # Fall back to orca SGD; it is unwrapped by the OrcaOptimizer branch.
        optimizer = SGD(learningrate_schedule=Default())
    if isinstance(optimizer, TorchOptimizer):
        optimizer = TorchOptim.from_pytorch(optimizer)
    elif isinstance(optimizer, OrcaOptimizer):
        optimizer = optimizer.get_optimizer()
    else:
        raise ValueError(
            "Only PyTorch optimizer and orca optimizer are supported")

    from zoo.orca.learn.metrics import Metric
    self.metrics = Metric.convert_metrics_list(metrics)
    self.log_dir = None
    self.app_name = None
    self.model_dir = model_dir
    self.model = TorchModel.from_pytorch(model)
    self.estimator = SparkEstimator(self.model, optimizer, model_dir, bigdl_type=bigdl_type)
def test_bigdl_pytorch_estimator_dataframe_predict(self):
    """Predict on a Spark DataFrame through an identity network and check
    that every prediction equals its input feature vector."""

    class PassThroughNet(nn.Module):
        def __init__(self):
            super().__init__()
            # keep one parameter so the optimizer's variable list is non-empty
            self.fc1 = nn.Linear(5, 5)

        def forward(self, batch):
            return batch

    def ce_loss(logits, labels):
        return nn.CrossEntropyLoss().forward(logits, labels.flatten().long())

    rows = self.sc.range(0, 100)
    data_df = rows.map(
        lambda i: ([float(i)] * 5, [int(np.random.randint(0, 2, size=()))])
    ).toDF(["feature", "label"])

    with tempfile.TemporaryDirectory() as ckpt_dir:
        est = Estimator.from_torch(model=PassThroughNet(), loss=ce_loss,
                                   optimizer=SGD(learningrate_schedule=Default()),
                                   model_dir=ckpt_dir)
        predictions = est.predict(data_df, feature_cols=["feature"])
        # Count rows where the prediction differs from the feature vector.
        mismatch_expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert predictions.selectExpr(mismatch_expr).first()["error"] == 0
def test_bigdl_pytorch_estimator_shard(self):
    """Fit/evaluate/predict an Estimator on SparkXShards, reload a saved
    checkpoint into a second estimator, and verify label-free prediction
    matches prediction on labeled shards."""

    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(2, 2)

        def forward(self, x):
            x = self.fc(x)
            return F.log_softmax(x, dim=1)

    model = SimpleModel()

    def loss_func(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    def transform(df):
        # Stack the two id columns into an (n, 2) feature matrix.
        result = {
            "x": np.stack([df['user'].to_numpy(), df['item'].to_numpy()], axis=1),
            "y": df['label'].to_numpy()
        }
        return result

    def transform_del_y(d):
        # Drop labels to exercise prediction on feature-only shards.
        result = {"x": d["x"]}
        return result

    OrcaContext.pandas_read_backend = "pandas"
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = read_csv(file_path)
    data_shard = data_shard.transform_shard(transform)

    with tempfile.TemporaryDirectory() as temp_dir_name:
        estimator = Estimator.from_torch(model=model, loss=loss_func,
                                         metrics=[Accuracy()],
                                         optimizer=SGD(learningrate_schedule=Default()),
                                         model_dir=temp_dir_name)
        estimator.fit(data=data_shard, epochs=4, batch_size=2,
                      validation_data=data_shard, checkpoint_trigger=EveryEpoch())
        estimator.evaluate(data_shard, batch_size=2)
        est2 = Estimator.from_torch(model=model, loss=loss_func,
                                    metrics=[Accuracy()], optimizer=None)
        est2.load(temp_dir_name, loss=loss_func)
        est2.fit(data=data_shard, epochs=8, batch_size=2,
                 validation_data=data_shard, checkpoint_trigger=EveryEpoch())
        est2.evaluate(data_shard, batch_size=2)
        pred_result = est2.predict(data_shard)
        pred_c = pred_result.collect()
        # Fix: the original `assert(pred_result, SparkXShards)` asserted a
        # non-empty tuple, which is always True; an isinstance check was meant.
        assert isinstance(pred_result, SparkXShards)
        pred_shard = data_shard.transform_shard(transform_del_y)
        pred_result2 = est2.predict(pred_shard)
        pred_c_2 = pred_result2.collect()
        assert (pred_c[0]["prediction"] == pred_c_2[0]["prediction"]).all()
def test_bigdl_pytorch_estimator_pandas_dataframe(self):
    """Fit, evaluate and predict on XShards read from a pandas-backed CSV,
    then reload the orca checkpoint into a fresh estimator for prediction."""

    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(1, 10)

        def forward(self, x):
            x = torch.unsqueeze(x, dim=1)
            return F.log_softmax(self.fc(x), dim=1)

    def ce_loss(logits, labels):
        return nn.CrossEntropyLoss().forward(logits, labels.flatten().long())

    net = SimpleModel()
    OrcaContext.pandas_read_backend = "pandas"
    csv_path = os.path.join(resource_path, "orca/learn/simple_feature_label.csv")
    shards = read_csv(csv_path)

    with tempfile.TemporaryDirectory() as ckpt_dir:
        est = Estimator.from_torch(model=net, loss=ce_loss, metrics=[Accuracy()],
                                   optimizer=SGD(learningrate_schedule=Default()),
                                   model_dir=ckpt_dir)
        est.fit(data=shards, epochs=1, batch_size=4, feature_cols=['feature'],
                label_cols=['label'], validation_data=shards,
                checkpoint_trigger=EveryEpoch())
        est.evaluate(shards, batch_size=4, feature_cols=['feature'],
                     label_cols=['label'])
        reloaded = Estimator.from_torch(model=net, loss=ce_loss,
                                        metrics=[Accuracy()], optimizer=None)
        reloaded.load_orca_checkpoint(ckpt_dir)
        reloaded.predict(shards, batch_size=4, feature_cols=['feature'])
def test_bigdl_pytorch_estimator_dataframe_fit_evaluate(self):
    """Fit on a Spark DataFrame and verify evaluate() returns a metrics dict."""

    class SimpleModel(nn.Module):
        def __init__(self):
            super(SimpleModel, self).__init__()
            self.fc = nn.Linear(5, 5)

        def forward(self, x):
            return F.log_softmax(self.fc(x), dim=1)

    def ce_loss(logits, labels):
        return nn.CrossEntropyLoss().forward(logits, labels.flatten().long())

    net = SimpleModel()
    records = self.sc.range(0, 100)
    train_df = records.map(
        lambda i: ([float(i)] * 5, [int(np.random.randint(0, 2, size=()))])
    ).toDF(["feature", "label"])

    with tempfile.TemporaryDirectory() as ckpt_dir:
        est = Estimator.from_torch(model=net, loss=ce_loss, metrics=[Accuracy()],
                                   optimizer=SGD(learningrate_schedule=Default()),
                                   model_dir=ckpt_dir)
        est.fit(data=train_df, epochs=4, batch_size=2, validation_data=train_df,
                checkpoint_trigger=EveryEpoch(), feature_cols=["feature"],
                label_cols=["label"])
        metrics = est.evaluate(train_df, batch_size=2, feature_cols=["feature"],
                               label_cols=["label"])
        assert isinstance(metrics, dict)