def test_train_model_with_bn(self):
    """Train a tiny torch model containing BatchNorm through TorchModel/Estimator
    and convert it back to pytorch afterwards."""
    class BNNet(nn.Module):
        def __init__(self):
            super(BNNet, self).__init__()
            self.dense1 = nn.Linear(2, 4)
            self.bn1 = torch.nn.BatchNorm1d(4)
            self.dense2 = nn.Linear(4, 1)

        def forward(self, x):
            # dense -> batchnorm -> dense -> sigmoid, same as the original net
            x = self.bn1(self.dense1(x))
            return torch.sigmoid(self.dense2(x))

    torch_net = BNNet()
    criterion = torch.nn.BCELoss()
    az_model = TorchModel.from_pytorch(torch_net)
    zoo_loss = TorchLoss.from_pytorch(criterion)

    features = torch.Tensor([[1, 2], [1, 3], [3, 2], [5, 6], [8, 9], [1, 9]])
    labels = torch.Tensor([[0], [0], [0], [1], [1], [1]])
    train_loader = DataLoader(TensorDataset(features, labels), batch_size=2)
    train_featureset = FeatureSet.pytorch_dataloader(train_loader)
    val_loader = DataLoader(TensorDataset(features, labels), batch_size=2)
    val_featureset = FeatureSet.pytorch_dataloader(val_loader)

    estimator = Estimator(az_model, optim_methods=Adam())
    estimator.train_minibatch(train_featureset, zoo_loss,
                              end_trigger=MaxEpoch(4),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=val_featureset,
                              validation_method=[Accuracy()])
    # smoke-check that the trained model converts back to pytorch
    trained_model = az_model.to_pytorch()
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
    """Evaluate the model with the metrics configured on this estimator.

    :param data: evaluation data: a SparkXShards, a Spark DataFrame, a
        pytorch DataLoader or a callable data_creator.
    :param batch_size: batch size used for RDD-based evaluation. Default: 32.
    :param feature_cols: feature column name(s); only used for DataFrame input.
    :param label_cols: label column name(s); only used for DataFrame input.
    :return: evaluation results converted to a dict.
    """
    from zoo.orca.data.utils import xshard_to_sample

    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, \
        "metrics shouldn't be None, please specify the metrics" \
        " argument when creating this estimator."

    if isinstance(data, SparkXShards):
        sample_rdd = data.rdd.flatMap(xshard_to_sample)
        feature_set = FeatureSet.sample_rdd(sample_rdd)
        result = self.estimator.evaluate(feature_set, self.metrics, batch_size)
    elif isinstance(data, DataFrame):
        schema = data.schema
        sample_rdd = data.rdd.map(
            lambda row: row_to_sample(row, schema, feature_cols, label_cols))
        feature_set = FeatureSet.sample_rdd(sample_rdd)
        result = self.estimator.evaluate(feature_set, self.metrics, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        feature_set = FeatureSet.pytorch_dataloader(data)
        result = self.estimator.evaluate_minibatch(feature_set, self.metrics)
    else:
        raise ValueError(
            "Data should be a SparkXShards, a DataLoader or a callable "
            "data_creator, but get " + data.__class__.__name__)
    return bigdl_metric_results_to_dict(result)
def evaluate(self, data, batch_size=32, feature_cols=None, labels_cols=None,
             validation_metrics=None):
    """Evaluate the model on XShards or DataLoader validation data.

    :param data: a SparkXShards, a pytorch DataLoader or a callable data_creator.
    :param batch_size: batch size for RDD-based evaluation. Default: 32.
    :param feature_cols: unused here; kept for interface compatibility.
    :param labels_cols: unused here; kept for interface compatibility.
    :param validation_metrics: Orca metrics to compute; converted to BigDL metrics.
    :return: raw evaluation results from the underlying estimator.
    """
    from zoo.orca.data.utils import to_sample
    from zoo.orca.learn.metrics import Metrics

    assert data is not None, "validation data shouldn't be None"
    metrics = Metrics.convert_metrics_list(validation_metrics)

    if isinstance(data, SparkXShards):
        feature_set = FeatureSet.sample_rdd(data.rdd.flatMap(to_sample))
        return self.estimator.evaluate(feature_set, metrics, batch_size)
    if isinstance(data, DataLoader) or callable(data):
        feature_set = FeatureSet.pytorch_dataloader(data)
        return self.estimator.evaluate_minibatch(feature_set, metrics)
    raise ValueError(
        "Data should be a SparkXShards, a DataLoader or a callable "
        "data_creator, but get " + data.__class__.__name__)
def fit(self, data, epochs=1, batch_size=32, validation_data=None,
        validation_methods=None, checkpoint_trigger=None):
    """Train the model on ``data`` for ``epochs`` epochs.

    :param data: training data: a SparkXShards, a pytorch DataLoader or a
        callable data_creator returning a DataLoader.
    :param epochs: number of epochs to train. Default: 1.
    :param batch_size: batch size; only used for SparkXShards input. Default: 32.
    :param validation_data: validation data of the same kind as ``data``.
    :param validation_methods: validation methods run on validation_data.
    :param checkpoint_trigger: trigger controlling checkpoint saving.
    :return: self
    """
    from zoo.orca.data.utils import to_sample
    end_trigger = MaxEpoch(epochs)
    assert batch_size > 0, "batch_size should be greater than 0"
    if isinstance(data, SparkXShards):
        train_rdd = data.rdd.flatMap(to_sample)
        train_feature_set = FeatureSet.sample_rdd(train_rdd)
        if validation_data is None:
            val_feature_set = None
        else:
            assert isinstance(validation_data, SparkXShards), \
                "validation_data should be a SparkXShards"
            val_feature_set = FeatureSet.sample_rdd(
                validation_data.rdd.flatMap(to_sample))
        self.estimator.train(train_feature_set, self.loss, end_trigger,
                             checkpoint_trigger, val_feature_set,
                             validation_methods, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
        if validation_data is None:
            val_feature_set = None
        else:
            # BUGFIX: previously checked callable(data) instead of
            # callable(validation_data), so an invalid validation_data slipped
            # through whenever data itself was a callable data_creator.
            assert isinstance(validation_data, DataLoader) or callable(validation_data), \
                "validation_data should be a pytorch DataLoader or a callable data_creator"
            val_feature_set = FeatureSet.pytorch_dataloader(validation_data)
        self.estimator.train_minibatch(train_feature_set, self.loss, end_trigger,
                                       checkpoint_trigger, val_feature_set,
                                       validation_methods)
    else:
        raise ValueError("Data and validation data should be SparkXShards, DataLoaders or "
                         "callable data_creators but get " + data.__class__.__name__)
    return self
def _handle_xshards(self, data, validation_data):
    """Convert a SparkXShards (plus optional validation shards) to FeatureSets.

    :return: (train_feature_set, val_feature_set); val is None when no
        validation data is given.
    """
    train_feature_set = FeatureSet.sample_rdd(data.rdd.flatMap(xshard_to_sample))
    if validation_data is None:
        return train_feature_set, None
    assert isinstance(validation_data, SparkXShards), \
        "validation_data should be a SparkXShards"
    val_rdd = validation_data.rdd.flatMap(xshard_to_sample)
    return train_feature_set, FeatureSet.sample_rdd(val_rdd)
def _hanle_data_loader(self, data, validation_data):
    """Wrap a pytorch DataLoader (or data_creator) pair into FeatureSets.

    NOTE(review): the method name is misspelled ("hanle"); kept as-is because
    renaming would break existing callers.

    :param data: training DataLoader or callable data_creator.
    :param validation_data: optional validation DataLoader or data_creator.
    :return: (train_feature_set, val_feature_set); val is None when no
        validation data is given.
    """
    train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
    if validation_data is None:
        val_feature_set = None
    else:
        # BUGFIX: validate validation_data itself, not the training data —
        # the original checked callable(data), which is always true here.
        assert isinstance(validation_data, DataLoader) or callable(validation_data), \
            "validation_data should be a pytorch DataLoader or a callable data_creator"
        val_feature_set = FeatureSet.pytorch_dataloader(validation_data)
    return train_feature_set, val_feature_set
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, labels_cols=None,
        validation_data=None, validation_metrics=None, checkpoint_trigger=None):
    """Train the model on XShards or DataLoader data.

    :param data: training data: a SparkXShards, a pytorch DataLoader or a
        callable data_creator returning a DataLoader.
    :param epochs: number of epochs to train. Default: 1.
    :param batch_size: batch size; only used for SparkXShards input. Default: 32.
    :param feature_cols: unused here; kept for interface compatibility.
    :param labels_cols: unused here; kept for interface compatibility.
    :param validation_data: validation data of the same kind as ``data``.
    :param validation_metrics: Orca metrics computed on validation_data.
    :param checkpoint_trigger: Orca trigger controlling checkpointing.
    :return: self
    """
    from zoo.orca.data.utils import to_sample
    from zoo.orca.learn.metrics import Metrics
    from zoo.orca.learn.trigger import Trigger

    end_trigger = MaxEpoch(epochs)
    assert batch_size > 0, "batch_size should be greater than 0"
    validation_metrics = Metrics.convert_metrics_list(validation_metrics)
    checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

    # Enable tensorboard logging only when both a log dir and app name are set.
    if self.log_dir is not None and self.app_name is not None:
        self.estimator.set_tensorboard(self.log_dir, self.app_name)

    if isinstance(data, SparkXShards):
        train_rdd = data.rdd.flatMap(to_sample)
        train_feature_set = FeatureSet.sample_rdd(train_rdd)
        if validation_data is None:
            val_feature_set = None
        else:
            assert isinstance(validation_data, SparkXShards), \
                "validation_data should be a SparkXShards"
            val_feature_set = FeatureSet.sample_rdd(
                validation_data.rdd.flatMap(to_sample))
        self.estimator.train(train_feature_set, self.loss, end_trigger,
                             checkpoint_trigger, val_feature_set,
                             validation_metrics, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        train_feature_set = FeatureSet.pytorch_dataloader(data, "", "")
        if validation_data is None:
            val_feature_set = None
        else:
            # BUGFIX: previously checked callable(data) instead of
            # callable(validation_data), so an invalid validation_data slipped
            # through whenever data itself was a callable data_creator.
            assert isinstance(validation_data, DataLoader) or callable(validation_data), \
                "validation_data should be a pytorch DataLoader or a callable data_creator"
            val_feature_set = FeatureSet.pytorch_dataloader(validation_data)
        self.estimator.train_minibatch(train_feature_set, self.loss, end_trigger,
                                       checkpoint_trigger, val_feature_set,
                                       validation_metrics)
    else:
        raise ValueError(
            "Data and validation data should be SparkXShards, DataLoaders or "
            "callable data_creators but get " + data.__class__.__name__)
    return self
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None,
             validation_metrics=None):
    """Evaluate model.

    :param data: evaluation data. It can be an XShards, Spark Dataframe,
        PyTorch DataLoader or PyTorch DataLoader creator function. If data
        is an XShards, each partition can be a Pandas DataFrame or a
        dictionary of {'x': feature, 'y': label}, where feature(label) is a
        numpy array or a list of numpy arrays.
    :param batch_size: Batch size used for evaluation. Only used when data
        is a SparkXShard.
    :param feature_cols: Feature column name(s) of data. Only used when data
        is a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
    :param label_cols: Label column name(s) of data. Only used when data is
        a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
    :param validation_metrics: Orca validation metrics to be computed on
        validation_data.
    :return: validation results.
    """
    from zoo.orca.data.utils import xshard_to_sample

    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, \
        "metrics shouldn't be None, please specify the metrics" \
        " argument when creating this estimator."

    if isinstance(data, SparkXShards):
        # Pandas-backed shards are first converted into the x/y dict layout.
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            data = process_xshards_of_pandas_dataframe(data, feature_cols, label_cols)
        feature_set = FeatureSet.sample_rdd(data.rdd.flatMap(xshard_to_sample))
        result = self.estimator.evaluate(feature_set, self.metrics, batch_size)
    elif isinstance(data, DataFrame):
        schema = data.schema
        feature_set = FeatureSet.sample_rdd(data.rdd.map(
            lambda row: row_to_sample(row, schema, feature_cols, label_cols)))
        result = self.estimator.evaluate(feature_set, self.metrics, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        feature_set = FeatureSet.pytorch_dataloader(data)
        result = self.estimator.evaluate_minibatch(feature_set, self.metrics)
    else:
        raise ValueError(
            "Data should be a SparkXShards, a DataLoader or a callable "
            "data_creator, but get " + data.__class__.__name__)
    return bigdl_metric_results_to_dict(result)
def _handle_dataframe(self, data, validation_data, feature_cols, label_cols):
    """Convert a Spark DataFrame (plus optional validation frame) to FeatureSets.

    Both frames are mapped with the training frame's schema.

    :return: (train_feature_set, val_feature_set); val is None when no
        validation data is given.
    """
    schema = data.schema

    def to_sample_fn(row):
        return row_to_sample(row, schema, feature_cols, label_cols)

    train_feature_set = FeatureSet.sample_rdd(data.rdd.map(to_sample_fn))
    if validation_data is None:
        return train_feature_set, None
    assert isinstance(validation_data, DataFrame), \
        "validation_data should also be a DataFrame"
    val_feature_set = FeatureSet.sample_rdd(validation_data.rdd.map(to_sample_fn))
    return train_feature_set, val_feature_set
def test_estimator_train(self):
    """End-to-end Estimator training smoke test on a small image sample RDD."""
    batch_size = 8
    epoch_num = 5
    images, labels = TestEstimator._generate_image_data(data_num=8,
                                                        img_shape=(3, 224, 224))
    image_rdd = self.sc.parallelize(images)
    labels = self.sc.parallelize(labels)
    sample_rdd = image_rdd.zip(labels).map(
        lambda img_label: zoo.common.Sample.from_ndarray(
            img_label[0], img_label[1]))
    data_set = FeatureSet.sample_rdd(sample_rdd)
    model = TestEstimator._create_cnn_model()
    optim_method = SGD(learningrate=0.01)
    estimator = Estimator(model, optim_method, "")
    estimator.set_constant_gradient_clipping(0.1, 1.2)
    estimator.train(train_set=data_set, criterion=ClassNLLCriterion(),
                    end_trigger=MaxEpoch(epoch_num),
                    checkpoint_trigger=EveryEpoch(),
                    validation_set=data_set,
                    validation_method=[Top1Accuracy()],
                    batch_size=batch_size)
    predict_result = model.predict(sample_rdd)
    # BUGFIX: `assert (predict_result.count(), 8)` asserted a non-empty tuple,
    # which is always truthy and could never fail; compare the count instead.
    assert predict_result.count() == 8
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
    """Evaluate model.

    :param data: validation data. It can be XShards, each partition is a
        dictionary of {'x': feature, 'y': label}, where feature(label) is a
        numpy array or a list of numpy arrays.
    :param batch_size: Batch size used for validation. Default: 32.
    :param feature_cols: (Not supported yet) Feature column name(s) of data.
        Only used when data is a Spark DataFrame. Default: None.
    :param label_cols: (Not supported yet) Label column name(s) of data.
        Only used when data is a Spark DataFrame. Default: None.
    :return: evaluation results as a dict.
    """
    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, \
        "metrics shouldn't be None, please specify the metrics" \
        " argument when creating this estimator."
    if isinstance(data, DataFrame):
        raise NotImplementedError
    if isinstance(data, SparkXShards):
        from zoo.orca.data.utils import xshard_to_sample
        sample_rdd = data.rdd.flatMap(xshard_to_sample)
        feature_set = FeatureSet.sample_rdd(sample_rdd)
        result = self.estimator.evaluate(feature_set, self.metrics, batch_size)
        return bigdl_metric_results_to_dict(result)
    raise ValueError(
        "Data should be XShards or Spark DataFrame, but get " + data.__class__.__name__)
def _get_training_data(self):
    """Create the JVM-backed TF-data training FeatureSet for this dataset."""
    return FeatureSet(jvalue=callZooFunc(
        "float", "createTFDataFeatureSet",
        self.rdd.map(lambda x: x[0]), self.init_op_name, self.table_init_op,
        self.output_names, self.output_types, self.shard_index_op_name,
        self.inter_threads, self.intra_threads))
def input_fn(mode):
    """Return the training TFDataset for TRAIN mode; other modes are unsupported.

    Closes over ``self`` from the enclosing scope to fetch the raw image set.
    """
    if mode != tf.estimator.ModeKeys.TRAIN:
        raise NotImplementedError
    raw_images = self.get_raw_image_set(with_label=True)
    feature_set = FeatureSet.image_frame(raw_images.to_image_frame())
    # Standard ImageNet-style augmentation + normalization pipeline.
    preprocessing = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"]),
    ])
    feature_set = feature_set.transform(preprocessing)
    feature_set = feature_set.transform(ImageFeatureToSample())
    return TFDataset.from_feature_set(feature_set,
                                      features=(tf.float32, [224, 224, 3]),
                                      labels=(tf.int32, [1]),
                                      batch_size=8)
def get_training_data(self):
    """Wrap self.rdd as a sample FeatureSet, attaching a dummy zero label."""
    samples = self.rdd.map(
        lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
    return FeatureSet.sample_rdd(samples,
                                 sequential_order=self.sequential_order,
                                 shuffle=self.shuffle)
def get_training_data(self):
    """Build a mini-batch RDD FeatureSet from the training string RDD via the JVM."""
    batches = callZooFunc("float", "createMiniBatchRDDFromStringRDD",
                          self.train_rdd, self.batch_size)
    return FeatureSet.rdd(batches.value().toJavaRDD(),
                          sequential_order=self.sequential_order,
                          shuffle=self.shuffle)
def _get_validation_data(self):
    """Create the TF-data validation FeatureSet, or None when no validation set."""
    if self.validation_dataset is None:
        return None
    jvalue = callZooFunc("float", "createTFDataFeatureSet",
                         self.val_rdd.map(lambda x: x[0]), self.init_op_name,
                         self.table_init_op, self.output_names,
                         self.output_types, self.shard_index_op_name)
    return FeatureSet(jvalue=jvalue)
def get_training_data(self):
    """FeatureSet over text-set samples; real labels are packed into the
    features and a dummy zero label is attached."""
    def to_train_sample(sample):
        return Sample.from_jtensor(features=sample.features + sample.labels,
                                   labels=JTensor.from_ndarray(np.array([0.0])))

    samples = self.text_set.get_samples().map(to_train_sample)
    return FeatureSet.sample_rdd(samples,
                                 sequential_order=self.sequential_order,
                                 shuffle=self.shuffle)
def get_training_data(self):
    """Image-set training FeatureSet with feature/label merge applied."""
    feature_set = FeatureSet.image_set(self.image_set,
                                       sequential_order=self.sequential_order,
                                       shuffle=self.shuffle)
    feature_set = feature_set.transform(MergeFeatureLabelImagePreprocessing())
    return feature_set.transform(ImageFeatureToSample())
def get_validation_data(self):
    """Mini-batch FeatureSet for the validation string RDD, or None."""
    if self.validation_rdd is None:
        return None
    jvalue = callZooFunc("float", "createMiniBatchFeatureSetFromStringRDD",
                         self.validation_rdd, self.batch_size,
                         self.sequential_order, self.shuffle)
    return FeatureSet(jvalue)
def evaluate(self, data, batch_size=32, feature_cols="features", label_cols="label"):
    """Evaluate model.

    :param data: validation data. It can be XShards or Spark DataFrame;
        each XShards partition is a dictionary of {'x': feature, 'y': label},
        where feature(label) is a numpy array or a list of numpy arrays.
    :param batch_size: Batch size used for validation. Default: 32.
    :param feature_cols: Feature column name(s) of data. Only used when data
        is a Spark DataFrame. Default: "features".
    :param label_cols: Label column name(s) of data. Only used when data is
        a Spark DataFrame. Default: "label".
    :return: evaluation results converted to a dict.
    """
    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, \
        "metrics shouldn't be None, please specify the metrics" \
        " argument when creating this estimator."
    if isinstance(data, DataFrame):
        # Multiple feature/label columns are first combined into single
        # vector columns expected by the NNEstimator.
        if isinstance(feature_cols, list):
            data, _, feature_cols = \
                BigDLEstimator._combine_cols(data, [feature_cols], col_name="features")
        if isinstance(label_cols, list):
            data, _, label_cols = \
                BigDLEstimator._combine_cols(data, label_cols, col_name="label")
        self.nn_estimator._setNNBatchSize(batch_size)._setNNFeaturesCol(feature_cols) \
            ._setNNLabelCol(label_cols)
        self.nn_estimator.setValidation(None, None, self.metrics, batch_size)
        if self.log_dir is not None and self.app_name is not None:
            from bigdl.optim.optimizer import TrainSummary
            from bigdl.optim.optimizer import ValidationSummary
            val_summary = ValidationSummary(log_dir=self.log_dir, app_name=self.app_name)
            self.nn_estimator.setValidationSummary(val_summary)
        result = self.nn_estimator._eval(data)
    elif isinstance(data, SparkXShards):
        from zoo.orca.data.utils import xshard_to_sample
        val_feature_set = FeatureSet.sample_rdd(
            data.rdd.flatMap(xshard_to_sample))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    else:
        raise ValueError(
            "Data should be XShards or Spark DataFrame, but get " + data.__class__.__name__)
    # BUGFIX: the original ended with a bare `return` followed by a dangling
    # `bigdl_metric_results_to_dict(result)` expression, so the computed
    # metrics were never returned; return the converted dict explicitly.
    return bigdl_metric_results_to_dict(result)
def get_validation_data(self):
    """Sample FeatureSet over the validation RDD with dummy labels, or None."""
    if self.val_rdd is None:
        return None
    samples = self.val_rdd.map(
        lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
    return FeatureSet.sample_rdd(samples,
                                 sequential_order=self.sequential_order,
                                 shuffle=self.shuffle)
def get_validation_data(self):
    """Validation FeatureSet with dummy labels, batched into mini-batches,
    or None when there is no validation RDD."""
    if self.val_rdd is None:
        return None
    samples = self.val_rdd.map(
        lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
    feature_set = FeatureSet.sample_rdd(samples,
                                        sequential_order=self.sequential_order,
                                        shuffle=self.shuffle)
    return feature_set.transform(SampleToMiniBatch(self.batch_size))
def get_validation_data(self):
    """Mini-batch RDD FeatureSet for the validation string RDD, or None."""
    if self.validation_rdd is None:
        return None
    batches = callZooFunc("float", "createMiniBatchRDDFromStringRDD",
                          self.validation_rdd, self.batch_size)
    return FeatureSet.rdd(batches.value().toJavaRDD(),
                          sequential_order=self.sequential_order,
                          shuffle=self.shuffle)
def get_validation_data(self):
    """Validation image FeatureSet, merged and batched, or None."""
    if self.validation_image_set is None:
        return None
    feature_set = FeatureSet.image_set(self.validation_image_set,
                                       sequential_order=self.sequential_order,
                                       shuffle=self.shuffle)
    # Apply the merge, to-sample and batching transforms in order.
    for transformer in (MergeFeatureLabelImagePreprocessing(),
                        ImageFeatureToSample(),
                        SampleToMiniBatch(self.batch_size)):
        feature_set = feature_set.transform(transformer)
    return feature_set
def get_validation_data(self):
    """Validation image FeatureSet with merge + to-sample transforms, or None."""
    if self.validation_image_set is None:
        return None
    feature_set = FeatureSet.image_set(self.validation_image_set,
                                       sequential_order=self.sequential_order,
                                       shuffle=self.shuffle)
    return feature_set.transform([MergeFeatureLabelImagePreprocessing(),
                                  ImageFeatureToSample()])
def get_validation_data(self):
    """Validation text FeatureSet with labels packed into features, or None."""
    if self.validation_text_set is None:
        return None
    samples = self.validation_text_set.get_samples().map(
        lambda sample: Sample.from_jtensor(
            features=sample.features + sample.labels,
            labels=JTensor.from_ndarray(np.array([0.0]))))
    return FeatureSet.sample_rdd(samples,
                                 sequential_order=self.sequential_order,
                                 shuffle=self.shuffle)
def get_featureset(x, y, shuffle=True):
    """Turn paired torch tensors into a (optionally shuffled) sample FeatureSet.

    Each row of ``x``/``y`` becomes one Sample; closes over the driver ``sc``.
    """
    row_features = np.split(x.data.numpy(), x.shape[0])
    row_labels = np.split(y.data.numpy(), y.shape[0])
    print(row_features[0].shape)
    print(row_labels[0].shape)
    samples = [Sample.from_ndarray(np.squeeze(row_features[i]),
                                   np.squeeze(row_labels[i]))
               for i in range(len(row_features))]
    return FeatureSet.sample_rdd(sc.parallelize(samples), shuffle=shuffle)
def get_validation_data(self):
    """Validation text FeatureSet batched into mini-batches, or None."""
    if self.validation_text_set is None:
        return None
    samples = self.validation_text_set.get_samples().map(
        lambda sample: Sample.from_jtensor(
            features=sample.features + sample.labels,
            labels=JTensor.from_ndarray(np.array([0.0]))))
    feature_set = FeatureSet.sample_rdd(samples,
                                        sequential_order=self.sequential_order,
                                        shuffle=self.shuffle)
    return feature_set.transform(SampleToMiniBatch(self.batch_size))
def create_train_features_Set(self):
    """Build the training image FeatureSet with the standard augmentation chain."""
    image_set = self.get_raw_image_set(with_label=True)
    feature_set = FeatureSet.image_frame(image_set.to_image_frame())
    # ImageNet-style resize/crop/flip + channel normalization pipeline.
    augmentations = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"]),
    ])
    return feature_set.transform(augmentations)
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
    """Evaluate the model on XShards validation data.

    :param data: validation data; an XShards whose partitions are dicts of
        {'x': feature, 'y': label}. Spark DataFrame input is not implemented.
    :param batch_size: batch size used for validation. Default: 32.
    :param feature_cols: unused here; kept for interface compatibility.
    :param label_cols: unused here; kept for interface compatibility.
    :return: evaluation results as a dict.
    """
    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, \
        "metrics shouldn't be None, please specify the metrics" \
        " argument when creating this estimator."
    if isinstance(data, DataFrame):
        raise NotImplementedError
    elif isinstance(data, SparkXShards):
        # Removed the unused `from zoo.orca.learn.metrics import Metrics`
        # import that was never referenced in this branch.
        from zoo.orca.data.utils import xshard_to_sample
        val_feature_set = FeatureSet.sample_rdd(data.rdd.flatMap(xshard_to_sample))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    else:
        raise ValueError("Data should be XShards or Spark DataFrame, but get "
                         + data.__class__.__name__)
    return bigdl_metric_results_to_dict(result)