def _handle_dataframe(self, data, validation_data, feature_cols, label_cols):
    """Convert a training Spark DataFrame (and an optional validation
    DataFrame) into BigDL FeatureSets of Samples.

    :param data: training data as a Spark DataFrame.
    :param validation_data: optional validation Spark DataFrame, or None.
    :param feature_cols: feature column name(s) passed to row_to_sample.
    :param label_cols: label column name(s) passed to row_to_sample.
    :return: a tuple (train_feature_set, val_feature_set); val_feature_set
        is None when validation_data is None.
    """
    schema = data.schema
    train_rdd = data.rdd.map(
        lambda row: row_to_sample(row, schema, feature_cols, label_cols))
    train_feature_set = FeatureSet.sample_rdd(train_rdd)
    if validation_data is None:
        val_feature_set = None
    else:
        assert isinstance(validation_data, DataFrame), "validation_data should also be a " \
                                                       "DataFrame"
        # Bug fix: map validation rows with the validation DataFrame's OWN
        # schema. The original captured the training schema in this lambda,
        # which silently mis-maps columns whenever the two DataFrames differ
        # (e.g. in column order).
        val_schema = validation_data.schema
        val_feature_set = FeatureSet.sample_rdd(validation_data.rdd.map(
            lambda row: row_to_sample(row, val_schema, feature_cols, label_cols)))
    return train_feature_set, val_feature_set
def predict(self, data, batch_size=4, feature_cols=None):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards or a Spark
           DataFrame. If it is an XShards, each partition is a dictionary of
           {'x': feature}, where feature is a numpy array or a list of numpy
           arrays.
    :param batch_size: batch size used for inference.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame. Default: None.
    :return: predicted result. If data is an XShards, the result is an
             XShards where each partition is a dictionary of
             {'prediction': result}, where result is a numpy array or a list
             of numpy arrays. If data is a Spark DataFrame, the result is a
             Spark DataFrame with the predictions attached.
    """
    from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
    # Decide the input kind once; it also determines the output container,
    # avoiding the duplicated isinstance check the original performed after
    # prediction.
    is_xshards = isinstance(data, SparkXShards)
    if is_xshards:
        from zoo.orca.data.utils import xshard_to_sample
        data_rdd = data.rdd.flatMap(xshard_to_sample)
    elif isinstance(data, DataFrame):
        schema = data.schema
        data_rdd = data.rdd.map(
            lambda row: row_to_sample(row, schema, feature_cols, None))
    else:
        # Bug fix: the original message claimed only XShards is accepted,
        # although a Spark DataFrame is handled above.
        raise ValueError(
            "Data should be an XShards (each element being {'x': a feature "
            "numpy array}) or a Spark DataFrame, but got "
            + data.__class__.__name__)
    predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)
    if is_xshards:
        result = convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
    return result
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
    """
    Evaluate the model on the given data with the metrics configured on this
    estimator.

    :param data: evaluation data. It can be an XShards, a Spark DataFrame, a
           PyTorch DataLoader, or a callable DataLoader creator function.
    :param batch_size: batch size used for evaluation. Not used for the
           DataLoader / data-creator inputs, which carry their own batching.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame. Default: None.
    :param label_cols: Label column name(s) of data. Only used when data is a
           Spark DataFrame. Default: None.
    :return: evaluation results as a dict of metric name to value.
    """
    from zoo.orca.data.utils import xshard_to_sample
    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                     " argument when creating this estimator."
    if isinstance(data, SparkXShards):
        val_feature_set = FeatureSet.sample_rdd(
            data.rdd.flatMap(xshard_to_sample))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    elif isinstance(data, DataFrame):
        schema = data.schema
        val_feature_set = FeatureSet.sample_rdd(
            data.rdd.map(lambda row: row_to_sample(
                row, schema, feature_cols, label_cols)))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        val_feature_set = FeatureSet.pytorch_dataloader(data)
        result = self.estimator.evaluate_minibatch(val_feature_set, self.metrics)
    else:
        # Bug fix: the original message omitted the Spark DataFrame input
        # type, which the branch above accepts.
        raise ValueError(
            "Data should be a SparkXShards, a Spark DataFrame, a DataLoader "
            "or a callable data_creator, but got " + data.__class__.__name__)
    return bigdl_metric_results_to_dict(result)
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None,
             validation_metrics=None):
    """
    Evaluate the model on the given data with the metrics configured on this
    estimator.

    :param data: evaluation data. It can be an XShards, a Spark DataFrame, a
           PyTorch DataLoader, or a callable DataLoader creator function. If
           data is an XShards, each partition can be a Pandas DataFrame or a
           dictionary of {'x': feature, 'y': label}, where feature(label) is
           a numpy array or a list of numpy arrays.
    :param batch_size: Batch size used for evaluation. Only used when data is
           a SparkXShard.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame or an XShards of Pandas DataFrame.
           Default: None.
    :param label_cols: Label column name(s) of data. Only used when data is a
           Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
    :param validation_metrics: Orca validation metrics to be computed on
           validation_data. NOTE(review): this parameter is currently
           ignored by the body below — evaluation always uses self.metrics;
           kept for interface compatibility, confirm intended behavior.
    :return: validation results as a dict of metric name to value.
    """
    from zoo.orca.data.utils import xshard_to_sample
    assert data is not None, "validation data shouldn't be None"
    assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
                                     " argument when creating this estimator."
    if isinstance(data, SparkXShards):
        # XShards of Pandas DataFrames must first be converted to the
        # {'x': ..., 'y': ...} dictionary layout before sampling.
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            data = process_xshards_of_pandas_dataframe(
                data, feature_cols, label_cols)
        val_feature_set = FeatureSet.sample_rdd(
            data.rdd.flatMap(xshard_to_sample))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    elif isinstance(data, DataFrame):
        schema = data.schema
        val_feature_set = FeatureSet.sample_rdd(
            data.rdd.map(lambda row: row_to_sample(
                row, schema, feature_cols, label_cols)))
        result = self.estimator.evaluate(val_feature_set, self.metrics, batch_size)
    elif isinstance(data, DataLoader) or callable(data):
        val_feature_set = FeatureSet.pytorch_dataloader(data)
        result = self.estimator.evaluate_minibatch(val_feature_set, self.metrics)
    else:
        # Bug fix: the original message omitted the Spark DataFrame input
        # type, which the branch above accepts.
        raise ValueError(
            "Data should be a SparkXShards, a Spark DataFrame, a DataLoader "
            "or a callable data_creator, but got " + data.__class__.__name__)
    return bigdl_metric_results_to_dict(result)
def predict(self, data, batch_size=4, feature_cols=None):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards (each partition a
           dictionary of {'x': feature}, where feature is a numpy array or a
           list of numpy arrays) or a Spark DataFrame.
    :param batch_size: batch size used for inference.
    :param feature_cols: Feature column name(s) of data. Only used when data
           is a Spark DataFrame. Default: None.
    :return: predicted result, as an XShards when data is an XShards, or as a
             Spark DataFrame when data is a Spark DataFrame.
    """
    from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
    # Decide the input kind once; it also determines the output container,
    # avoiding the duplicated isinstance check the original performed after
    # prediction.
    is_xshards = isinstance(data, SparkXShards)
    if is_xshards:
        from zoo.orca.data.utils import xshard_to_sample
        data_rdd = data.rdd.flatMap(xshard_to_sample)
    elif isinstance(data, DataFrame):
        schema = data.schema
        data_rdd = data.rdd.map(
            lambda row: row_to_sample(row, schema, feature_cols, None))
    else:
        # Bug fix: the original message claimed only XShards is accepted,
        # although a Spark DataFrame is handled above.
        raise ValueError(
            "Data should be an XShards (each element being {'x': a feature "
            "numpy array}) or a Spark DataFrame, but got "
            + data.__class__.__name__)
    predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)
    if is_xshards:
        result = convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
    return result