# Load the MNIST test split and publish it to the Hopsworks feature store
# as a TFRecords training dataset with train/test/validate splits.
# NOTE(review): `load_dataset`, `directory`, and `td` are assumed to be
# defined earlier in the file — confirm against the full source.
test_images, test_labels = load_dataset(directory, 't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz')

from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("create_mnist_td").getOrCreate()

# BUG FIX: the original built rows as (train_images[i], test_labels[i]),
# pairing images from one split with labels from another. Only the test
# split is loaded above, so both columns must come from it.
data = [(test_images[i].tolist(), int(test_labels[i])) for i in range(len(test_images))]

schema = StructType([
    StructField("image", ArrayType(FloatType())),
    StructField("label", LongType()),
])
df = spark.createDataFrame(data, schema)

import hsfs

connection = hsfs.connection()
fs = connection.get_feature_store()
# `td` is presumably the training-dataset name string — verify upstream.
training_dataset = fs.create_training_dataset(
    name=td,
    version=1,
    data_format='tfrecords',
    splits={'train': 0.7, 'test': 0.2, 'validate': 0.1},
    statistics_config=False,  # skip statistics computation on save
)
training_dataset.save(df)

connection.close()
spark.stop()
def get_feature_store_handle(feature_store: str = "") -> hsfs.feature_store:
    """Open an hsfs connection and return a handle to the named feature store.

    Args:
        feature_store: Feature store name; the empty default selects the
            project's default feature store.

    Returns:
        The feature store handle obtained from a fresh hsfs connection.
    """
    return hsfs.connection().get_feature_store(feature_store)
def connect_hsfs(self, engine="training"):
    """Establish and return a new hsfs connection.

    Args:
        engine: Execution engine to connect with (defaults to "training").

    Returns:
        The hsfs connection object.
    """
    hsfs_connection = hsfs.connection(engine=engine)
    return hsfs_connection