Пример #1
0
# Load the MNIST "t10k" image/label pair; `load_dataset` and `directory` are
# defined outside this snippet.
test_images, test_labels = load_dataset(directory, 't10k-images-idx3-ubyte.gz',
                                        't10k-labels-idx1-ubyte.gz')

from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("create_mnist_td").getOrCreate()
try:
    # Pair each test image with its own test label. The previous version
    # indexed `train_images` here while taking labels (and the loop bound)
    # from the test split, which stored images with unrelated labels.
    data = [(image.tolist(), int(label))
            for image, label in zip(test_images, test_labels)]
    # One row per sample: the image as an array of floats plus an integer label.
    schema = StructType([
        StructField("image", ArrayType(FloatType())),
        StructField("label", LongType())
    ])
    df = spark.createDataFrame(data, schema)

    import hsfs

    connection = hsfs.connection()
    try:
        fs = connection.get_feature_store()
        # Version 1 of the training dataset named by `td` (defined outside this
        # snippet), written as TFRecords and split 70/20/10 into
        # train/test/validate.
        # NOTE(review): statistics_config=False presumably disables statistics
        # computation for the dataset -- confirm against the hsfs version in use.
        training_dataset = fs.create_training_dataset(name=td,
                                                      version=1,
                                                      data_format='tfrecords',
                                                      splits={
                                                          'train': 0.7,
                                                          'test': 0.2,
                                                          'validate': 0.1
                                                      },
                                                      statistics_config=False)
        training_dataset.save(df)
    finally:
        # Close the feature store connection even when creating or saving fails.
        connection.close()
finally:
    # Always stop the Spark session, including on errors.
    spark.stop()
Пример #2
0
def get_feature_store_handle(feature_store: str = "") -> hsfs.feature_store.FeatureStore:
    """Open an hsfs connection and return a handle to a feature store.

    Args:
        feature_store: Name forwarded unchanged to
            ``connection.get_feature_store``. The default is an empty
            string; presumably hsfs treats an empty name as "use the
            project's default feature store" -- confirm against the
            installed hsfs version.

    Returns:
        The object returned by ``connection.get_feature_store``.
    """
    # NOTE(review): the connection is neither closed nor returned, so callers
    # cannot close it; presumably the returned handle needs it to stay open.
    connection = hsfs.connection()
    return connection.get_feature_store(feature_store)
Пример #3
0
 def connect_hsfs(self, engine="training"):
     """Open an hsfs connection for the requested engine.

     Args:
         engine: Value forwarded as the ``engine`` keyword argument of
             ``hsfs.connection``. Defaults to ``"training"``.

     Returns:
         Whatever ``hsfs.connection`` returns for that engine.
     """
     hsfs_connection = hsfs.connection(engine=engine)
     return hsfs_connection