def __init__(self): """ Maintain a long live spark session and context. """ self._spark = Session() self.spark_session = self._spark.get_session() self.spark_context = self._spark.get_context()
def test_session(self):
    spark_session = self.session.get_session()
    # Session is a singleton, so constructing it again returns the same object
    session2 = Session()
    self.assertEqual(self.session, session2)
    self.assertIsInstance(spark_session, SparkSession)

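# The snippets in this section assume Session behaves as a process-wide
# singleton that lazily builds a SparkSession/SparkContext pair. The sketch
# below is an assumption for illustration (app name, master and the __new__
# pattern are not taken from the source), not the project's implementation.

from pyspark.sql import SparkSession


class Session(object):
    _instance = None

    def __new__(cls):
        # Return the one shared instance, creating it on first use
        if cls._instance is None:
            cls._instance = super(Session, cls).__new__(cls)
            cls._instance._session = SparkSession.builder \
                .appName('eva') \
                .master('local[*]') \
                .getOrCreate()
        return cls._instance

    def get_session(self):
        return self._session

    def get_context(self):
        return self._session.sparkContext

    def stop(self):
        self._session.stop()
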
def create_dataframe(df_metadata: DataFrameMetadata):
    spark = Session().get_session()
    spark_context = Session().get_context()

    # Create an empty RDD
    empty_rdd = spark_context.emptyRDD()
    print("url", df_metadata.file_url)

    # Use petastorm to create dataframe
    with materialize_dataset(spark,
                             df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        spark.createDataFrame(empty_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(df_metadata.file_url)

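# create_dataframe only touches three attributes of its argument: file_url,
# schema.petastorm_schema and schema.pyspark_schema. A minimal, purely
# illustrative stand-in built from plain petastorm/pyspark types is sketched
# below; FakeSchema, FakeMetadata and the file_url are hypothetical names,
# not the project's real DataFrameMetadata.

from collections import namedtuple

import numpy as np
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql.types import IntegerType

FakeSchema = namedtuple('FakeSchema', ['petastorm_schema', 'pyspark_schema'])
FakeMetadata = namedtuple('FakeMetadata', ['file_url', 'schema'])

id_schema = Unischema('IdSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
])
metadata = FakeMetadata(
    file_url='file:///tmp/eva/id_dataset',
    schema=FakeSchema(petastorm_schema=id_schema,
                      pyspark_schema=id_schema.as_spark_schema()))

create_dataframe(metadata)  # writes an empty parquet dataset at file_url
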
def append_rows(df_metadata: DataFrameMetadata, rows):
    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append rows
    with materialize_dataset(spark,
                             df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(rows) \
            .map(lambda x: dict_to_spark_row(
                df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)

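# This petastorm variant of append_rows expects each row as a dict keyed by
# the Unischema field names, since dict_to_spark_row converts dicts into
# Spark rows. Continuing with the illustrative metadata sketched above:

append_rows(metadata, [{'id': 1}, {'id': 2}, {'id': 3}])
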
def append_rows(df_metadata: DataFrameMetadata, rows):
    spark = Session().get_session()
    # Touch the singleton context so it is initialized (return value unused)
    Session().get_context()

    # Convert a list of rows to RDD
    rows_df = spark.createDataFrame(rows,
                                    df_metadata.schema.pyspark_schema)
    rows_rdd = rows_df.rdd

    # Use petastorm to append rows
    with materialize_dataset(spark,
                             df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)

def __init__(self, dataset_name: str, frame_metadata: FrameInfo):
    self.dataset_name = dataset_name
    self.H = frame_metadata.height
    self.W = frame_metadata.width
    self.C = frame_metadata.num_channels

    # The schema defines what the dataset looks like
    self.dataset_schema = Unischema(self.dataset_name, [
        UnischemaField('frame_id', np.int32, (),
                       ScalarCodec(IntegerType()), False),
        UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                       CompressedNdarrayCodec(), False),
    ])

    # Construct output location
    eva_dir = ConfigurationManager().get_value("core", "location")
    output_url = os.path.join(eva_dir, self.dataset_name)

    # Get session handle
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    # Wrap dataset materialization portion.
    rows_count = 10
    with materialize_dataset(spark, output_url, self.dataset_schema):
        rows_rdd = spark_context.parallelize(range(rows_count)) \
            .map(lambda x: row_generator(x, self.H, self.W, self.C)) \
            .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

        spark.createDataFrame(rows_rdd,
                              self.dataset_schema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)

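# The loader above calls a row_generator helper that is not shown in this
# section. A minimal sketch consistent with how it is invoked (returning a
# dict keyed by the Unischema field names) could look like the following;
# the random frame contents are purely illustrative, not the real loader.

import numpy as np


def row_generator(frame_id, height, width, num_channels):
    # Build one dataset row: a frame id plus a synthetic uint8 frame
    return {
        'frame_id': frame_id,
        'frame_data': np.random.randint(
            0, 255, size=(height, width, num_channels), dtype=np.uint8),
    }
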
def load_dataframe(dataframe_url: str):
    spark = Session().get_session()
    dataframe = spark.read.load(dataframe_url)
    return dataframe

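# spark.read.load uses Spark's default data source (parquet, unless
# spark.sql.sources.default is overridden), so it reads back the datasets
# written by create_dataframe/append_rows above. Illustrative usage, reusing
# the hypothetical URL from the earlier sketch:

df = load_dataframe('file:///tmp/eva/id_dataset')
df.show()
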
def tearDown(self):
    self.session = Session()
    self.session.stop()

def setUp(self):
    suppress_py4j_logging()
    self.session = Session()

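# suppress_py4j_logging is assumed to be a small test helper that raises the
# log level of the chatty py4j logger while Spark starts up; a sketch under
# that assumption:

import logging


def suppress_py4j_logging():
    # Hide py4j DEBUG/INFO messages emitted during SparkSession creation
    logging.getLogger('py4j').setLevel(logging.WARNING)
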
def setUp(self):
    self.session = Session()