Example #1
    def __init__(self):
        """
        Maintain a long-lived Spark session and context.
        """
        self._spark = Session()
        self.spark_session = self._spark.get_session()
        self.spark_context = self._spark.get_context()
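Every snippet below leans on the project's Session wrapper, which these excerpts only call. A minimal sketch of a singleton that would satisfy get_session(), get_context(), and stop() as they are used here follows; the builder options (app name, master) are illustrative assumptions, not the project's actual configuration.

from pyspark.sql import SparkSession


class Session(object):
    """Sketch of a singleton Spark session wrapper (assumed implementation)."""

    _instance = None

    def __new__(cls):
        # Hand back the same instance on every construction, so
        # Session() == Session() holds (see the test in Example #2).
        if cls._instance is None:
            cls._instance = super(Session, cls).__new__(cls)
            cls._instance._init_spark()
        return cls._instance

    def _init_spark(self):
        # App name and master are illustrative assumptions.
        self._session = SparkSession.builder \
            .appName('eva') \
            .master('local[*]') \
            .getOrCreate()
        self._context = self._session.sparkContext

    def get_session(self):
        return self._session

    def get_context(self):
        return self._context

    def stop(self):
        self._session.stop()
        Session._instance = None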
Example #2
    def test_session(self):

        spark_session = self.session.get_session()

        session2 = Session()
        self.assertEqual(self.session, session2)
        self.assertIsInstance(spark_session, SparkSession)
Example #3
from petastorm.etl.dataset_metadata import materialize_dataset


def create_dataframe(df_metadata: DataFrameMetadata):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Create an empty RDD
    empty_rdd = spark_context.emptyRDD()
    print("url", df_metadata.file_url)
    # Use petastorm to create the dataframe
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):

        spark.createDataFrame(empty_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(df_metadata.file_url)
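A hypothetical call site for create_dataframe. DataFrameMetadata construction is not shown in these excerpts, so the constructor arguments below are assumptions; what matters is that the object exposes file_url plus a schema with both petastorm_schema and pyspark_schema attributes.

# Hypothetical usage; the DataFrameMetadata constructor is assumed.
metadata = DataFrameMetadata(
    file_url='file:///tmp/eva/my_dataset',  # illustrative path
    schema=my_schema)  # must expose .petastorm_schema and .pyspark_schema
create_dataframe(metadata)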
Example #4
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row


def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(
            rows).map(lambda x: dict_to_spark_row(
                df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
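Since dict_to_spark_row expects each row to be a dictionary keyed by the Unischema field names, append_rows would be called with plain dicts. A hedged usage sketch, reusing the metadata object sketched after Example #3; the field names and frame variables are invented for illustration:

# Hypothetical rows; keys must match the petastorm Unischema fields.
rows = [
    {'frame_id': 0, 'frame_data': frame0},
    {'frame_id': 1, 'frame_data': frame1},
]
append_rows(metadata, rows)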
Example #5
from petastorm.etl.dataset_metadata import materialize_dataset


def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()

    # Convert a list of rows to RDD
    rows_df = spark.createDataFrame(rows, df_metadata.schema.pyspark_schema)
    rows_rdd = rows_df.rdd

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
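Unlike Example #4, this variant never calls dict_to_spark_row: it builds a DataFrame directly from the rows using the pyspark schema and round-trips through .rdd, so the caller can pass pyspark-compatible rows (tuples or Row objects) rather than petastorm-style dictionaries. The write still happens inside materialize_dataset, which petastorm needs in order to store its metadata alongside the Parquet output.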
Example #6
File: frame_loader.py Project: swati21/eva
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # Define the dataset schema: a frame id plus the raw frame data
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap dataset materialization portion.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
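row_generator is called above but not shown in the excerpt. A minimal sketch consistent with the schema defined in __init__ (an int32 frame_id plus an H x W x C uint8 array); the random pixel fill is purely an assumption for illustration.

import numpy as np


def row_generator(frame_id, height, width, num_channels):
    # Hypothetical generator: one dict per frame, keyed by the
    # Unischema field names defined in __init__ above.
    return {
        'frame_id': frame_id,
        'frame_data': np.random.randint(
            0, 256, size=(height, width, num_channels), dtype=np.uint8),
    }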
Example #7
def load_dataframe(dataframe_url: str):

    spark = Session().get_session()
    dataframe = spark.read.load(dataframe_url)

    return dataframe
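spark.read.load uses Spark's default source format, which is Parquet unless spark.sql.sources.default has been changed, so this reads back the datasets written by the functions above. A short usage sketch with an illustrative path:

# Illustrative path; must point at a dataset written earlier.
df = load_dataframe('file:///tmp/eva/my_dataset')
df.show()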
Example #8
    def tearDown(self):
        self.session = Session()
        self.session.stop()
Example #9
    def setUp(self):
        suppress_py4j_logging()
        self.session = Session()
Example #10
    def setUp(self):
        self.session = Session()
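Examples #8 through #10 are the setUp/tearDown halves of the session tests shown separately. Assembled into one unittest case they might look like the sketch below; suppress_py4j_logging is the project's helper (assumed to quiet py4j's log output) and Session is the singleton wrapper sketched after Example #1.

import unittest

from pyspark.sql import SparkSession


class SessionTest(unittest.TestCase):

    def setUp(self):
        suppress_py4j_logging()  # project helper, assumed available
        self.session = Session()

    def tearDown(self):
        self.session.stop()

    def test_session(self):
        spark_session = self.session.get_session()
        # Session is a singleton, so a second construction must
        # return the same object.
        self.assertEqual(self.session, Session())
        self.assertIsInstance(spark_session, SparkSession)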