Example #1
def test_session_create_data_frame_from_list(self):
    df = self.spark.createDataFrame([(1, 'one'), (2, 'two'), (3, 'three')])
    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')],
    )
    self.assertEqual(
        df.schema,
        StructType([StructField('_1', LongType(), True), StructField('_2', StringType(), True)]),
    )
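
As the assertions show, tuples without a schema get auto-generated column names _1, _2, and so on. A minimal sketch of the alternative, written against PySpark's public API (the `spark` session here is built locally and is not part of the example above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
# explicit names replace the auto-generated _1/_2 column names
df = spark.createDataFrame([(1, 'one'), (2, 'two'), (3, 'three')], ['id', 'name'])
assert df.columns == ['id', 'name']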
Example #2

def test_cast_row_to_string(self):
    self.assertEqual(
        cast_to_string(
            Row(
                a=collections.OrderedDict([('value', None), ('b', {'c': 7})]),
                b=None,
                c=True,
                d=5.2,
            ),
            StructType([
                StructField(
                    'a',
                    MapType(
                        StringType(),
                        MapType(StringType(), LongType(), True),
                        True,
                    ),
                    True,
                ),
                StructField('b', LongType(), True),
                StructField('c', BooleanType(), True),
                StructField('d', DoubleType(), True),
            ]),
            options=BASE_OPTIONS,
        ),
        '[[value ->, b -> [c -> 7]],, true, 5.2]',
    )
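
The expected string shows Spark's cast-to-string conventions: booleans print lowercase, maps print as [key -> value], nulls print as nothing, and struct fields are comma-separated. A rough pure-Python sketch of those rules (the `render`/`render_struct` helpers are illustrative only, not pysparkling's actual implementation; dict insertion order stands in for the OrderedDict):

def render(value):
    # booleans print lowercase, as in Spark's string cast
    if isinstance(value, bool):
        return 'true' if value else 'false'
    # maps print as [k1 -> v1, k2 -> v2]; a null value leaves nothing after '->'
    if isinstance(value, dict):
        entries = ('{} ->{}'.format(k, '' if v is None else ' ' + render(v))
                   for k, v in value.items())
        return '[{}]'.format(', '.join(entries))
    return str(value)

def render_struct(values):
    # fields are joined by ','; non-null fields after the first get a leading space
    out = ''
    for i, v in enumerate(values):
        if i:
            out += ','
        if v is not None:
            out += (' ' if i else '') + render(v)
    return '[' + out + ']'

assert render_struct([{'value': None, 'b': {'c': 7}}, None, True, 5.2]) \
    == '[[value ->, b -> [c -> 7]],, true, 5.2]'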
Example #3
def test_session_create_data_frame_from_pandas_data_frame(self):
    try:
        # Pandas is an optional dependency
        # pylint: disable=import-outside-toplevel
        import pandas as pd
    except ImportError:
        raise Exception('pandas is not importable')

    pdf = pd.DataFrame([(1, 'one'), (2, 'two'), (3, 'three')])

    df = self.spark.createDataFrame(pdf)

    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [Row(**{'0': 1, '1': 'one'}), Row(**{'0': 2, '1': 'two'}), Row(**{'0': 3, '1': 'three'})],
    )
    self.assertEqual(
        df.schema,
        StructType([StructField('0', LongType(), True), StructField('1', StringType(), True)]),
    )
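
The default integer pandas column labels 0 and 1 become the string column names '0' and '1'. A short sketch of the same conversion with named pandas columns, again using a locally built PySpark session (not part of the example above):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
pdf = pd.DataFrame([(1, 'one'), (2, 'two')], columns=['id', 'name'])
# pandas column labels carry over as Spark column names
df = spark.createDataFrame(pdf)
assert df.columns == ['id', 'name']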
Example #4

def test_cast_to_struct(self):
    self.assertEqual(
        cast_to_struct(
            Row(character='Alice', day='28', month='8', year='2019'),
            from_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', StringType()),
                StructField('month', StringType()),
                StructField('year', StringType()),
            ]),
            to_type=StructType(fields=[
                StructField('character', StringType()),
                StructField('day', IntegerType()),
                StructField('month', IntegerType()),
                StructField('year', IntegerType()),
            ]),
            options=BASE_OPTIONS,
        ),
        Row(character='Alice', day=28, month=8, year=2019),
    )
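
cast_to_struct pairs up the from_type and to_type fields and casts each value individually. A simplified pure-Python sketch of that per-field walk (the `CASTERS` table and `cast_struct` helper are hypothetical stand-ins for pysparkling's real cast dispatch):

# Hypothetical mini-version of a struct cast: apply a per-type caster to each
# field value, pairing from_type and to_type fields by position.
CASTERS = {'int': int, 'string': str}

def cast_struct(values, from_types, to_types):
    return [
        value if f == t else CASTERS[t](value)
        for value, f, t in zip(values, from_types, to_types)
    ]

assert cast_struct(['Alice', '28', '8', '2019'],
                   ['string', 'string', 'string', 'string'],
                   ['string', 'int', 'int', 'int']) == ['Alice', 28, 8, 2019]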
Example #5
def create_counts_row(col1Item, rows):
    # column_size, distinct_col2 and clean_element come from the enclosing scope
    counts_row = [None] * (column_size + 1)

    def parse_row(row):
        column_index = distinct_col2[clean_element(row[1])]
        counts_row[int(column_index + 1)] = int(row[2])

    rows.foreach(parse_row)
    # the value of col1 is the first value, the rest are the counts
    counts_row[0] = clean_element(col1Item)
    return Row(counts_row)
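
This helper builds one output row of a crosstab-style count table: index 0 holds the col1 value and indices 1..column_size hold the per-col2 counts. A tiny sketch of that layout with plain Python in place of the RDD machinery (the literal values here are made up for illustration):

# distinct_col2 maps each col2 value to its 0-based column index
distinct_col2 = {'x': 0, 'y': 1}
counts_row = [None] * (len(distinct_col2) + 1)
counts_row[0] = 'a'                      # the col1 value
counts_row[distinct_col2['x'] + 1] = 3   # count for (a, x)
counts_row[distinct_col2['y'] + 1] = 1   # count for (a, y)
assert counts_row == ['a', 3, 1]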
Example #6

def test_column_stat_helper():
    """
    Expected quantile values come from use of org.apache.spark.sql.catalyst.util.QuantileSummaries
    """
    schema = StructType([StructField('value', IntegerType())])
    helper = ColumnStatHelper(col('value'))
    for i in range(1, 100001):
        helper.merge(Row(value=i), schema)
    helper.finalize()
    assert helper.count == 100000
    assert helper.min == 1
    assert helper.max == 100000
    assert helper.mean == 50000.5
    assert helper.stddev == 28867.65779668774  # sample standard deviation
    assert helper.get_quantile(0) == 1
    assert helper.get_quantile(0.25) == 24998
    assert helper.get_quantile(0.5) == 50000
    assert helper.get_quantile(0.75) == 74993
    assert helper.get_quantile(1) == 100000
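
The asserted standard deviation can be checked independently: the sample standard deviation is sqrt(sum((x - mean)^2) / (n - 1)), which Python's statistics module computes directly:

import statistics

# independent check of the asserted sample standard deviation of 1..100000
assert abs(statistics.stdev(range(1, 100001)) - 28867.65779668774) < 1e-6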
Example #7
def test_session_range(self):
    df = self.spark.range(3)
    self.assertEqual(df.count(), 3)
    self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
    self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)])
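
range also accepts start, end and step arguments, mirroring Python's built-in range. A minimal sketch, again against PySpark's public API with a locally built session:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
# start=1, end=7 (exclusive), step=2
df = spark.range(1, 7, 2)
assert [r.id for r in df.collect()] == [1, 3, 5]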
Example #8
def test_session_create_data_frame_from_list_with_schema(self):
    schema = StructType([StructField('map', MapType(StringType(), IntegerType()), True)])
    df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
    self.assertEqual(df.count(), 1)
    self.assertListEqual(df.collect(), [Row(map={'a': 1})])
    self.assertEqual(df.schema, schema)
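
PySpark's createDataFrame also accepts the schema as a DDL-formatted string; whether pysparkling mirrors this is not shown by the example above, so treat this as a PySpark-side sketch:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
# DDL-string equivalent of the StructType schema above
df = spark.createDataFrame([({'a': 1},)], schema='map map<string,int>')
assert df.collect() == [Row(map={'a': 1})]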