Example #1
0
    def test_session_create_data_frame_from_pandas_data_frame(self):
        """createDataFrame on a pandas DataFrame with default (integer)
        column labels infers string column names "0", "1" and types
        LongType/StringType, preserving all rows."""
        try:
            # Pandas is an optional dependency
            # pylint: disable=import-outside-toplevel
            import pandas as pd
        except ImportError as e:
            raise ImportError("pandas is not importable") from e

        pdf = pd.DataFrame([(1, "one"), (2, "two"), (3, "three")])

        df = self.spark.createDataFrame(pdf)

        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [
            Row(**{
                "0": 1,
                "1": 'one'
            }),
            Row(**{
                "0": 2,
                "1": 'two'
            }),
            # Fixed: the key here was "2", which contradicted the other
            # rows and the two-column ("0", "1") schema asserted below.
            Row(**{
                "0": 3,
                "1": 'three'
            })
        ])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("0", LongType(), True),
                StructField("1", StringType(), True)
            ]))
Example #2
0
 def test_session_create_data_frame_from_list_with_schema(self):
     """A list of one-element tuples plus an explicit schema yields a
     single-row DataFrame whose map-typed value is preserved."""
     expected_schema = StructType([
         StructField("map", MapType(StringType(), IntegerType()), True),
     ])

     df = self.spark.createDataFrame([({'a': 1},)], schema=expected_schema)

     self.assertEqual(df.count(), 1)
     self.assertListEqual(df.collect(), [Row(map={'a': 1})])
     self.assertEqual(df.schema, expected_schema)
Example #3
0
 def test_session_create_data_frame_from_list(self):
     """Without an explicit schema, column names default to _1, _2 and
     types are inferred (LongType for ints, StringType for strings)."""
     data = [(1, "one"), (2, "two"), (3, "three")]

     df = self.spark.createDataFrame(data)

     self.assertEqual(df.count(), 3)
     self.assertListEqual(
         df.collect(),
         [Row(_1=number, _2=word) for number, word in data])
     self.assertEqual(
         df.schema,
         StructType([
             StructField("_1", LongType(), True),
             StructField("_2", StringType(), True),
         ]))
Example #4
0
        def create_counts_row(col1Item, rows):
            """Build one crosstab result row: slot 0 holds the (cleaned)
            col1 value, the remaining slots hold per-col2-value counts.

            NOTE(review): closes over `column_size`, `distinct_col2` and
            `clean_element` from the enclosing scope (not visible here) —
            presumably the number of distinct col2 values, a value->index
            mapping, and a normalization helper; confirm in context.
            """
            # One slot for the col1 label plus one per distinct col2 value.
            # Slots left as None were never assigned by parse_row below.
            counts_row = [None] * (column_size + 1)

            def parse_row(row):
                # row[1] is the col2 value, row[2] its count; index is
                # shifted by 1 because slot 0 is reserved for the col1 value.
                column_index = distinct_col2[clean_element(row[1])]
                counts_row[int(column_index + 1)] = int(row[2])

            # parse_row mutates counts_row in place for each grouped row.
            rows.foreach(parse_row)
            # the value of col1 is the first value, the rest are the counts
            counts_row[0] = clean_element(col1Item)
            return Row(counts_row)
Example #5
0
 def test_cast_to_struct(self):
     """Casting a struct of numeric strings to a struct with IntegerType
     fields converts each numeric string to an int and leaves the
     StringType field untouched."""
     string_schema = StructType(fields=[
         StructField("character", StringType()),
         StructField("day", StringType()),
         StructField("month", StringType()),
         StructField("year", StringType()),
     ])
     typed_schema = StructType(fields=[
         StructField("character", StringType()),
         StructField("day", IntegerType()),
         StructField("month", IntegerType()),
         StructField("year", IntegerType()),
     ])
     source_row = Row(character='Alice', day='28', month='8', year='2019')

     result = cast_to_struct(source_row,
                             from_type=string_schema,
                             to_type=typed_schema,
                             options=BASE_OPTIONS)

     self.assertEqual(
         result,
         Row(character='Alice', day=28, month=8, year=2019))
Example #6
0
 def test_session_range(self):
     """spark.range(n) yields rows id=0..n-1, both through collect()
     and through toLocalIterator()."""
     df = self.spark.range(3)
     expected_rows = [Row(id=i) for i in range(3)]

     self.assertEqual(df.count(), 3)
     self.assertListEqual(df.collect(), expected_rows)
     self.assertEqual(list(df.toLocalIterator()), expected_rows)
Example #7
0
def test_column_stat_helper():
    """
    Expected quantile values come from use of org.apache.spark.sql.catalyst.util.QuantileSummaries
    """
    schema = StructType([StructField("value", IntegerType())])
    helper = ColumnStatHelper(col("value"))

    # Feed the integers 1..100000 through the helper, one row at a time.
    for value in range(1, 100001):
        helper.merge(Row(value=value), schema)
    helper.finalize()

    assert helper.count == 100000
    assert helper.min == 1
    assert helper.max == 100000
    assert helper.mean == 50000.5
    # sample standard deviation
    assert helper.stddev == 28867.65779668774

    expected_quantiles = {0: 1, 0.25: 24998, 0.5: 50000, 0.75: 74993, 1: 100000}
    for quantile, expected in expected_quantiles.items():
        assert helper.get_quantile(quantile) == expected
Example #8
0
 def test_cast_row_to_string(self):
     """Casting a row containing a nested map (with a null value), a
     null long, a boolean and a double to a string produces Spark's
     bracketed text representation."""
     nested_map_type = MapType(
         StringType(),
         MapType(StringType(), LongType(), True),
         True)
     row_schema = StructType([
         StructField("a", nested_map_type, True),
         StructField("b", LongType(), True),
         StructField("c", BooleanType(), True),
         StructField("d", DoubleType(), True),
     ])
     # OrderedDict keeps "value" before "b" so the rendered map order
     # is deterministic.
     input_row = Row(
         a=collections.OrderedDict([("value", None), ("b", {"c": 7})]),
         b=None,
         c=True,
         d=5.2)

     rendered = cast_to_string(input_row, row_schema, options=BASE_OPTIONS)

     self.assertEqual(rendered, "[[value ->, b -> [c -> 7]],, true, 5.2]")