示例#1
0
    def _parse_from_list(self, input_rows, schema=None):
        """Convert an iterable of plain Python records into ``Row`` objects.

        A dict record is expanded directly into ``Row(**record)``. A list or
        tuple record is matched positionally against the field names taken
        from ``schema.fields``; values beyond the number of schema fields are
        ignored.

        :param input_rows: iterable of dict, list or tuple records.
        :param schema: optional schema; must be truthy and expose ``fields``
            (each with a ``name`` attribute) when any record is a list or
            tuple. It is not consulted for dict records.
        :return: a ``(rows, schema)`` tuple, where ``rows`` is the list of
            ``Row`` objects and ``schema`` is the argument passed through
            unchanged.
        :raises Exception: if a list/tuple record is given without a schema,
            or a record is not a dict, list or tuple.
        :raises IndexError: if a list/tuple record has fewer values than the
            schema has fields.
        """
        # TODO: parse/infer the schema and validate every record (field
        # count, field names, value types) against it.
        rows = []
        # Resolved on the first list/tuple record only, so dict-only input
        # never touches ``schema.fields`` and the names are built just once.
        field_names = None

        for record in input_rows:
            if isinstance(record, dict):
                rows.append(Row(**record))
            elif isinstance(record, (list, tuple)):
                if not schema:
                    raise Exception("Schema required when creating DataFrame from list of list/tuple")
                if field_names is None:
                    field_names = [field.name for field in schema.fields]
                # TODO this won't deal with nested Rows
                values = {name: record[idx] for idx, name in enumerate(field_names)}
                rows.append(Row(**values))
            else:
                raise Exception("input rows must be of type dict, list or tuple")

        return rows, schema
示例#2
0
    def test_creation_from_rdd_of_rows(self):
        """A DataFrame built from a parallelized list of two Rows counts 2."""
        source_rows = [Row(a='a', b=123), Row(a='aa', b=456)]
        rows_rdd = self.sc.parallelize(source_rows)

        frame = DataFrame(rows_rdd)

        row_count = frame.count()
        self.assertEqual(row_count, 2)
示例#3
0
    def test_row_attributes(self):
        """Values passed to Row as keyword arguments are readable as attributes."""
        row = Row(a='a', b=123)

        first_value = row.a
        self.assertEqual(first_value, 'a')
        second_value = row.b
        self.assertEqual(second_value, 123)
示例#4
0
    def test_nested_row_asdict(self):
        """asDict() without arguments leaves a nested Row value as a Row."""
        outer = Row(a=Row(nested_a='a'), b=123)

        as_dict = outer.asDict()
        expected = {'a': Row(nested_a='a'), 'b': 123}
        self.assertEqual(as_dict, expected)
示例#5
0
    def test_nested_row_asdict_recursive(self):
        """asDict(True) converts a nested Row value into a plain dict as well."""
        outer = Row(a=Row(nested_a='a'), b=123)

        as_dict = outer.asDict(True)
        expected = {'a': {'nested_a': 'a'}, 'b': 123}
        self.assertEqual(as_dict, expected)
示例#6
0
    def test_small_row_asdict_recursive(self):
        """asDict(True) on a flat Row yields a dict of its fields."""
        flat_row = Row(a='a', b=123)

        as_dict = flat_row.asDict(True)
        expected = {'a': 'a', 'b': 123}
        self.assertEqual(as_dict, expected)
示例#7
0
    def test_small_row_asdict(self):
        """asDict() on a flat Row yields a dict of its fields."""
        flat_row = Row(a='a', b=123)

        as_dict = flat_row.asDict()
        expected = {'a': 'a', 'b': 123}
        self.assertEqual(as_dict, expected)
示例#8
0
    def test_access_non_existent_key_raises_value_error(self):
        """Subscripting an empty Row with an unknown key raises ValueError."""
        empty_row = Row()

        with self.assertRaises(ValueError) as caught:
            _ = empty_row['foo']
        # assertRaises also accepts subclasses; this pins the exact type.
        raised_type = type(caught.exception)
        self.assertEqual(ValueError, raised_type)
示例#9
0
    def test_access_non_existent_attr_raises_attr_error(self):
        """Reading an unknown attribute on an empty Row raises AttributeError."""
        empty_row = Row()

        with self.assertRaises(AttributeError) as caught:
            _ = empty_row.foo
        # assertRaises also accepts subclasses; this pins the exact type.
        raised_type = type(caught.exception)
        self.assertEqual(AttributeError, raised_type)
示例#10
0
    def test_row_dict_style_access(self):
        """Values passed to Row as keyword arguments are readable by subscript."""
        row = Row(a='a', b=123)

        first_value = row['a']
        self.assertEqual(first_value, 'a')
        second_value = row['b']
        self.assertEqual(second_value, 123)