def _parse_from_list(self, input_rows, schema=None):
    """Convert plain Python records into a list of ``Row`` objects.

    A ``dict`` record is expanded directly into ``Row`` keyword arguments.
    A ``list``/``tuple`` record is matched positionally against the names of
    ``schema.fields``; values beyond the last schema field are ignored.

    :param input_rows: iterable of dict, list or tuple records.
    :param schema: object exposing ``fields`` (each with a ``name``).
        Required as soon as a list/tuple record is encountered; never
        touched when every record is a dict.
    :return: tuple ``(rows, schema)`` -- the rows that were built and the
        ``schema`` argument, passed through unchanged.
    :raises Exception: if a list/tuple record is given without a schema, or
        if a record is not a dict, list or tuple.
    :raises IndexError: if a list/tuple record has fewer values than the
        schema has fields.
    """
    # TODO: validate record lengths and value types against the schema.
    # TODO: list/tuple records do not deal with nested Rows.

    # Resolved on first use and then reused for every later list/tuple
    # record, so dict-only input never reads ``schema.fields``.
    field_names = None
    rows = []
    for record in input_rows:
        if isinstance(record, dict):
            rows.append(Row(**record))
        elif isinstance(record, (list, tuple)):
            if not schema:
                raise Exception("Schema required when creating DataFrame from list of list/tuple")
            if field_names is None:
                field_names = [field.name for field in schema.fields]
            values = {name: record[idx] for idx, name in enumerate(field_names)}
            rows.append(Row(**values))
        else:
            raise Exception("input rows must be of type dict, list or tuple")
    return rows, schema
def test_creation_from_rdd_of_rows(self):
    # A DataFrame built straight from an RDD holding two Rows must report
    # a count of two.
    source_rows = [Row(a='a', b=123), Row(a='aa', b=456)]
    rows_rdd = self.sc.parallelize(source_rows)
    frame = DataFrame(rows_rdd)
    self.assertEqual(frame.count(), 2)
def test_row_attributes(self):
    # Fields passed as keyword arguments are readable as attributes.
    row = Row(a='a', b=123)
    self.assertEqual(row.a, 'a')
    self.assertEqual(row.b, 123)
def test_nested_row_asdict(self):
    # Without the recursive flag, asDict() keeps the nested value as a Row.
    outer = Row(a=Row(nested_a='a'), b=123)
    actual = outer.asDict()
    expected = {'a': Row(nested_a='a'), 'b': 123}
    self.assertEqual(actual, expected)
def test_nested_row_asdict_recursive(self):
    # asDict(True) converts the nested Row into a plain dict as well.
    outer = Row(a=Row(nested_a='a'), b=123)
    actual = outer.asDict(True)
    expected = {'a': {'nested_a': 'a'}, 'b': 123}
    self.assertEqual(actual, expected)
def test_small_row_asdict_recursive(self):
    # For a flat Row, asDict(True) yields the plain field mapping.
    flat = Row(a='a', b=123)
    actual = flat.asDict(True)
    self.assertEqual(actual, {'a': 'a', 'b': 123})
def test_small_row_asdict(self):
    # For a flat Row, asDict() yields the plain field mapping.
    flat = Row(a='a', b=123)
    actual = flat.asDict()
    self.assertEqual(actual, {'a': 'a', 'b': 123})
def test_access_non_existent_key_raises_value_error(self):
    # Item lookup of a field that an empty Row does not have must raise,
    # and the raised exception's type must be exactly ValueError.
    empty_row = Row()
    with self.assertRaises(ValueError) as raised:
        _ = empty_row['foo']
    # The captured exception is only available once the block has exited.
    self.assertEqual(ValueError, type(raised.exception))
def test_access_non_existent_attr_raises_attr_error(self):
    # Attribute lookup of a field that an empty Row does not have must raise,
    # and the raised exception's type must be exactly AttributeError.
    empty_row = Row()
    with self.assertRaises(AttributeError) as raised:
        _ = empty_row.foo
    # The captured exception is only available once the block has exited.
    self.assertEqual(AttributeError, type(raised.exception))
def test_row_dict_style_access(self):
    # Fields passed as keyword arguments are readable with item syntax.
    row = Row(a='a', b=123)
    self.assertEqual(row['a'], 'a')
    self.assertEqual(row['b'], 123)