def merge(self, row, schema):
    row_value = self.column.eval(row, schema)
    if self.ref_value is None:
        # Keep the first evaluated row as the reference value, tagging it
        # with the schema's field names so it behaves like a named Row.
        ref_value = Row(*row_value)
        ref_value.__fields__ = schema.names
        self.ref_value = ref_value
    # Accumulate the preformatted value for the writer.
    self.items.append(self.writer.preformat(row_value, schema))

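# The Row(*values) / __fields__ pattern above mirrors pyspark.sql.Row:
# a row is built positionally, then field names are attached so that
# attribute access works. A minimal self-contained sketch of that
# behaviour (illustration only, not part of the writer):
def _row_fields_sketch():
    row = Row(2, 'Alice')             # positional values, no names yet
    row.__fields__ = ['age', 'name']  # attach the schema's field names
    assert row.age == 2 and row.name == 'Alice'
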
def test_write_nested_rows_to_json(self):
    df = spark.createDataFrame([
        Row(
            age=2,
            name='Alice',
            animals=[
                Row(name='Chessur', type='cat'),
                Row(name='The White Rabbit', type='Rabbit'),
            ],
        ),
        Row(age=5, name='Bob', animals=[]),
    ])
    df.write.json('.tmp/wonderland/')
    self.assertDictEqual(
        get_folder_content('.tmp/wonderland'),
        {
            '_SUCCESS': [],
            'part-00000-2819354714706678872.json': [
                '{"age":2,"animals":['
                '{"name":"Chessur","type":"cat"},'
                '{"name":"The White Rabbit","type":"Rabbit"}'
                '],"name":"Alice"}\n',
                '{"age":5,"animals":[],"name":"Bob"}\n',
            ],
        },
    )

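# The assertions in these tests use a get_folder_content helper that is
# defined elsewhere in the test suite. A minimal sketch of the assumed
# behaviour, mapping each file name in the folder to the list of its
# lines (hypothetical implementation, not the suite's actual helper):
def _get_folder_content_sketch(folder_path):
    content = {}
    for file_name in sorted(os.listdir(folder_path)):
        with open(os.path.join(folder_path, file_name)) as f:
            content[file_name] = f.readlines()  # '_SUCCESS' maps to []
    return content
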
def test_write_to_csv_fail_when_overwrite(self):
    df = spark.createDataFrame(
        [Row(age=2, name='Alice'), Row(age=5, name='Bob')])
    df.write.csv('.tmp/wonderland/')
    with self.assertRaises(AnalysisException) as ctx:
        df.write.csv('.tmp/wonderland/')
    self.assertEqual(ctx.exception.args[0],
                     'path .tmp/wonderland already exists.;')
    self.assertDictEqual(
        get_folder_content('.tmp/wonderland'),
        {
            '_SUCCESS': [],
            'part-00000-3434325560268771971.csv': ['2,Alice\n', '5,Bob\n'],
        },
    )

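# To replace existing output instead of failing like the test above,
# callers pass a save mode; a usage sketch (standard DataFrameWriter
# option, shown for illustration only, not exercised by this test):
#
#     df.write.csv('.tmp/wonderland/', mode='overwrite')
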
def test_write_to_csv_with_custom_options(self):
    df = spark.createDataFrame([
        Row(age=2, name='Alice', occupation=None),
        Row(age=5, name='Bob', occupation=''),
    ])
    df.write.csv('.tmp/wonderland/', sep='^', emptyValue='',
                 nullValue='null', header=True)
    self.assertDictEqual(
        get_folder_content('.tmp/wonderland'),
        {
            '_SUCCESS': [],
            'part-00000-4061950540148431296.csv': [
                'age^name^occupation\n',
                '2^Alice^null\n',
                '5^Bob^\n',
            ],
        },
    )

def test_write_to_csv(self):
    df = spark.createDataFrame([
        Row(
            age=2,
            name='Alice',
            time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()),
        ),
        Row(
            age=5,
            name='Bob',
            time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()),
        ),
    ])
    df.write.csv('.tmp/wonderland/')
    # The expected offsets assume the local timezone resolves to
    # UTC+01:00 at these dates.
    self.assertDictEqual(
        get_folder_content('.tmp/wonderland'),
        {
            '_SUCCESS': [],
            'part-00000-8447389540241120843.csv': [
                '2,Alice,2017-01-01T00:00:00.000+01:00\n',
                '5,Bob,2014-03-02T00:00:00.000+01:00\n',
            ],
        },
    )

def test_write_to_json(self):
    df = spark.createDataFrame([
        Row(
            age=2,
            name='Alice',
            time=datetime.datetime(2017, 1, 1, tzinfo=tzlocal()),
        ),
        Row(
            age=5,
            name='Bob',
            time=datetime.datetime(2014, 3, 2, tzinfo=tzlocal()),
        ),
    ])
    df.write.json('.tmp/wonderland/')
    # As in test_write_to_csv, the expected offsets assume the local
    # timezone resolves to UTC+01:00 at these dates.
    self.assertDictEqual(
        get_folder_content('.tmp/wonderland'),
        {
            '_SUCCESS': [],
            'part-00000-8447389540241120843.json': [
                '{"age":2,"name":"Alice","time":"2017-01-01T00:00:00.000+01:00"}\n',
                '{"age":5,"name":"Bob","time":"2014-03-02T00:00:00.000+01:00"}\n',
            ],
        },
    )

def test_csv_read_with_inferred_schema(self):
    df = spark.read.option('inferSchema', True).csv(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     'data/fundings/'),
        header=True,
    )
    self.assertEqual(df.count(), 4)
    self.assertEqual(
        df.schema,
        StructType([
            StructField('permalink', StringType()),
            StructField('company', StringType()),
            StructField('numEmps', IntegerType()),
            StructField('category', StringType()),
            StructField('city', StringType()),
            StructField('state', StringType()),
            StructField('fundedDate', TimestampType()),
            StructField('raisedAmt', IntegerType()),
            StructField('raisedCurrency', StringType()),
            StructField('round', StringType()),
        ]),
    )
    self.assertEqual(
        [Row(**r.asDict()) for r in df.collect()],
        [
            Row(
                permalink='mycityfaces',
                company='MyCityFaces',
                numEmps=7,
                category='web',
                city='Scottsdale',
                state='AZ',
                fundedDate=datetime.datetime(2008, 1, 1, 0, 0),
                raisedAmt=50000,
                raisedCurrency='USD',
                round='seed',
            ),
            Row(
                permalink='flypaper',
                company='Flypaper',
                numEmps=None,
                category='web',
                city='Phoenix',
                state='AZ',
                fundedDate=datetime.datetime(2008, 2, 1, 0, 0),
                raisedAmt=3000000,
                raisedCurrency='USD',
                round='a',
            ),
            Row(
                permalink='chosenlist-com',
                company='ChosenList.com',
                numEmps=5,
                category='web',
                city='Scottsdale',
                state='AZ',
                fundedDate=datetime.datetime(2008, 1, 25, 0, 0),
                raisedAmt=233750,
                raisedCurrency='USD',
                round='angel',
            ),
            Row(
                permalink='digg',
                company='Digg',
                numEmps=60,
                category='web',
                city='San Francisco',
                state='CA',
                fundedDate=datetime.datetime(2006, 12, 1, 0, 0),
                raisedAmt=8500000,
                raisedCurrency='USD',
                round='b',
            ),
        ],
    )

def test_csv_read_without_schema(self):
    df = spark.read.csv(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     'data/fundings/'),
        header=True,
    )
    self.assertEqual(df.count(), 4)
    self.assertEqual(
        df.schema,
        StructType([
            StructField('permalink', StringType()),
            StructField('company', StringType()),
            StructField('numEmps', StringType()),
            StructField('category', StringType()),
            StructField('city', StringType()),
            StructField('state', StringType()),
            StructField('fundedDate', StringType()),
            StructField('raisedAmt', StringType()),
            StructField('raisedCurrency', StringType()),
            StructField('round', StringType()),
        ]),
    )
    self.assertListEqual(
        [Row(**r.asDict()) for r in df.collect()],
        [
            Row(
                permalink='mycityfaces',
                company='MyCityFaces',
                numEmps='7',
                category='web',
                city='Scottsdale',
                state='AZ',
                fundedDate='2008-01-01',
                raisedAmt='50000',
                raisedCurrency='USD',
                round='seed',
            ),
            Row(
                permalink='flypaper',
                company='Flypaper',
                numEmps=None,
                category='web',
                city='Phoenix',
                state='AZ',
                fundedDate='2008-02-01',
                raisedAmt='3000000',
                raisedCurrency='USD',
                round='a',
            ),
            Row(
                permalink='chosenlist-com',
                company='ChosenList.com',
                numEmps='5',
                category='web',
                city='Scottsdale',
                state='AZ',
                fundedDate='2008-01-25',
                raisedAmt='233750',
                raisedCurrency='USD',
                round='angel',
            ),
            Row(
                permalink='digg',
                company='Digg',
                numEmps='60',
                category='web',
                city='San Francisco',
                state='CA',
                fundedDate='2006-12-01',
                raisedAmt='8500000',
                raisedCurrency='USD',
                round='b',
            ),
        ],
    )
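
# For completeness: besides the no-schema and inferSchema paths tested
# above, a schema can also be passed explicitly to the reader. A usage
# sketch reusing the schema from test_csv_read_with_inferred_schema
# (illustrative only, not a test in this suite):
def _read_with_explicit_schema_sketch():
    schema = StructType([
        StructField('permalink', StringType()),
        StructField('company', StringType()),
        StructField('numEmps', IntegerType()),
        StructField('category', StringType()),
        StructField('city', StringType()),
        StructField('state', StringType()),
        StructField('fundedDate', TimestampType()),
        StructField('raisedAmt', IntegerType()),
        StructField('raisedCurrency', StringType()),
        StructField('round', StringType()),
    ])
    return spark.read.csv(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     'data/fundings/'),
        header=True,
        schema=schema,
    )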