예제 #1
0
 def merge(self, row, schema):
     """Evaluate this aggregation's column on ``row`` and record the result.

     The first time a value is seen (``self.ref_value`` is still ``None``),
     it is wrapped in a ``Row`` whose ``__fields__`` are set to
     ``schema.names`` and stored as ``self.ref_value``. On every call the
     evaluated value is passed through ``self.writer.preformat`` and the
     result is appended to ``self.items``.

     :param row: input row handed to ``self.column.eval``
     :param schema: schema handed to ``self.column.eval`` and the writer
     """
     evaluated = self.column.eval(row, schema)

     if self.ref_value is None:
         # Keep a reference copy of the first evaluated value, labelled
         # with the schema's field names.
         template = Row(*evaluated)
         template.__fields__ = schema.names
         self.ref_value = template

     formatted = self.writer.preformat(evaluated, schema)
     self.items.append(formatted)
예제 #2
0
 def test_write_nested_rows_to_json(self):
     """Rows holding a list of nested rows are written as nested JSON objects."""
     pets = [
         Row(name='Chessur', type='cat'),
         Row(name='The White Rabbit', type='Rabbit'),
     ]
     alice = Row(age=2, name='Alice', animals=pets)
     bob = Row(age=5, name='Bob', animals=[])
     df = spark.createDataFrame([alice, bob])

     df.write.json('.tmp/wonderland/')
     written = get_folder_content('.tmp/wonderland')

     # One JSON document per input row, each terminated by a newline.
     alice_line = (
         '{"age":2,"animals":['
         '{"name":"Chessur","type":"cat"},'
         '{"name":"The White Rabbit","type":"Rabbit"}'
         '],"name":"Alice"}\n'
     )
     bob_line = '{"age":5,"animals":[],"name":"Bob"}\n'
     expected = {
         '_SUCCESS': [],
         'part-00000-2819354714706678872.json': [alice_line, bob_line],
     }
     self.assertDictEqual(written, expected)
예제 #3
0
 def test_write_to_csv_fail_when_overwrite(self):
     """A second CSV write to an existing path raises ``AnalysisException``.

     After the failed write the folder is expected to still contain the
     output of the first write.
     """
     target = '.tmp/wonderland/'
     people = [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
     df = spark.createDataFrame(people)
     df.write.csv(target)

     with self.assertRaises(AnalysisException) as ctx:
         df.write.csv(target)
     message = ctx.exception.args[0]
     self.assertEqual(message, 'path .tmp/wonderland already exists.;')

     written = get_folder_content('.tmp/wonderland')
     expected = {
         '_SUCCESS': [],
         'part-00000-3434325560268771971.csv': ['2,Alice\n', '5,Bob\n'],
     }
     self.assertDictEqual(written, expected)
예제 #4
0
 def test_write_to_csv_with_custom_options(self):
     """CSV output honours ``sep``, ``emptyValue``, ``nullValue`` and ``header``."""
     alice = Row(age=2, name='Alice', occupation=None)
     bob = Row(age=5, name='Bob', occupation='')
     df = spark.createDataFrame([alice, bob])

     writer_options = {
         'sep': '^',
         'emptyValue': '',
         'nullValue': 'null',
         'header': True,
     }
     df.write.csv('.tmp/wonderland/', **writer_options)

     written = get_folder_content('.tmp/wonderland')
     # The None occupation is rendered as 'null', the empty string as ''.
     expected_lines = [
         'age^name^occupation\n',
         '2^Alice^null\n',
         '5^Bob^\n',
     ]
     expected = {
         '_SUCCESS': [],
         'part-00000-4061950540148431296.csv': expected_lines,
     }
     self.assertDictEqual(written, expected)
예제 #5
0
 def test_write_to_csv(self):
     """Rows with int, string and timestamp columns are written as CSV lines."""
     # NOTE(review): the inputs use the machine's local zone (tzlocal()) while
     # the expected strings hard-code "+01:00" - presumably the suite pins the
     # time zone elsewhere; confirm, otherwise this test is zone-dependent.
     alice_time = datetime.datetime(2017, 1, 1, tzinfo=tzlocal())
     bob_time = datetime.datetime(2014, 3, 2, tzinfo=tzlocal())
     people = [
         Row(age=2, name='Alice', time=alice_time),
         Row(age=5, name='Bob', time=bob_time),
     ]
     df = spark.createDataFrame(people)

     df.write.csv('.tmp/wonderland/')
     written = get_folder_content('.tmp/wonderland')

     expected_lines = [
         '2,Alice,2017-01-01T00:00:00.000+01:00\n',
         '5,Bob,2014-03-02T00:00:00.000+01:00\n',
     ]
     expected = {
         '_SUCCESS': [],
         'part-00000-8447389540241120843.csv': expected_lines,
     }
     self.assertDictEqual(written, expected)
예제 #6
0
 def test_write_to_json(self):
     """Rows with int, string and timestamp columns are written as JSON lines."""
     # NOTE(review): the inputs use the machine's local zone (tzlocal()) while
     # the expected strings hard-code "+01:00" - presumably the suite pins the
     # time zone elsewhere; confirm, otherwise this test is zone-dependent.
     alice_time = datetime.datetime(2017, 1, 1, tzinfo=tzlocal())
     bob_time = datetime.datetime(2014, 3, 2, tzinfo=tzlocal())
     people = [
         Row(age=2, name='Alice', time=alice_time),
         Row(age=5, name='Bob', time=bob_time),
     ]
     df = spark.createDataFrame(people)

     df.write.json('.tmp/wonderland/')
     written = get_folder_content('.tmp/wonderland')

     expected_lines = [
         '{"age":2,"name":"Alice","time":"2017-01-01T00:00:00.000+01:00"}\n',
         '{"age":5,"name":"Bob","time":"2014-03-02T00:00:00.000+01:00"}\n',
     ]
     expected = {
         '_SUCCESS': [],
         'part-00000-8447389540241120843.json': expected_lines,
     }
     self.assertDictEqual(written, expected)
예제 #7
0
 def test_csv_read_with_inferred_schema(self):
     """Reading the fundings CSV with ``inferSchema`` yields typed columns.

     Checks the row count, the inferred schema and the collected rows.
     """
     here = os.path.dirname(os.path.realpath(__file__))
     source = os.path.join(here, 'data/fundings/')
     df = spark.read.option('inferSchema', True).csv(source, header=True)

     self.assertEqual(df.count(), 4)

     # Column name -> expected inferred type, in file order.
     typed_columns = [
         ('permalink', StringType),
         ('company', StringType),
         ('numEmps', IntegerType),
         ('category', StringType),
         ('city', StringType),
         ('state', StringType),
         ('fundedDate', TimestampType),
         ('raisedAmt', IntegerType),
         ('raisedCurrency', StringType),
         ('round', StringType),
     ]
     expected_schema = StructType(
         [StructField(name, dtype()) for name, dtype in typed_columns]
     )
     self.assertEqual(df.schema, expected_schema)

     names = [name for name, _ in typed_columns]
     # Expected values, one tuple per row, positions matching ``names``.
     raw_rows = [
         (
             'mycityfaces', 'MyCityFaces', 7, 'web', 'Scottsdale', 'AZ',
             datetime.datetime(2008, 1, 1, 0, 0), 50000, 'USD', 'seed',
         ),
         (
             'flypaper', 'Flypaper', None, 'web', 'Phoenix', 'AZ',
             datetime.datetime(2008, 2, 1, 0, 0), 3000000, 'USD', 'a',
         ),
         (
             'chosenlist-com', 'ChosenList.com', 5, 'web', 'Scottsdale', 'AZ',
             datetime.datetime(2008, 1, 25, 0, 0), 233750, 'USD', 'angel',
         ),
         (
             'digg', 'Digg', 60, 'web', 'San Francisco', 'CA',
             datetime.datetime(2006, 12, 1, 0, 0), 8500000, 'USD', 'b',
         ),
     ]
     actual_rows = [Row(**r.asDict()) for r in df.collect()]
     expected_rows = [Row(**dict(zip(names, values))) for values in raw_rows]
     self.assertEqual(actual_rows, expected_rows)
예제 #8
0
 def test_csv_read_without_schema(self):
     """Reading the fundings CSV without a schema yields string columns only.

     Checks the row count, the all-string schema and the collected rows.
     """
     here = os.path.dirname(os.path.realpath(__file__))
     source = os.path.join(here, 'data/fundings/')
     df = spark.read.csv(source, header=True)

     self.assertEqual(df.count(), 4)

     names = [
         'permalink',
         'company',
         'numEmps',
         'category',
         'city',
         'state',
         'fundedDate',
         'raisedAmt',
         'raisedCurrency',
         'round',
     ]
     expected_schema = StructType(
         [StructField(name, StringType()) for name in names]
     )
     self.assertEqual(df.schema, expected_schema)

     # Expected values, one tuple per row, positions matching ``names``.
     raw_rows = [
         (
             'mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ',
             '2008-01-01', '50000', 'USD', 'seed',
         ),
         (
             'flypaper', 'Flypaper', None, 'web', 'Phoenix', 'AZ',
             '2008-02-01', '3000000', 'USD', 'a',
         ),
         (
             'chosenlist-com', 'ChosenList.com', '5', 'web', 'Scottsdale', 'AZ',
             '2008-01-25', '233750', 'USD', 'angel',
         ),
         (
             'digg', 'Digg', '60', 'web', 'San Francisco', 'CA',
             '2006-12-01', '8500000', 'USD', 'b',
         ),
     ]
     actual_rows = [Row(**r.asDict()) for r in df.collect()]
     expected_rows = [Row(**dict(zip(names, values))) for values in raw_rows]
     self.assertListEqual(actual_rows, expected_rows)