Пример #1
0
from __future__ import unicode_literals
from ddf import DDFManager, DDF_HOME

# Demo script: rebuild the `mtcars` table through the Spark engine, wrap it in
# a DDF and print a handful of descriptive statistics.
dm = DDFManager('spark')

try:
    dm.sql('set hive.metastore.warehouse.dir=/tmp/hive/warehouse', False)

    # Start from a clean slate so the script can be re-run safely.
    dm.sql('drop table if exists mtcars', False)
    dm.sql(
        "CREATE TABLE mtcars (mpg double, cyl int, disp double, hp int, drat double, wt double,"
        " qesc double, vs int, am int, gear int, carb string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '",
        False)

    # The sample file lives under DDF_HOME and is space-delimited, matching the
    # table definition above.
    dm.sql(
        "LOAD DATA LOCAL INPATH '" + DDF_HOME +
        "/resources/test/mtcars' INTO TABLE mtcars", False)

    ddf = dm.sql2ddf('select * from mtcars', False)

    print('Columns: ' + ', '.join(ddf.colnames))

    print('Number of columns: {}'.format(ddf.cols))
    print('Number of rows: {}'.format(ddf.rows))

    print(ddf.summary())

    print(ddf.head(2))

    # sum(mpg) and min(hp), grouped by the `vs` and `am` columns.
    print(ddf.aggregate(['sum(mpg)', 'min(hp)'], ['vs', 'am']))

    print(ddf.five_nums())

    print(ddf.sample(3))
finally:
    # Always shut the manager down, even when one of the statements above
    # raises, so the script does not leave it running.
    dm.shutdown()
Пример #2
0
from __future__ import unicode_literals
from ddf import DDFManager, DDF_HOME, ml


# Demo script: rebuild the `mtcars` table through the Flink engine, wrap it in
# a DDF and print a handful of descriptive statistics.
dm = DDFManager("flink")

create_mtcars = (
    "CREATE TABLE mtcars (mpg double, cyl int, disp double, hp int, drat double, wt double, "
    "qesc double, vs int, am int, gear int, carb string)"
)
load_mtcars = "LOAD {}/resources/test/mtcars delimited by ' ' INTO mtcars".format(DDF_HOME)

# Run the setup statements strictly in this order; the result of the final
# count query is not used.
for statement in (
        'DROP TABLE IF EXISTS mtcars',
        create_mtcars,
        load_mtcars,
        "select count(*) from mtcars"):
    dm.sql(statement, False)

ddf = dm.sql2ddf("select * from mtcars", False)

column_list = ', '.join(ddf.colnames)
print('Columns: ' + column_list)

print('Number of columns: {}'.format(ddf.cols))
print('Number of rows: {}'.format(ddf.rows))

print(ddf.summary())

# First two rows.
print(ddf.head(2))

# sum(mpg) and min(hp), grouped by the `vs` and `am` columns.
aggregations = ['sum(mpg)', 'min(hp)']
group_by = ['vs', 'am']
print(ddf.aggregate(aggregations, group_by))

print(ddf.five_nums())

print(ddf.sample(3))

# Kmeans
Пример #3
0
from __future__ import unicode_literals
from ddf import DDFManager, DDF_HOME

# Demo script: rebuild the `mtcars` table through the Spark engine, print a
# handful of descriptive statistics, then shut the manager down.
dm = DDFManager('spark')

warehouse_setting = 'set hive.metastore.warehouse.dir=/tmp/hive/warehouse'
dm.sql(warehouse_setting, False)

# Drop any previous copy of the table before recreating it.
dm.sql('drop table if exists mtcars', False)

create_mtcars = (
    "CREATE TABLE mtcars (mpg double, cyl int, disp double, hp int, drat double, wt double,"
    " qesc double, vs int, am int, gear int, carb string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '"
)
dm.sql(create_mtcars, False)

load_mtcars = "LOAD DATA LOCAL INPATH '" + DDF_HOME + "/resources/test/mtcars' INTO TABLE mtcars"
dm.sql(load_mtcars, False)

ddf = dm.sql2ddf('select * from mtcars', False)

column_list = ', '.join(ddf.colnames)
print('Columns: ' + column_list)

column_count = ddf.cols
print('Number of columns: {}'.format(column_count))
row_count = ddf.rows
print('Number of rows: {}'.format(row_count))

summary = ddf.summary()
print(summary)

first_rows = ddf.head(2)
print(first_rows)

# sum(mpg) and min(hp), grouped by the `vs` and `am` columns.
grouped = ddf.aggregate(['sum(mpg)', 'min(hp)'], ['vs', 'am'])
print(grouped)

five_number_summary = ddf.five_nums()
print(five_number_summary)

sampled = ddf.sample(3)
print(sampled)

dm.shutdown()

Пример #4
0
 def setUp(self):
     """Create a Spark-backed DDFManager and store it on ``self.dm``."""
     manager = DDFManager('spark')
     self.dm = manager
Пример #5
0
class TestDDFManager(unittest.TestCase):
    """Checks that SQL can be run through a Spark-backed DDFManager."""

    def setUp(self):
        """Create a new manager before each test."""
        self.dm = DDFManager('spark')

    def tearDown(self):
        """Shut the manager down after each test."""
        self.dm.shutdown()

    def testSql(self):
        """Load the airline sample into a table and verify the DDF's shape."""
        manager = self.dm

        # The literal is kept exactly as-is, whitespace included.
        create_airline = """create table airline_na (Year int,Month int,DayofMonth int,
             DayOfWeek int,DepTime int,CRSDepTime int,ArrTime int,
             CRSArrTime int,UniqueCarrier string, FlightNum int,
             TailNum string, ActualElapsedTime int, CRSElapsedTime int,
             AirTime int, ArrDelay int, DepDelay int, Origin string,
             Dest string, Distance int, TaxiIn int, TaxiOut int, Cancelled int,
             CancellationCode string, Diverted string, CarrierDelay int,
             WeatherDelay int, NASDelay int, SecurityDelay int, LateAircraftDelay int )
             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','        
        """
        load_airline = (
            "load data local inpath '{}/resources/test/airlineWithNA.csv' "
            "into table airline_na"
        ).format(DDF_HOME)

        manager.sql('set hive.metastore.warehouse.dir=/tmp')
        manager.sql('drop table if exists airline_na')
        manager.sql(create_airline)
        manager.sql(load_airline)

        airline = manager.sql2ddf('select * from airline_na')
        # Expect 31 rows and 29 columns; the table above declares 29 columns.
        self.assertEqual(airline.rows, 31)
        self.assertEqual(airline.cols, 29)
Пример #6
0
class TestDDFManager(unittest.TestCase):
    """Exercises DDFManager.sql and DDFManager.sql2ddf on the Spark engine."""

    def setUp(self):
        """Give every test its own manager instance."""
        self.dm = DDFManager('spark')

    def tearDown(self):
        """Shut down the manager created in setUp."""
        self.dm.shutdown()

    def _run_statements(self, statements):
        """Execute each SQL statement on ``self.dm`` in the given order."""
        for statement in statements:
            self.dm.sql(statement)

    def testSql(self):
        """Create and fill ``airline_na``, then check its row/column counts."""
        # The literal is kept exactly as-is, whitespace included.
        create_airline = """create table airline_na (Year int,Month int,DayofMonth int,
             DayOfWeek int,DepTime int,CRSDepTime int,ArrTime int,
             CRSArrTime int,UniqueCarrier string, FlightNum int,
             TailNum string, ActualElapsedTime int, CRSElapsedTime int,
             AirTime int, ArrDelay int, DepDelay int, Origin string,
             Dest string, Distance int, TaxiIn int, TaxiOut int, Cancelled int,
             CancellationCode string, Diverted string, CarrierDelay int,
             WeatherDelay int, NASDelay int, SecurityDelay int, LateAircraftDelay int )
             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','        
        """
        load_airline = ("load data local inpath '{}/resources/test/airlineWithNA.csv' "
                        "into table airline_na").format(DDF_HOME)

        self._run_statements([
            'set hive.metastore.warehouse.dir=/tmp',
            'drop table if exists airline_na',
            create_airline,
            load_airline,
        ])

        ddf = self.dm.sql2ddf('select * from airline_na')
        expected_rows, expected_cols = 31, 29
        self.assertEqual(ddf.rows, expected_rows)
        self.assertEqual(ddf.cols, expected_cols)
Пример #7
0
 def setUp(self):
     """Attach a DDFManager for the 'spark' engine to the test as ``self.dm``."""
     engine = 'spark'
     self.dm = DDFManager(engine)
Пример #8
0
 def setUpClass(cls):
     """Build the class-level fixtures.

     Creates one Spark-backed DDFManager (``cls.dm_spark``) and stores the
     results of ``cls.loadAirlines`` and ``cls.loadMtCars``, each called with
     that manager, as ``cls.airlines`` and ``cls.mtcars``.
     """
     spark_manager = DDFManager('spark')
     cls.dm_spark = spark_manager
     # Load order is kept: airlines first, then mtcars.
     cls.airlines = cls.loadAirlines(spark_manager)
     cls.mtcars = cls.loadMtCars(spark_manager)