Example #1
    def insert_table(self):
        sqlcontext = SnappyContext(self.sc)
        # Insert two rows in a single call; the test table is expected to hold five rows beforehand.
        newrow = [1L, 2L, 3L], [2L, 3L, 4L]
        sqlcontext.insert(SnappyContextTests.tablename, newrow)
        self.verify_table_rows(7)
        # Insert a single row.
        newrow = [1L, 2L, 3L]
        sqlcontext.insert(SnappyContextTests.tablename, newrow)
        self.verify_table_rows(8)
Example #2
    def create_table_using_datasource(self, provider, schemaddl=False):
        sqlcontext = SnappyContext(self.sc)
        df = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF(["COL1", "COL2", "COL3"])
        if schemaddl is False:
            # Schema passed as a StructType taken from the DataFrame.
            sqlcontext.createTable(SnappyContextTests.tablename, provider, df.schema)
        else:
            # Schema passed as a DDL column-definition string.
            sqlcontext.createTable(SnappyContextTests.tablename, provider, "(COL1 INT , COL2 INT , COL3 INT)")
        df.write.format("row").mode("append").saveAsTable(SnappyContextTests.tablename)
Example #3
    def test_new_session(self):
        sqlcontext1 = SnappyContext(self.sc)
        sqlcontext1.setConf("test_key", "a")

        sqlcontext2 = sqlcontext1.newSession()
        sqlcontext2.setConf("test_key", "b")

        self.assertEqual(sqlcontext1.getConf("test_key", ""), "a")
        self.assertEqual(sqlcontext2.getConf("test_key", ""), "b")
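
A standalone sketch of the same isolation behaviour, outside the test harness; the SparkConf setup and the app name are assumptions, while newSession(), setConf() and getConf() are exactly the calls exercised above:

from pyspark import SparkConf, SparkContext
from pyspark.sql.snappy import SnappyContext

sc = SparkContext(conf=SparkConf().setAppName("session_isolation_sketch"))
base = SnappyContext(sc)
base.setConf("test_key", "a")

isolated = base.newSession()       # shares the SparkContext but keeps its own SQL conf
isolated.setConf("test_key", "b")

print(base.getConf("test_key", ""))       # "a"
print(isolated.getConf("test_key", ""))   # "b"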
Example #4
    def __init__(self, sparkContext, jsparkSession=None):
        """Creates a new SnappySession.
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        SparkSession.__init__(self, sparkContext)
        if jsparkSession is None:
            jsparkSession = self._jvm.SnappySession(self._jsc.sc())

        from pyspark.sql.snappy import SnappyContext
        self._wrapped = SnappyContext(self._sc, jsparkSession)
        self._jsparkSession = jsparkSession
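
For orientation, a minimal construction sketch for this class; it assumes SnappySession is importable from pyspark.sql.snappy (the module the constructor above imports SnappyContext from) and that a SnappyData-enabled Spark runtime is available:

from pyspark import SparkConf, SparkContext
from pyspark.sql.snappy import SnappySession  # assumed export, alongside SnappyContext

sc = SparkContext(conf=SparkConf().setAppName("snappy_session_sketch"))
session = SnappySession(sc)     # builds the JVM SnappySession shown in __init__ above
session.sql("SELECT 1").show()  # sql() is inherited from SparkSession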
Example #5
    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new StreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        """

        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappycontext = SnappyContext(sparkContext)
Example #6
    def test_schema_dstream(self):
        rdd = [
            self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                  date(2010, 1,
                                       1), datetime(2010, 1, 1, 1, 1, 1), {
                                           "a": 1
                                       }, (2, ), [1, 2, 3], None)])
        ]
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False),
                        False),
            StructField("struct1",
                        StructType([StructField("b", ShortType(), False)]),
                        False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)
        ])

        dstream = self.ssc.queueStream(rdd)
        self.ssc.sql("drop  table if exists testTable")

        self.ssc._snappycontext.createTable("testTable", "column", schema)

        schemdstream = self.ssc.createSchemaDStream(dstream, schema)

        def testFunction(df):
            df.write.format("column").mode("append").saveAsTable("testTable")

        schemdstream.foreachDataFrame(lambda df: testFunction(df))

        self.ssc.sql("select count (*)  from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        result = SnappyContext(
            self.sc).sql("select count(*) from testTable").collect()
        self.assertEqual(result[0][0], 1)
Example #7
    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        self._schema = schema
        self._sqlcontext = SnappyContext(self._sc)
Example #8
    def create_table_using_sql(self, ddl, provider):
        sqlcontext = SnappyContext(self.sc)
        dataDF = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF()
        sqlcontext.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
        sqlcontext.sql(ddl)
        dataDF.write.format(provider).mode("append").saveAsTable(SnappyContextTests.tablename)
Example #9
    def drop_table(self, ifexists=False):
        sqlcontext = SnappyContext(self.sc)
        sqlcontext.dropTable(SnappyContextTests.tablename, ifexists)
Example #10
    def verify_table_rows(self, rowcount):
        sqlcontext = SnappyContext(self.sc)
        result = sqlcontext.sql("SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
        self.assertTrue(result[0][0] == rowcount)
Example #11
    def truncate_table(self):
        sqlcontext = SnappyContext(self.sc)
        sqlcontext.truncateTable(SnappyContextTests.tablename)
Example #12
    def update_table(self):
        sqlcontext = SnappyContext(self.sc)
        # Set COL1 = 7 for every row where COL2 = 2; update() returns the number of rows modified.
        modifiedrows = sqlcontext.update(SnappyContextTests.tablename, "COL2 =2", [7L], ["COL1"])
        self.assertTrue(modifiedrows == 3)
Example #13
    def test_delete(self):
        self.drop_table(True)
        self.create_table_using_datasource("row")
        sqlcontext = SnappyContext(self.sc)
        # delete() returns the number of rows removed; two rows are expected to match col1 = 1.
        self.assertEqual(sqlcontext.delete(SnappyContextTests.tablename, "col1=1"), 2)
        self.drop_table()
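
Taken together, the table helpers above (create, insert, update, delete, truncate, drop) amount to a small CRUD round trip. A consolidating sketch, assuming a running SnappyData cluster and that "example_table" is free to use; every API call below appears in one of the examples above:

from pyspark import SparkConf, SparkContext
from pyspark.sql.snappy import SnappyContext

sc = SparkContext(conf=SparkConf().setAppName("snappy_crud_sketch"))
snc = SnappyContext(sc)

snc.dropTable("example_table", True)                        # ifExists=True
snc.createTable("example_table", "row", "(COL1 INT, COL2 INT, COL3 INT)")
snc.insert("example_table", ([1, 2, 3], [2, 3, 4]))         # bulk insert of two rows
snc.update("example_table", "COL2 = 3", [9], ["COL1"])      # set COL1 = 9 where COL2 = 3
snc.delete("example_table", "COL1 = 9")                     # remove the updated row
print(snc.sql("SELECT COUNT(*) FROM example_table").collect()[0][0])  # expect 1
snc.truncateTable("example_table")
snc.dropTable("example_table")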
Example #14
    totalTimeCol = int(time.time() * 1000) - start
    print("Query time: %dms" % totalTimeCol)

    # Suppose a particular airline company, say 'Delta Air Lines Inc.',
    # re-brands itself as 'Delta America'. Update the row table accordingly.
    query = " CODE ='DL'"
    newColumnValues = ["Delta America Renewed"]
    sqlContext.update(ROW_TABLE_NAME, query, newColumnValues, ["DESCRIPTION"])

    # DataFrame query: which airlines arrive on schedule? JOIN with the reference table.
    colResultAftUpd = airlineDF.alias('airlineDF') \
        .join(airlineCodeDF.alias('airlineCodeDF'), col('airlineDF.UniqueCarrier') == col('airlineCodeDF.CODE')) \
        .groupBy(col('airlineDF.UniqueCarrier'), col('airlineCodeDF.Description')) \
        .agg({"ArrDelay": "avg"}) \
        .orderBy("avg(ArrDelay)")

    print("Airline arrival schedule after Updated values:")

    startColUpd = int(time.time() * 1000)
    colResultAftUpd.show()
    totalTimeColUpd = int(time.time() * 1000) - startColUpd
    print("Query time:%dms" % totalTimeColUpd)


if __name__ == "__main__":
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    sc = SparkContext(conf=conf)
    snc = SnappyContext(sc)
    main(snc)