def insert_table(self):
    sqlcontext = SnappyContext(self.sc)
    # Insert two rows at once (a tuple of row lists), then a single row.
    newrow = [1L, 2L, 3L], [2L, 3L, 4L]
    sqlcontext.insert(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(7)
    newrow = [1L, 2L, 3L]
    sqlcontext.insert(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(8)
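# A minimal usage sketch (not from the test suite) of the insert API exercised
# above: SnappyContext.insert accepts a single row as a list of column values,
# or several rows as a tuple of such lists. The table name "EXAMPLE_TABLE" and
# the helper function are assumptions for illustration.
from pyspark.sql.snappy import SnappyContext

def insert_rows_example(sc):
    snc = SnappyContext(sc)
    snc.insert("EXAMPLE_TABLE", [1L, 2L, 3L])                  # one row
    snc.insert("EXAMPLE_TABLE", ([4L, 5L, 6L], [7L, 8L, 9L]))  # two rows at once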
def create_table_using_datasource(self, provider, schemaddl=False):
    sqlcontext = SnappyContext(self.sc)
    df = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF(
        ["COL1", "COL2", "COL3"])
    if schemaddl is False:
        sqlcontext.createTable(SnappyContextTests.tablename, provider, df.schema)
    else:
        sqlcontext.createTable(SnappyContextTests.tablename, provider,
                               "(COL1 INT, COL2 INT, COL3 INT)")
    # Write with the same provider the table was created with.
    df.write.format(provider).mode("append").saveAsTable(SnappyContextTests.tablename)
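# Sketch of the two createTable forms used above (hypothetical names): the
# schema can be given either as a StructType or as a SQL column-definition
# string; the provider ("row" or "column") selects the table type.
from pyspark.sql.snappy import SnappyContext
from pyspark.sql.types import StructType, StructField, IntegerType

def create_table_example(sc):
    snc = SnappyContext(sc)
    schema = StructType([StructField("COL1", IntegerType(), False)])
    snc.createTable("EXAMPLE_TABLE", "row", schema)
    # Equivalent DDL-string form:
    # snc.createTable("EXAMPLE_TABLE", "row", "(COL1 INT)")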
def test_new_session(self):
    sqlcontext1 = SnappyContext(self.sc)
    sqlcontext1.setConf("test_key", "a")
    sqlcontext2 = sqlcontext1.newSession()
    sqlcontext2.setConf("test_key", "b")
    self.assertEqual(sqlcontext1.getConf("test_key", ""), "a")
    self.assertEqual(sqlcontext2.getConf("test_key", ""), "b")
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SnappySession."""
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    SparkSession.__init__(self, sparkContext)
    if jsparkSession is None:
        jsparkSession = self._jvm.SnappySession(self._jsc.sc())
    from pyspark.sql.snappy import SnappyContext
    self._wrapped = SnappyContext(self._sc, jsparkSession)
    self._jsparkSession = jsparkSession
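# Construction sketch for the session defined above, assuming a live
# SparkContext `sc`; sql() is inherited from the SparkSession API. The
# function name is illustrative, not taken from the tests.
def session_example(sc):
    session = SnappySession(sc)
    return session.sql("SELECT 1").collect()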
def __init__(self, sparkContext, batchDuration=None, jssc=None):
    """
    Create a new StreamingContext.

    @param sparkContext: L{SparkContext} object.
    @param batchDuration: the time interval (in seconds) at which streaming
        data will be divided into batches
    @param jssc: existing JavaStreamingContext to wrap; one is created from
        sparkContext and batchDuration when not supplied
    """
    self._sc = sparkContext
    self._jvm = self._sc._jvm
    self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
    self._snappycontext = SnappyContext(sparkContext)
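# Construction sketch, assuming the enclosing class is the Snappy variant of
# pyspark's StreamingContext (called SnappyStreamingContext here as an
# assumption): pass a SparkContext and a batch interval in seconds; the
# wrapped SnappyContext then serves SQL calls.
def streaming_context_example(sc):
    ssc = SnappyStreamingContext(sc, batchDuration=1)
    return ssc._snappycontext.sql("SELECT 1").collect()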
def test_schema_dstream(self):
    rdd = [self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                 date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                 {"a": 1}, (2,), [1, 2, 3], None)])]
    schema = StructType([
        StructField("byte1", ByteType(), False),
        StructField("byte2", ByteType(), False),
        StructField("short1", ShortType(), False),
        StructField("short2", ShortType(), False),
        StructField("int1", IntegerType(), False),
        StructField("float1", FloatType(), False),
        StructField("date1", DateType(), False),
        StructField("time1", TimestampType(), False),
        StructField("map1", MapType(StringType(), IntegerType(), False), False),
        StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
        StructField("list1", ArrayType(ByteType(), False), False),
        StructField("null1", DoubleType(), True)])
    dstream = self.ssc.queueStream(rdd)
    self.ssc.sql("drop table if exists testTable")
    self.ssc._snappycontext.createTable("testTable", "column", schema)
    schemadstream = self.ssc.createSchemaDStream(dstream, schema)

    def testFunction(df):
        df.write.format("column").mode("append").saveAsTable("testTable")

    schemadstream.foreachDataFrame(lambda df: testFunction(df))
    self.ssc.sql("select count(*) from testTable").collect()
    self.ssc.start()
    self.ssc.awaitTermination(2)
    result = SnappyContext(self.sc).sql(
        "select count(*) from testTable").collect()
    self.assertEqual(result[0][0], 1)
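# Distilled form of the pattern tested above: wrap a DStream with a schema and
# persist every micro-batch DataFrame into a SnappyData column table. All
# argument names are placeholders.
def save_stream_to_table(ssc, dstream, schema, table):
    schema_dstream = ssc.createSchemaDStream(dstream, schema)
    schema_dstream.foreachDataFrame(
        lambda df: df.write.format("column").mode("append").saveAsTable(table))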
def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
    DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
    self._schema = schema
    self._sqlcontext = SnappyContext(self._sc)
def create_table_using_sql(self, ddl, provider):
    sqlcontext = SnappyContext(self.sc)
    dataDF = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF()
    sqlcontext.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
    sqlcontext.sql(ddl)
    dataDF.write.format(provider).mode("append").saveAsTable(SnappyContextTests.tablename)
def drop_table(self, ifexists=False):
    sqlcontext = SnappyContext(self.sc)
    sqlcontext.dropTable(SnappyContextTests.tablename, ifexists)
def verify_table_rows(self, rowcount):
    sqlcontext = SnappyContext(self.sc)
    result = sqlcontext.sql(
        "SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
    self.assertEqual(result[0][0], rowcount)
def truncate_table(self):
    sqlcontext = SnappyContext(self.sc)
    sqlcontext.truncateTable(SnappyContextTests.tablename)
def update_table(self):
    sqlcontext = SnappyContext(self.sc)
    # Set COL1 to 7 for every row matching the filter COL2 = 2; update
    # returns the number of rows modified.
    modifiedrows = sqlcontext.update(SnappyContextTests.tablename,
                                     "COL2 = 2", [7L], ["COL1"])
    self.assertEqual(modifiedrows, 3)
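# Usage sketch for update (hypothetical table/column names): the second
# argument is a SQL filter expression, the third the new values, and the
# fourth the columns to set; the return value is the number of rows changed,
# which update_table above asserts on.
from pyspark.sql.snappy import SnappyContext

def rename_description_example(sc):
    snc = SnappyContext(sc)
    return snc.update("AIRLINEREF", "CODE = 'DL'",
                      ["Delta America"], ["DESCRIPTION"])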
def test_delete(self):
    self.drop_table(True)
    self.create_table_using_datasource("row")
    sqlcontext = SnappyContext(self.sc)
    # delete returns the number of rows removed.
    self.assertEqual(sqlcontext.delete(SnappyContextTests.tablename, "col1=1"), 2)
    self.drop_table()
totalTimeCol = int(time.time() * 1000) - start
print("Query time: %dms" % totalTimeCol)

# Suppose a particular airline company, say 'Delta Air Lines Inc.',
# re-brands itself as 'Delta America'. Update the row table.
query = "CODE = 'DL'"
newColumnValues = ["Delta America Renewed"]
sqlContext.update(ROW_TABLE_NAME, query, newColumnValues, ["DESCRIPTION"])

# DataFrame query: which airlines arrive on schedule? JOIN with the reference table.
colResultAftUpd = airlineDF.alias('airlineDF') \
    .join(airlineCodeDF.alias('airlineCodeDF'),
          col('airlineDF.UniqueCarrier') == col('airlineCodeDF.CODE')) \
    .groupBy(col('airlineDF.UniqueCarrier'), col('airlineCodeDF.Description')) \
    .agg({"ArrDelay": "avg"}) \
    .orderBy("avg(ArrDelay)")

print("Airline arrival schedule after updated values:")
startColUpd = int(time.time() * 1000)
colResultAftUpd.show()
totalTimeColUpd = int(time.time() * 1000) - startColUpd
print("Query time: %dms" % totalTimeColUpd)

if __name__ == "__main__":
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    sc = SparkContext(conf=conf)
    snc = SnappyContext(sc)
    main(snc)