####  to smooth the two-dimensional histogram of offers. First the MA is calculated  ####
####  for commissions partitioned by interest values. In the second step the MA is   ####
####  calculated over the resulting column partitioned by commission values.         ####
##########################################################################################

# from time import *
# optStart = clock()

# Initial parameters
abstol = 0.005   # Precision of price optimization
rnd = 3          # Number of significant digits
# bandwidth = 50 # How many rows forward and backward to take for the moving average

# Create the net of all possible interest rates, rounded to the desired abstol value.
# 'key0' is a dummy column used to build the cartesian product.
NIR = hc.range(int(minNIR / abstol), int((maxNIR + abstol) / abstol), 1, 1)
NIR = NIR.withColumn("Interest", psf.round(NIR.id * abstol, rnd)) \
         .withColumn("key", psf.lit("key0")) \
         .select('key', 'Interest')

# Create the net of all possible commission percentages, rounded to the desired abstol value.
# 'key0' is a dummy column used to build the cartesian product.
Com = hc.range(int(minCom / abstol), int((maxCom + abstol) / abstol), 1, 1)
Com = Com.withColumn("CommissionPct", psf.round(Com.id * abstol, rnd)) \
         .withColumn("key", psf.lit("key0")) \
         .select('key', 'CommissionPct')

########################################################################
### New KDE
kdNIR = KernelDensity()
kdCom = KernelDensity()
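
# The two-pass moving-average smoothing described in the header above is not shown in this
# snippet, so a minimal sketch follows. It assumes a DataFrame 'offers' holding the 2-D
# histogram with columns 'Interest', 'CommissionPct' and a count column 'cnt', and reuses
# the commented-out 'bandwidth' parameter; 'offers', 'cnt', 'ma1' and 'smoothed' are
# hypothetical names, not part of the original script.
from pyspark.sql import Window
import pyspark.sql.functions as psf

bandwidth = 50  # rows taken before and after the current row in each window

# Step 1: MA of the counts over commissions, partitioned by interest values
w1 = Window.partitionBy('Interest').orderBy('CommissionPct') \
           .rowsBetween(-bandwidth, bandwidth)
offers = offers.withColumn('ma1', psf.avg('cnt').over(w1))

# Step 2: MA over the resulting column, partitioned by commission values
w2 = Window.partitionBy('CommissionPct').orderBy('Interest') \
           .rowsBetween(-bandwidth, bandwidth)
offers = offers.withColumn('smoothed', psf.avg('ma1').over(w2))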
def sql_hive_context_example(spark):
    # Create a HiveContext object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #     Deprecated in 2.3.0. Use :func:`spark.udf.register` instead.
    # registerJavaFunction
    #     Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead.

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    #     Exception: An error occurred while calling o26.refreshTable:
    #     Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
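
# A minimal driver for the example above, assuming PySpark 2.x (where HiveContext is
# still available, though deprecated in favour of SparkSession with Hive support).
# The HiveContext import is the one sql_hive_context_example itself depends on.
from pyspark.sql import SparkSession, HiveContext

if __name__ == "__main__":
    spark = SparkSession.builder \
        .appName("hive_context_example") \
        .enableHiveSupport() \
        .getOrCreate()
    sql_hive_context_example(spark)
    spark.stop()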
output_path = "s3://emr-rwes-pa-spark-dev-datastore/BI_IPF_2016/02_results/"

# Timestamp suffix makes the output table names unique per run
start_time = time.time()
st = datetime.datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S')
table_name = "hive_test_" + st
datafactz_table_name = "hive_test_datafactz_" + st

# Read the positive and negative CSV files and stack them into one DataFrame
pos = sqlContext.read.load(data_path + pos_file,
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')
neg = sqlContext.read.load(data_path + neg_file,
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')
dataColumns = pos.columns
data = pos.select(dataColumns).unionAll(neg.select(dataColumns))

# for IMS
data.write.save(path=output_path + table_name, format='orc')

# for datafactz: a table of numRowsReq rows with three random columns
df = sqlContext.range(0, numRowsReq)
datafactz_df = df.select(rand().alias("Col1"),
                         rand().alias("Col2"),
                         rand().alias("Col3"))
datafactz_df.write.save(path=output_path + datafactz_table_name, format='orc')
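
# The snippet above relies on names defined earlier in the original script; a sketch of
# the assumed preamble follows. The input path, file names and numRowsReq are hypothetical
# placeholders, not values from the source.
import time
import datetime
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand

sc = SparkContext(appName="hive_test")
sqlContext = SQLContext(sc)

data_path = "s3://example-bucket/input/"  # hypothetical input location
pos_file = "positive.csv"                 # hypothetical file name
neg_file = "negative.csv"                 # hypothetical file name
numRowsReq = 1000000                      # hypothetical row count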