####   to smooth the two-dimensional histogram of offers. First, a moving average (MA)  ####
####   is calculated for commissions, partitioned by interest values. In the second     ####
####   step, the MA is calculated over the resulting column, partitioned by commission  ####
####   values.                                                                           ####
##########################################################################################
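
# A minimal sketch (not part of the original pipeline) of the two-pass moving
# average described in the banner above, written as a self-contained helper;
# the DataFrame 'offers' and its histogram-count column 'cnt' are illustrative
# assumptions.
from pyspark.sql import Window
import pyspark.sql.functions as psf


def smooth_offer_histogram(offers, bandwidth=50, cnt_col='cnt'):
    """MA over CommissionPct within each Interest value, then MA over Interest
    within each CommissionPct value."""
    w_int = (Window.partitionBy('Interest')
                   .orderBy('CommissionPct')
                   .rowsBetween(-bandwidth, bandwidth))
    w_com = (Window.partitionBy('CommissionPct')
                   .orderBy('Interest')
                   .rowsBetween(-bandwidth, bandwidth))
    return (offers
            .withColumn('ma1', psf.avg(cnt_col).over(w_int))   # first pass
            .withColumn('ma2', psf.avg('ma1').over(w_com)))    # second pass
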

#from time import *
#optStart = clock()

# Initial parameters
abstol = 0.005  # Precision of price optimization
rnd = 3  # Number of significant digits
#bandwidth = 50      # How many rows in forward and backward to take for moving average

# Create a net of all possible interest rates, rounded to the desired abstol;
# the constant 'key0' is a dummy join key used to form the Cartesian product
NIR = hc.range(int(minNIR / abstol), int((maxNIR + abstol) / abstol), 1, 1)
NIR = NIR.withColumn("Interest", psf.round(NIR.id * abstol, rnd)).withColumn(
    "key", psf.lit("key0")).select('key', 'Interest')

# Create a net of all possible commission percentages, rounded to the desired abstol;
# the constant 'key0' is a dummy join key used to form the Cartesian product
Com = hc.range(int(minCom / abstol), int((maxCom + abstol) / abstol), 1, 1)
Com = Com.withColumn("CommissionPct", psf.round(Com.id * abstol, rnd)).withColumn(
    "key", psf.lit("key0")).select('key', 'CommissionPct')
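
# Sketch of how the dummy key is meant to be used (the join itself is not shown
# in this excerpt; 'offerGrid' is an illustrative name): joining the two nets on
# 'key' yields the Cartesian product of all (Interest, CommissionPct) candidates.
offerGrid = NIR.join(Com, 'key').drop('key')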

########################################################################
### New KDE
kdNIR = KernelDensity()  # density estimator for interest rates
kdCom = KernelDensity()  # density estimator for commission percentages
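
# Sketch (an assumption about later use, not shown in this excerpt; it presumes
# these are pyspark.mllib.stat.KernelDensity objects) of how such an estimator
# is typically driven: feed it an RDD of float observations, set a bandwidth,
# then evaluate it on the grid built above.
def estimate_density(kd, sample_rdd, grid_df, col, bandwidth):
    """Evaluate a KernelDensity estimator at every value of grid_df[col]."""
    kd.setSample(sample_rdd)      # RDD of float observations
    kd.setBandwidth(bandwidth)    # Gaussian kernel bandwidth
    points = [row[col] for row in grid_df.select(col).collect()]
    return kd.estimate(points)    # numpy array of density values

# e.g. nirDensity = estimate_density(kdNIR, interestSample, NIR, 'Interest', 3 * abstol)
# where 'interestSample' would be an RDD of observed interest rates (hypothetical name).
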
from pyspark.sql import HiveContext


def sql_hive_context_example(spark):
    
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable

    # newSession

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead
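
    # Sketch of the Python replacement for the deprecated registerFunction above
    # (assumes 'spark' is a SparkSession, matching its use via spark.sparkContext):
    from pyspark.sql.types import LongType
    spark.udf.register("plus_one", lambda x: x + 1, LongType())
    df.selectExpr("plus_one(id) as id_plus_one").show()
    print("spark.udf.register sketch finished")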

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
########################################################################
### Example #3
import time
import datetime
from pyspark.sql.functions import rand

output_path = "s3://emr-rwes-pa-spark-dev-datastore/BI_IPF_2016/02_results/"
start_time = time.time()
st = datetime.datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S')

table_name = "hive_test_" + st
datafactz_table_name = "hive_test_datafactz_" + st

# Read the pos_file CSV (spark-csv reader, header present, schema inferred)
pos = sqlContext.read.load((data_path + pos_file),
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')

# Read the neg_file CSV with the same options
neg = sqlContext.read.load((data_path + neg_file),
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')

dataColumns = pos.columns

# Stack the two inputs with a consistent column order
data = pos.select(dataColumns).unionAll(neg.select(dataColumns))

# For IMS: write the combined table as ORC
data.write.save(path=output_path + table_name, format='orc')

# For datafactz: build a table of numRowsReq rows with three random columns
df = sqlContext.range(0, numRowsReq)
datafactz_df = df.select(rand().alias("Col1"),
                         rand().alias("Col2"),
                         rand().alias("Col3"))
datafactz_df.write.save(path=output_path + datafactz_table_name, format='orc')
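
# Quick read-back check on the ORC output written above (a sketch; this
# verification step is an assumption, not part of the original snippet):
written = sqlContext.read.format('orc').load(output_path + table_name)
print("IMS table rows written:", written.count())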