from pyspark import SparkConf
from pyspark.sql import SparkSession, HiveContext


def create_context():
    """Create a Spark session configured for Hive dynamic-partition writes.

    Returns:
        SparkSession
    """
    conf = SparkConf()
    conf.set('spark.sql.shuffle.partitions', 100)
    conf.set('spark.sql.broadcastTimeout', 1200)
    conf.set('spark.shuffle.service.enabled', 'true')
    conf.set('spark.executor.cores', 2)
    conf.set('spark.executor.instances', 4)
    conf.set('spark.executor.memory', '2G')
    conf.set('spark.driver.cores', 2)
    conf.set('spark.driver.memory', '2G')

    spark = SparkSession.builder \
        .appName('Recon') \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()

    # Allow INSERT OVERWRITE to target partitions computed at runtime
    # instead of requiring a fixed partition spec.
    hivecontext = HiveContext(spark.sparkContext)
    hivecontext.setConf('hive.exec.dynamic.partition', 'true')
    hivecontext.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')
    return spark
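# Minimal usage sketch for create_context(), assuming the imports above are
# in scope; the query is a placeholder, not part of the original job.
spark = create_context()
spark.sql("SHOW DATABASES").show()
spark.stop()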
def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("use fex_test")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("use fex")
    sql_context.setConf("spark.sql.shuffle.partitions", "16")
    return sc, sql_context
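# Hedged usage sketch for get_spark(); the eod table is a placeholder,
# not a table confirmed by the original code.
sc, sql_context = get_spark(num=8, cores=4, mem="16g")
sql_context.sql("SELECT count(*) FROM eod").show()
sc.stop()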
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from post2 import adj, rlt


def run(sc, sql_context, is_hive):
    adj.main(sc, sql_context, is_hive=is_hive)
    rlt.main(sc, sql_context, is_hive=is_hive)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "16")
    conf.set("spark.executor.cores", "16")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.post2.post_run", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
import sys
import datetime

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def write_partition(hiveCtx, sql, table_name, dt_str):
    # Hypothetical wrapper added so this fragment parses; in the original
    # snippet the block sits inside a helper whose definition is elided.
    # Materialize the query, then overwrite the day's partition.
    hiveCtx.sql(sql).registerTempTable("temp_table")
    insert_sql = """
        insert overwrite table {table_name} partition(dt='{dt}')
        select * from temp_table
    """.format(table_name=table_name, dt=dt_str)
    print("insert_sql:\n" + insert_sql)
    hiveCtx.sql(insert_sql)


if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp-tfidf")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')

    # Default to yesterday when no date argument is given.
    if len(sys.argv) == 1:
        dt = datetime.datetime.now() + datetime.timedelta(-1)
    else:
        dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date()
    dt_str = dt.strftime("%Y-%m-%d")
    yest_dt = dt + datetime.timedelta(-30)
    yest_str = yest_dt.strftime("%Y-%m-%d")

    hiveCtx.sql("use app")
    # create_table and getQuery are defined earlier in the original module.
    create_table(hiveCtx)
    getQuery(hiveCtx)
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from ml import diff_feature_reg, diff_train


def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive=is_hive)
    diff_train.main(sc, sql_context, is_hive=is_hive)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
from __future__ import print_function

try:
    import findspark
    findspark.init()

    import pyspark
    sc = pyspark.SparkContext()
    sc.setLogLevel('WARN')
    print("spark context created")

    from pyspark.sql import SQLContext, HiveContext
    sqlc = SQLContext(sc)
    sqlh = HiveContext(sc)
    # Emit gzip-compressed parquet files on write.
    sqlh.setConf("spark.sql.parquet.compression.codec", "gzip")
except Exception:
    # SparkContext() raises if a context already exists in this process.
    print("spark context exists")
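# Hedged sketch of what the codec setting affects: parquet writes through
# this session now come out gzip-compressed. Data and path are placeholders.
df = sqlc.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
df.write.mode("overwrite").parquet("/tmp/demo_gzip_parquet")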
def get_sql_context(sc):
    sqlContext = HiveContext(sc)
    sqlContext.setConf("hive.exec.dynamic.partition", "true")
    sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    return sqlContext
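# Hedged usage sketch: nonstrict dynamic-partition mode lets the partition
# value come from the SELECT itself. The tables below are placeholders.
sqlContext = get_sql_context(sc)
sqlContext.sql("""
    INSERT OVERWRITE TABLE app.events PARTITION (dt)
    SELECT id, payload, dt FROM app.events_raw
""")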
import sys

from pyspark.sql import SparkSession, HiveContext


def prod_src():
    return {
        "psg_train": spark.table("prod_data.psg_train"),
        "psg_test": spark.table("prod_data.psg_test"),
        "psg_dev": spark.table("prod_data.psg_dev"),
    }


def prod_dst():
    return {
        "psg_result": "prod_data.psg_result",
    }


if __name__ == '__main__':
    spark = SparkSession.builder.appName("calc_06_task").enableHiveSupport().getOrCreate()
    # Overwrite only the partitions present in the output, not the whole table.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    hivecontext = HiveContext(spark.sparkContext)
    hivecontext.setConf("hive.exec.dynamic.partition", "true")
    hivecontext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    spark.sparkContext.setCheckpointDir("hdfs:///user/airflow/psg/calc_06_task")

    opts = {
        "from_dt": sys.argv[1],
        "to_dt": "9999-12-31",
    }
    # update_last_partition and calc_06 are defined elsewhere in the
    # original module.
    update_last_partition(prod_dst(), opts["from_dt"], opts["to_dt"])
    calc_06(prod_src(), prod_dst(), opts["from_dt"], opts["to_dt"])
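# Hedged sketch: with partitionOverwriteMode=dynamic, an overwrite insert
# replaces only the partitions present in the incoming frame. The source
# query is a placeholder, and psg_result is assumed to be date-partitioned
# with a matching schema.
result_df = spark.table("prod_data.psg_dev").limit(100)
result_df.write.mode("overwrite").insertInto("prod_data.psg_result")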
def dfToTableWithPar(sql_context, df, tableName, overwrite=False):
    # Signature reconstructed from the call in __main__; the loop that
    # builds the CREATE TABLE column list into sqlstr is truncated here.
    ...
    sqlstr = sqlstr[:len(sqlstr) - 2]   # drop the trailing ",\n"
    sqlstr += "\n) stored as orc"
    print(sqlstr)
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql("use fex_test")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    ldict = [
        {"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
        {"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
    ]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.sql import HiveContext

if len(sys.argv) < 6:
    print('Input Parameter missing', file=sys.stderr)
    sys.exit(-1)

sc = SparkContext(appName='SCD' + sys.argv[3])
sqlContext = HiveContext(sc)

tgt_schema = sys.argv[1]
tgt_tbl_nm = sys.argv[2]
src_schema = sys.argv[1]
src_tbl_nm = sys.argv[3]
load_dt = sys.argv[4]
hist_delta = sys.argv[5]

src_schema_tbl = src_schema + '.' + src_tbl_nm
tgt_schema_tbl = tgt_schema + '.' + tgt_tbl_nm
tgt_schema_stg_tbl = tgt_schema + '.' + tgt_tbl_nm + '_tgt'

sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
sqlContext.setConf("hive.execution.engine", "spark")
sqlContext.setConf("hive.vectorized.execution.enabled", "true")
sqlContext.setConf("hive.vectorized.execution.reduce.enabled", "true")

# Column lists for the incoming delta batch and the existing history rows
# of the slowly changing dimension.
delta_columns = [
    "delta_acct_nbr", "delta_account_sk_id", "delta_zip_code",
    "delta_primary_state", "delta_eff_start_date", "delta_eff_end_date",
    "delta_load_tm", "delta_hash_key", "delta_eff_flag",
]
hist_columns = [
    "acct_nbr", "account_sk_id", "zip_code", "primary_state",
    "eff_start_date", "eff_end_date", "load_tm", "hash_key", "eff_flag",
]
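# Hedged sketch of a typical SCD type-2 merge using the column lists above:
# expire current history rows whose hash differs from the incoming delta,
# then append the delta as the new current rows. hist_df/delta_df and the
# merge logic are illustrative assumptions, not the original job's code.
from pyspark.sql import functions as F

hist_df = sqlContext.table(tgt_schema_tbl)
# Assumes the source table has exactly these nine columns, in this order.
delta_df = sqlContext.table(src_schema_tbl).toDF(*delta_columns)

joined = hist_df.join(
    delta_df, hist_df.acct_nbr == delta_df.delta_acct_nbr, "left_outer")

# Close out current rows whose attributes changed (hash mismatch).
expired = (joined
           .where((F.col("eff_flag") == "Y") &
                  F.col("delta_hash_key").isNotNull() &
                  (F.col("hash_key") != F.col("delta_hash_key")))
           .select(*hist_columns)
           .withColumn("eff_end_date", F.lit(load_dt))
           .withColumn("eff_flag", F.lit("N")))

# Incoming delta rows become the new current versions.
incoming = delta_df.toDF(*hist_columns)

expired.unionAll(incoming).write.mode("overwrite").insertInto(tgt_schema_stg_tbl)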