def Base_Spark(name=None, config=None, context=False):
    """
    Get a Spark object.

    :param name: string. The name of the Spark task.
    :param config: dict. The config of the Spark task.
    :param context: bool. Pass True to get the sqlContext instead of the session.
    :return: Spark object.
    """
    conf = SparkConf()
    if config:
        for k, v in config.items():
            conf.set(k, v)
    else:
        config = {}
    sc = SparkContext(conf=conf, appName=name if name else None)
    sc.setLogLevel("WARN")
    sqlContext = HiveContext(sc)
    if config:
        for k, v in config.items():
            if 'hive.' in k:
                sqlContext.setConf(k, v)
    if context:
        return sqlContext
    else:
        spark = sqlContext.sparkSession
        return spark
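# Usage sketch for Base_Spark (a hypothetical driver; assumes this module imports
# SparkConf, SparkContext and HiveContext from pyspark and that a Hive metastore is
# reachable; the app name and config keys below are illustrative only):
if __name__ == "__main__":
    spark = Base_Spark(name="base-spark-demo",
                       config={"spark.sql.shuffle.partitions": "8",
                               "hive.exec.dynamic.partition": "true"})
    spark.sql("show databases").show()
    # Pass context=True instead to get the HiveContext rather than the SparkSession.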
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def write_to_hive(time, rdd):
    def process_row(x):
        row_dict = dict()
        row_dict["timestamp"] = 0 if "timestamp" not in x else x["timestamp"]
        row_dict["source_type"] = "" if "source.type" not in x else x["source.type"]
        row_dict["user_name"] = "" if "src_user_name" not in x else x["src_user_name"]
        row_dict["entity_name"] = "" if "ip_src_addr" not in x else x["ip_src_addr"]
        row_dict["guid"] = "" if "guid" not in x else x["guid"]
        row_dict["alert_score"] = 0.0 if "alert_score" not in x else x["alert_score"]
        row_dict["alerts"] = "" if "alerts" not in x else x["alerts"]
        row_dict["y"] = 0 if "y" not in x else x["y"]
        row_dict["m"] = None if "m" not in x else x["m"]
        row_dict["d"] = None if "d" not in x else x["d"]
        for numerical_colname in EVENT_MODEL_NUMERICAL_COLUMNS:
            row_dict[numerical_colname] = 0.0 if numerical_colname not in x else float(x[numerical_colname])
        for categorical_colname in EVENT_MODEL_CATEGORICAL_COLUMNS:
            row_dict[categorical_colname] = "" if categorical_colname not in x else str(x[categorical_colname])
        row = Row(**row_dict)
        return row

    try:
        spark = SparkSession \
            .builder \
            .appName("event-anomaly-online-score") \
            .enableHiveSupport() \
            .getOrCreate()
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        row_rdd = rdd.map(process_row)
        sdf = hive_context.createDataFrame(row_rdd)
        sdf = sdf.drop_duplicates(subset=["guid"])
        sdf.cache()
        source_type_list = [TENANT_NAME + "_" + data_source for data_source in DATA_SOURCE_LIST]
        model_dict = dict()
        for data_source in DATA_SOURCE_LIST:
            model_dict[data_source] = load_event_anomaly_model(spark=spark, data_source=data_source)
        for source_type in source_type_list:
            sdf_source = sdf.filter(sdf.source_type == source_type)
            if not sdf_source.rdd.isEmpty():
                sdf_source.cache()
                database = source_type.split("_")[0]
                data_source = source_type.split("_")[1]
                table = data_source + "_event_alert_score"
                sdf_source.show(3)
                eas_sdf = get_event_anomaly_score(data_source=data_source,
                                                  model_dict=model_dict,
                                                  input_df=sdf_source)
                result_sdf = sdf_source.join(eas_sdf.select(["guid", "EAS"]), on="guid", how="left")
                result_sdf = result_sdf.na.fill(0.0, subset=["EAS"])
                result_sdf.show(3)
                result_sdf.select("guid", "timestamp", "user_name", "entity_name", "source_type",
                                  "alerts", "alert_score", "EAS", "y", "m", "d") \
                    .write.insertInto(database + "." + table)
    except Exception as e:
        pass
def create_context():
    # Creates the Spark context
    sc = SparkContext(appName="hdfs2hive-test")

    # Creates the Hive context
    hiveContext = HiveContext(sc)
    hiveContext.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')
    return sc, hiveContext
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        sqlContext = HiveContext(sc)
        # FIX: memory error Spark 2.0 bug ( < 2.0 )
        sqlContext.setConf("spark.sql.tungsten.enabled", "false")
        # v2.01 (dead code, kept commented out so the function parses):
        # spark = SparkSession.builder \
        #     .master("local") \
        #     .appName("Word Count") \
        #     .config("spark.some.config.option", "some-value") \
        #     .getOrCreate()
        # Get the singleton instance of SparkSession
        # nzs v1.0
        spark = getSparkSessionInstance(rdd.context.getConf())
        if rdd.count() < 1:
            return
        # Convert RDD[String] to RDD[Row] to DataFrame
        sqlRdd = rdd.map(lambda x: json.loads(x)).map(
            lambda r: Row(metrics=r["metrics"], name=r["name"], value=r["value"]))
        wordsDataFrame = sqlContext.createDataFrame(sqlRdd)
        wordsDataFrame.show()
        # Create a temporary view using the DataFrame.
        wordsDataFrame.registerTempTable("starwarstemp")
        # Run a query and get the alarm dataset using the temp table
        wordCountsDataFrame = sqlContext.sql("select * from starwarstemp")
        wordCountsDataFrame.printSchema()
        with open(SparkFiles.get('webinar_streaming.sql')) as test_file:
            alertsql = test_file.read()
        # logging.info(alertsql)
        alertDataFrame = sqlContext.sql(alertsql)
        alertDataFrame.show()
        alertDataFrame.printSchema()
        # Save all values to HBase
        # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \
        # create HBASE mapper
        rowRdd = rdd.map(lambda x: json.loads(x))\
            .map(lambda r: (str(r["metrics"]),
                            [str(r["name"]) + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S"),
                             "action" if str(r["metrics"]) == "action-credit-limit" else "healt",
                             str(r["metrics"]),
                             str(r["value"])]))
        table = 'starwarsinbox'
        host = 'node-master2-KcVkz'
        keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
        valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
        conf = {"hbase.zookeeper.quorum": host,
                "hbase.mapred.outputtable": table,
                "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
                "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
        rowRdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)
    except Exception as merror:
        print(merror)
        raise
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)  # use the script name, not the literal string "__file__"
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
def write_spark_df_to_hdfs(spark, output_schema, database, table_name, sdf, timestamp):
    hive_context = HiveContext(spark.sparkContext)
    hive_context.setConf("hive.exec.dynamic.partition", "true")
    hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    sdf = sdf.withColumn("execution_timestamp", lit(timestamp))
    sdf = sdf.na.fill(0.0, subset=["pas_kmeans", "pas_isolation", "pas_svm", "pas"])
    sdf.select(output_schema).write.insertInto(database + "." + table_name)
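# Hypothetical usage sketch for write_spark_df_to_hdfs (assumes a Hive table
# analytics.profile_score exists whose columns match output_schema below, and that this
# module imports HiveContext and pyspark.sql.functions.lit; every name here is
# illustrative, not part of the original code):
import time
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("write-demo").enableHiveSupport().getOrCreate()
scores_sdf = spark.createDataFrame(
    [("alice", 0.1, 0.2, 0.3, 0.2)],
    ["name", "pas_kmeans", "pas_isolation", "pas_svm", "pas"])
output_schema = ["name", "pas_kmeans", "pas_isolation", "pas_svm", "pas", "execution_timestamp"]
write_spark_df_to_hdfs(spark, output_schema, "analytics", "profile_score",
                       scores_sdf, int(time.time() * 1000))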
def init_session(config, app=None, return_context=False, overrides=None, use_session=False):
    import os
    from pyhocon import ConfigFactory, ConfigParser

    if isinstance(config, str):
        if os.path.exists(config):
            base_conf = ConfigFactory.parse_file(config, resolve=False)
        else:
            base_conf = ConfigFactory.parse_string(config, resolve=False)
    elif isinstance(config, dict):
        base_conf = ConfigFactory.from_dict(config)
    else:
        base_conf = config

    if overrides is not None:
        over_conf = ConfigFactory.parse_string(overrides)
        conf = over_conf.with_fallback(base_conf)
    else:
        conf = base_conf
    ConfigParser.resolve_substitutions(conf)

    res = init_spark(conf, app, use_session)
    if use_session:
        return res

    mode_yarn = conf['spark-prop.spark.master'].startswith('yarn')
    if mode_yarn:
        from pyspark.sql import HiveContext
        sqc = HiveContext(res)
        if 'hive-prop' in conf:
            for k, v in prop_list(conf['hive-prop']).items():
                sqc.setConf(k, str(v))
    else:
        from pyspark.sql import SQLContext
        sqc = SQLContext(res)

    if return_context:
        return res, sqc
    return sqc
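# Hypothetical usage sketch for init_session (assumes init_spark and prop_list are
# defined elsewhere in this module and that pyhocon is installed; the HOCON string and
# property names are illustrative only):
example_conf = """
spark-prop.spark.master = "yarn"
spark-prop.spark.executor.memory = "4g"
hive-prop."hive.exec.dynamic.partition" = "true"
"""
sqc = init_session(example_conf, app="init-session-demo")
sqc.sql("show databases").show()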
def write_to_hive(spark, sdf, data_source_list):
    try:
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        database = TENANT_NAME
        table = "anomaly_score"
        # sdf.show()
        columns = ["name", "type", "time_window", "timestamp"] + [
            "as_" + data_source for data_source in data_source_list
        ] + ["score", "y", "m", "d"]
        sdf.select(columns).write.insertInto(database + "." + table)
    except Exception as e:
        print(str(e))
        pass
def create_hive_session(app_name):
    # Import SparkConf & SparkContext lib
    from pyspark import SparkConf, SparkContext

    # Import SparkSession lib required to create hive context
    logger.debug("Importing pyspark.sql.HiveContext")
    from pyspark.sql import HiveContext
    logger.debug("pyspark.sql.HiveContext imported")

    # Create a spark context
    logger.debug("Creating hive context....")
    conf = SparkConf().setAppName(app_name)
    sc = SparkContext(conf=conf)

    # Return a hive context to the function caller
    hc = HiveContext(sc)
    hc.setConf("hive.metastore.uris", config["HIVE"]["hive.metastore.uris"])
    return hc
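# Hypothetical usage sketch for create_hive_session (assumes the surrounding module
# defines `logger` and a `config` dict with a ["HIVE"]["hive.metastore.uris"] entry;
# the app name and query are illustrative):
hc = create_hive_session("hive-session-demo")
hc.sql("show databases").show()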
def main(args=None):
    def create():
        database.create_database(hc=hc, json_config=json_config)
        trades.createTableContracts(hc=hc, json_config=json_config)
        products.createTableProducts(hc=hc, json_config=json_config)

    def delete():
        trades.deleteTableContracts(hc=hc, json_config=json_config)
        products.deleteTableProducts(hc=hc, json_config=json_config)
        database.delete_database(hc=hc, json_config=json_config)

    args = args_parser.parse_arguments()
    json_config = util_functions.load_json_config(args.json_config)
    sc = SparkContext.getOrCreate()
    hc = HiveContext(sc)
    hc.setConf("hive.exec.dynamic.partition", "true")
    hc.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    hc.setConf("spark.sql.hive.convertMetastoreOrc", "false")
    if args.action == "create":
        # delete()
        create()
    elif args.action == "delete":
        delete()
def create_dataframe_from_hive(spark_session, dbConnectionParams):
    df = None
    try:
        sc = SparkSession.builder.appName("Testing").config(
            conf=SparkConf()).enableHiveSupport().getOrCreate()
        sqlContext = HiveContext(sc)
        sqlContext.setConf(
            "hive.metastore.uris",
            "thrift://{}:{}".format(dbConnectionParams.get("host"),
                                    dbConnectionParams.get("port")))
        tdf = sqlContext.sql("show databases")
        tdf.show()
        schema = DataLoader.get_db_name(dbConnectionParams)
        table_name = dbConnectionParams.get("tablename")
        df = sqlContext.table(".".join([schema, table_name]))
    except Exception as e:
        print("couldn't connect to hive")
        raise e
    return df
def write_to_hive(spark, rdd, key, time_window, timestamp, data_source):
    def process_line(x):
        row_dict = dict()
        row_dict["name"] = x[key]
        row_dict["type"] = key
        row_dict["time_window"] = time_window
        row_dict["timestamp"] = timestamp
        row_dict["pas_kmeans"] = x["pas_kmeans"]
        row_dict["pas_isolation"] = x["pas_isolation"]
        row_dict["pas_svm"] = x["pas_svm"]
        row_dict["pas"] = x["pas"]
        row_dict["d"] = d
        row_dict["m"] = m
        row_dict["y"] = y
        row = Row(**row_dict)
        return row

    try:
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        database = TENANT_NAME
        table = data_source + "_profile_score"
        date = datetime.datetime.fromtimestamp(timestamp / 1000.0)
        d = date.day
        m = date.month
        y = date.year
        row_rdd = rdd.map(lambda x: process_line(x))
        sdf = spark.createDataFrame(row_rdd)
        sdf = sdf.na.fill(0.0, subset=["pas_kmeans", "pas_isolation", "pas_svm", "pas"])
        sdf.select("name", "type", "time_window", "timestamp",
                   "pas_kmeans", "pas_isolation", "pas_svm", "pas",
                   "y", "m", "d").write.insertInto(database + "." + table)
    except Exception as e:
        print(str(e))
        pass
def main(sc, load_id):
    sqlContext = HiveContext(sc)
    emp_table = sqlContext.sql("select emp_id,emp_name,emp_dept,gender,division from emp_table")
    emp_table.createOrReplaceTempView("emp_df")
    movies_watched = sqlContext.sql("select emp_id as emp_idm,movie_name from movies_table")
    movies_watched.createOrReplaceTempView("movies_df")
    movies_watched.persist(StorageLevel.MEMORY_AND_DISK)  # persist the movies dataframe
    joined_df = emp_table.alias('v1') \
        .join(movies_watched.alias('v2'), col('v1.emp_id') == col('v2.emp_idm'), 'inner') \
        .select(col('v1.emp_id'), col('v1.emp_dept'), col('v2.movie_name'))
    # repartition on the number of distinct departments
    joined_df_final = joined_df.repartition(len(joined_df.select(col('emp_dept')).distinct().collect()))
    joined_df_final.createOrReplaceTempView('temp_table')
    sqlContext.setConf("hive.merge.mapredfiles", "false")
    sqlContext.setConf("hive.merge.smallfiles.avgsize", "16000000")
    sqlContext.setConf("hive.execution.engine", "mr")
    final_sql = 'INSERT OVERWRITE TABLE target_table select * from temp_table'
    sqlContext.sql(final_sql)
import os
import traceback
import sys
import time
import datetime
from datetime import date, timedelta

from pyspark.sql.functions import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName('retail_usecase_processing')
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "10")
sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

DB_NAME = "retail"
# DB_NAME = sys.argv[0]
date_difference_in_days = 1
# date_difference_in_days = sys.argv[1]
PARTITION_DATE = (datetime.datetime.now() - timedelta(days=date_difference_in_days)).strftime('%Y-%m-%d')
# PARTITION_MONTH = (datetime.datetime.now() - timedelta(months=1)).strftime('%Y-%m')
PARTITION_MONTH = (date.today().replace(day=1) - timedelta(days=1)).strftime('%Y-%m')

# Loading categories table
categories = sqlContext.sql("""select * from {0}.categories""".format(DB_NAME))
#!/usr/bin/env python
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
import pandas as pd
import numpy as np

# initialize spark
conf = SparkConf().setMaster('local').setAppName('testApp')
sc = SparkContext(conf=conf)
hiveCtx = HiveContext(sc)
hiveCtx.setConf("hive.exec.orc.split.strategy", "ETL")

orcfile = "hdfs:///user/hive/warehouse/answer/2017-04/answer__dc8f5871_82c0_42e8_ab39_21b57b5a663a"
df = hiveCtx.read.orc(orcfile)
df.show()
def merge_data (sc,table_name): print ("Entered Merged data Function Testing") sqlContext = HiveContext(sc) config = read_config(['/apps/incremental/hdp/environ.properties']) input_schema_name, input_table_name = table_name.split('.') if(config == None): print "Configuration Entry Missing" sys.exit(1) # get the current branch (from local.properties) env = config.get('branch','env') # proceed to point everything at the 'branched' resources dbUrl = config.get(env+'.mysql','dbUrl') dbUser = config.get(env+'.mysql','dbUser') dbPwd = base64.b64decode(config.get(env+'.mysql','dbPwd')) dbMetastore_dbName = config.get(env+'.mysql','dbMetastore_dbName') dbApp_dbName = config.get(env+'.mysql','dbApp_dbName') bucket_name = config.get(env+'.s3','bucket_name') print (dbUrl,",",dbUser,",",dbPwd,",",dbMetastore_dbName,",",dbApp_dbName) # Connection to the Hive Metastore to get column and partition list connection = mysql.connector.connect(user=str(dbUser),password=str(dbPwd),host=str(dbUrl),database=str(dbApp_dbName)) # Get control table access try: cursor = connection.cursor() merge_sql = "SELECT * FROM application.control_table WHERE target_schemaname = '" + input_schema_name + "' and target_tablename = '" + input_table_name + "'" print merge_sql cursor.execute(merge_sql) control = cursor.fetchall() except Exception as e: print 'Issue connectining to metadata database:', e finally: connection.close() control_list = list(chain.from_iterable(control)) if not control_list: print "Control Entry missing in table" sys.exit(1) source_schema = str(control_list[1]) source_tablename = str(control_list[2]) target_schema = str(control_list[3]) target_tablename = str(control_list[4]) partitioned = control_list[5] load_type = str(control_list[6]) s3_backed = control_list[7] first_partitioned_column = str(control_list[8]) second_partitioned_column = str(control_list[9]) partitioned_column_transformation = str(control_list[10]) custom_sql = str(control_list[11]) join_columns = str(control_list[12]) archived_enabled = control_list[13] distribution_columns = str(control_list[18]) dist_col_transformation = str(control_list[19]) print distribution_columns, dist_col_transformation # Connection to the Hive Metastore to get column and partition list connection = mysql.connector.connect(user=dbUser, password=dbPwd,host=dbUrl,database=dbMetastore_dbName) # Establish connection to the hive metastore to get the list of columns try: cursor = connection.cursor() #cursor.execute("""SELECT COLUMN_NAME, TBL_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""") #cursor.execute("""SELECT COLUMN_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""") sql_query = "SELECT \ c.COLUMN_NAME \ FROM \ TBLS t \ JOIN DBS d \ ON t.DB_ID = d.DB_ID \ JOIN SDS s \ ON t.SD_ID = s.SD_ID \ JOIN COLUMNS_V2 c \ ON s.CD_ID = c.CD_ID \ WHERE \ TBL_NAME = " + "'" + target_tablename + "' " + " \ AND d.NAME=" + " '" + target_schema + "' " + " \ ORDER by c.INTEGER_IDX" cursor.execute(sql_query) target_result = cursor.fetchall() sql_query = "SELECT \ c.COLUMN_NAME \ FROM \ TBLS t \ JOIN DBS d \ ON t.DB_ID = d.DB_ID \ JOIN SDS s \ ON t.SD_ID = s.SD_ID \ JOIN COLUMNS_V2 c \ ON s.CD_ID = c.CD_ID \ WHERE \ TBL_NAME = " + "'" + source_tablename + "' " + " \ AND d.NAME=" + " '" + source_schema + "' " + " \ ORDER by c.INTEGER_IDX" cursor.execute(sql_query) source_result = cursor.fetchall() except Exception as e: print 'Issue running SQL in hive metadata database:', e raise finally: connection.close() # Get the column on 
which the table is partitioned source_select_list = ', '.join(map(''.join,source_result)) target_select_list = ', '.join(map(''.join,target_result)) if not source_select_list: print "Hive Table Not Found in metadata database" sys.exit(1) # Create the SELECT query string for fetching data from the external table if len(dist_col_transformation) > 0 : target_select_list = target_select_list source_select_list = source_select_list + ' , ' + dist_col_transformation if (partitioned): incremental_sql_query = 'select ' + source_select_list + ', ' + partitioned_column_transformation + ' from ' + source_schema + '.' + source_tablename if second_partitioned_column <> 'None': target_sql_query = 'select ' + target_select_list + ', ' + first_partitioned_column + ', ' + second_partitioned_column + ' from ' + target_schema + '.' + target_tablename else: target_sql_query = 'select ' + target_select_list + ', ' + first_partitioned_column + ' from ' + target_schema + '.' + target_tablename else: incremental_sql_query = 'select ' + source_select_list + ' from ' + source_schema + '.' + source_tablename target_sql_query = 'select ' + target_select_list + ' from ' + target_schema + '.' + target_tablename connection = mysql.connector.connect(user=dbUser, password=dbPwd,host=dbUrl,database=dbMetastore_dbName) try: cursor = connection.cursor() #cursor.execute("""SELECT COLUMN_NAME, TBL_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""") #cursor.execute("""SELECT COLUMN_NAME FROM COLUMNS_V2 c JOIN TBLS a ON c.CD_ID=a.TBL_ID where a.TBL_ID = 52""") bloom_sql_query = "SELECT e.PARAM_VALUE \ FROM \ TBLS t \ JOIN DBS d \ ON t.DB_ID = d.DB_ID \ LEFT OUTER JOIN TABLE_PARAMS e \ ON t.TBL_ID = e.TBL_ID \ AND PARAM_KEY = 'orc.bloom.filter.columns' \ WHERE \ TBL_NAME = " + "'" + target_tablename + "' " + " \ AND d.NAME=" + " '" + target_schema + "' " \ cursor.execute(bloom_sql_query) bloom_filter = cursor.fetchall() except Exception as e: print 'Issue running SQL in hive metadata database:', e raise finally: connection.close() bloom_filters_columns = '' if len(bloom_filter) > 1: bloom_filters_columns = ','.join(map(''.join,bloom_filter)) bloom_filter_list = bloom_filters_columns.split "," # Execute the query to get the data into Spark Memory # Figure out if it is incremental or full load process # If Full Load then truncate the target table and insert the entire incoming data # If Incremental Load then determine if the table is partitioned as the logic needs to be handled differently for partitioned and non-partitioned tables # For Non Partitioned Tables merge the incoming data with the table date and save it to the database # For Partitioned Tables identify the partitions for which there is incremental data and intelligently merge the data and save it to the database table table_name = target_schema + '.' 
+ target_tablename sqlContext.setConf("hive.exec.dynamic.partition", "true") sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict") sqlContext.setConf("spark.sql.orc.filterPushdown", "true") sqlContext.setConf("mapred.input.dir.recursive", "true") sqlContext.setConf("hive.mapred.supports.subdirectories", "true") sc._jsc.hadoopConfiguration().set('fs.s3a.attempts.maximum','30') if s3_backed: path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename + '/' else: path = '/apps/hive/warehouse/' + target_schema + '.db/' + target_tablename + '/' if second_partitioned_column <> 'None': partitioned_columns = first_partitioned_column + second_partitioned_columns else: partitioned_columns = first_partitioned_column if len (distribution_columns) > 0 : bucket_columns = partitioned_columns + distribution_columns bucket_column_list = bucket_columns.split(",") else: bucket_columns = partitioned_columns bucket_column_list = bucket_columns.split(",") if len (partitioned_columns) > 0: partition_column_list = partitioned_columns.split(",") from pyspark.sql.functions import col try: #################################################################################################################################################################### # Below logic is to sort the data based on the bloom filter columns across multiple tasks. This is the most optimal way for storing data for efficient # # reads but it takes a lot of time to load the data as the data is stored one partition at a time. We can speed up the process by persisting data but then if the # # partitions are not equally sized there are chances of shuffle reads crossing 2GB limit causing MAX_INT error. # # # # Solution for Spark 2.1 : To sort the data by task so that the performance would be better than not sorting but less efficient than the below process # # Solution for Spark 2.2 : Use the sortBy API being introduced # # # # The reason for commenting out the code instead of removing is to prove that the logic can be implemented technically in prior versions of Spark but it is very # # inefficient # #################################################################################################################################################################### # first_partitioned_list = final_df.select(first_partitioned_column) \ # .rdd.flatMap(lambda x: x).distinct().collect() # if second_partitioned_column <> 'None': # second_partitioned_list = final_df.select(second_partitioned_column)\ # .rdd.flatMap(lambda x: x).distinct().collect() # #final_df.persist() # if second_partitioned_column <> 'None': # for first_partition in first_partitioned_list: # for second_partition in second_partitioned_list: # final_path = path + first_partitioned_column + '=' + format(first_partition) + '/' + \ # second_partitioned_column + '=' + format(second_partition) # write_df = final_df.where(col(first_partitioned_column).isin(format(first_partition)) & # col(second_partitioned_column).isin(format(second_partition))) # save_df = write_df.drop(first_partitioned_column).drop(second_partitioned_column) # save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path) # else: # for first_partition in first_partitioned_list: # final_path = path + first_partitioned_column + '=' + format(first_partition) # print path # write_df = final_df.where(col(first_partitioned_column).isin(format(first_partition))) # save_df = write_df.drop(first_partitioned_column) # 
save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(final_path) if load_type == 'FULL': merge_df = sqlContext.sql(incremental_sql_query) if partitioned: if bloom_filters_columns: final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \ .sortWithinPartitions(bloom_filter_list) else: final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) final_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(path) else: if merge_df.rdd.getNumPartitions() > 300: merge_coalesce_df = merge_df.coalesce(300) else: merge_coalesce_df = merge_df if bloom_filters_columns: save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns) save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path) else: merge_coalesce_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path) # Incremental Logic for Append Only table especially for S3 elif load_type == 'APPEND_ONLY': merge_df = sqlContext.sql(incremental_sql_query) if s3_backed: temp_table = target_tablename + '_tmp' temp_path = '/apps/hive/warehouse/' + target_schema + '.db/' + temp_table + '/' else: temp_path = path print temp_path if partitioned: if bloom_filters_columns: final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \ .sortWithinPartitions(bloom_filter_list) else: final_df = merge_df.repartition(len(merge_df.select(bucket_column_list).distinct().collect()),bucket_column_list) final_df.write.option("compression","zlib").mode("append").format("orc").partitionBy(partition_column_list).save(temp_path) else: if merge_df.rdd.getNumPartitions() > 300: merge_coalesce_df = merge_df.coalesce(300) else: merge_coalesce_df = merge_df if bloom_filters_columns: save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns) save_df.write.option("compression","zlib").mode("append").format("orc").save(temp_path) else: merge_coalesce_df.write.option("compression","zlib").mode("append").format("orc").save(temp_path) if s3_backed: target_path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename source_path = 'hdfs://getnamenode/apps/hive/warehouse/' + target_schema + '.db/' + target_tablename + '_tmp' + '/*' print source_path print target_path (ret, out, err) = run_cmd(['hadoop', 'distcp', source_path, target_path]) (ret, out, err) = run_cmd(['hadoop','fs', '-rm','-r',source_path]) else: if (partitioned): from pyspark.sql.functions import col incremental_df = sqlContext.sql(incremental_sql_query) first_partitioned_list = incremental_df.select(first_partitioned_column) \ .rdd.flatMap(lambda x: x).distinct().collect() if second_partitioned_column <> 'None': second_partitioned_list = incremental_df.select(second_partitioned_column)\ .rdd.flatMap(lambda x: x).distinct().collect() merge_df = sqlContext.sql(target_sql_query)\ .where(col(first_partitioned_column).isin(first_partitioned_list) & \ col(second_partitioned_column).isin(second_partitioned_list)) else: merge_df = sqlContext.sql(target_sql_query) \ .where(col(first_partitioned_column).isin(first_partitioned_list)) join_column_list = join_columns.split(",") output_df = merge_df.join(incremental_df,join_column_list,"leftanti") final_df = output_df.union(incremental_df) if bloom_filters_columns: save_df = 
final_df.repartition(len(final_df.select(bucket_column_list).distinct().collect()),bucket_column_list) \ .sortWithinPartitions(bloom_filter_list) else: save_df = final_df.repartition(len(final_df.select(bucket_column_list).distinct().collect()),bucket_column_list) save_df.persist() save_df.count() #final_df.persist() if second_partitioned_column <> 'None': for first_partition in first_partitioned_list: for second_partition in second_partitioned_list: final_path = path + first_partitioned_column + '=' + format(first_partition) + '/' + \ second_partitioned_column + '=' + format(second_partition) write_df = save_df.where(col(first_partitioned_column).isin(format(first_partition)) & col(second_partitioned_column).isin(format(second_partition))) out_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(path) else: for first_partition in first_partitioned_list: final_path = path + first_partitioned_column + '=' + format(first_partition) print path write_df = save_df.where(col(first_partitioned_column).isin(format(first_partition))) out_df.write.option("compression","zlib").mode("overwrite").format("orc").partitionBy(partition_column_list).save(final_path) # Incremental Update of non-partitioned table else: incremental_df = sqlContext.sql(incremental_sql_query) current_df = sqlContext.sql(target_sql_query) join_column_list = join_columns.split(",") output_df = current_df.join(incremental_df,join_column_list,"leftanti") merge_df = output_df.union(incremental_df) merge_df.persist() merge_df.count() if merge_df.rdd.getNumPartitions() > 300: merge_coalesce_df = merge_df.coalesce(300) else: merge_coalesce_df = merge_df if bloom_filters_columns: save_df = merge_coalesce_df.sortWithinPartitions(bloom_filters_columns) save_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path) else: merge_coalesce_df.write.option("compression","zlib").mode("overwrite").format("orc").save(path) if (partitioned): repair_table_sql = 'MSCK REPAIR TABLE ' + table_name sqlContext.sql(repair_table_sql) refresh_metadata_sql = 'REFRESH TABLE ' + table_name sqlContext.sql(refresh_metadata_sql) sqlContext.sql(refresh_metadata_sql) except Exception as e: print 'Exception while loading data:', e # coding=utf-8 sys.exit(1) if archived_enabled: target_path = 's3a://' + bucket_name + '/' + target_schema + '/' + target_tablename + '_bkp/' if s3_backed: source_path = 's3a://' + bucket_name + '/' + source_schema + '/' + source_tablename + '/' else: source_path = 'hdfs://apps/hive/warehouse/' + source_schema + '.db/' + source_tablename + '/*' print source_path print target_path (ret, out, err) = run_cmd(['hadoop', 'distcp', source_path, target_path]) print "Errors:",err
outlier_mean.astype(str), z_score.astype(str), normal_std.astype(str), outlier_std.astype(str))) res = '\t'.join(['\t'.join(r) for r in res]) abnormal_features.append('\t'.join(row.astype(str).values.tolist()) + '\t' + res) return abnormal_features if __name__ == "__main__": sparkConf = SparkConf() sparkConf.setAppName("dagang abnormal segment") sparkConf.set("spark.kryoserializer.buffer.max", "128") sc = SparkContext(conf=sparkConf) sc.setLogLevel("WARN") sqlCtx = HiveContext(sc) sqlCtx.setConf("spark.sql.parquet.binaryAsString", "true") sqlCtx.setConf("spark.sql.hive.convertMetastoreParquet", "true") sqlCtx.setConf("spark.sql.parquet.int96AsTimestamp", "true") executor_cores = int(sparkConf.get('spark.executor.cores')) num_executors = int(sparkConf.get('spark.executor.instances')) num_partitions = executor_cores * num_executors * 3 features = [ 'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio', 'down_stroke_zaihe', 'down_up_oscillation_ratio', 'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe', 'left_upper_area', 'left_upper_area_ratio', 'max_weiyi', 'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi', 'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke', 'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe'
def run(yarn=None, verbose=None, campaign=None, tier=None): """ Main function to run pyspark job. It requires a schema file, an HDFS directory with data and optional script with mapper/reducer functions. """ # define spark context, it's main object which allow to communicate with spark ctx = spark_context('cms', yarn, verbose) quiet_logs(ctx) sqlContext = HiveContext(ctx) sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true") sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true") df = sqlContext.read.format('com.databricks.spark.csv')\ .options(header='true', treatEmptyValuesAsNulls='true', nullValue='null')\ .load('hdfs:///cms/aggregation/sizes/part-*') if campaign != None and tier != None: campaign_tier_df = df.where(df.campaign == campaign)\ .where(df.tier == tier) campaign_tier = map(lambda row: row.asDict(), campaign_tier_df.collect()) print 'Average size: %s' % bytes_to_readable( float(campaign_tier[0]['size_average'])) print 'Average in period of existence: %s' % bytes_to_readable( float(campaign_tier[0]['average_size_in_period'])) print 'Max size: %s' % bytes_to_readable( float(campaign_tier[0]['size_max'])) print 'T1 size: %s' % bytes_to_readable( float(campaign_tier[0]['t1_size'])) print 'T2 size: %s' % bytes_to_readable( float(campaign_tier[0]['t2_size'])) print 'T3 size: %s' % bytes_to_readable( float(campaign_tier[0]['t3_size'])) date_to_timestamp_udf = udf(lambda date: time.mktime( datetime.datetime.strptime(date, "%Y%m%d").timetuple())) months = [1, 2, 3, 4, 5, 6, 9, 12] for month in months: now = (datetime.datetime.now() - datetime.datetime(1970, 1, 1)).total_seconds() seconds = month * 30 * 24 * 60 * 60 not_accessed_df = df.withColumn( 'date_timestamp', date_to_timestamp_udf(df.last_access_date)) not_accessed_df = not_accessed_df.where( now - not_accessed_df.date_timestamp > seconds) not_accessed_df = not_accessed_df.withColumn( "size_average", not_accessed_df["size_average"].cast(DoubleType())) total_size = not_accessed_df.groupBy().sum('size_average').rdd.map( lambda x: x[0]).collect()[0] or 0 print 'Size of data not accessed for last %d month(s): %s' % ( month, bytes_to_readable(total_size)) ctx.stop()
def process(time, rdd): print("========= %s =========" % str(time)) try: sqlContext = HiveContext(sc) # FIX: memory error Spark 2.0 bug ( < 2.0 ) sqlContext.setConf("spark.sql.tungsten.enabled", "false") if rdd.count() < 1: return sqlRdd = rdd.map(lambda x: json.loads(x)).map( lambda r: Row(messageid=r["messageid"], messagedate=datetime.datetime.strptime( r["messagedate"], '%Y%m%d%H%M%S'), value=r["value"], metrics=r["metrics"], name=r["name"])) speedDataFrame = sqlContext.createDataFrame(sqlRdd) batch_table_name = config.get_lambda_config( "lambda_speedlayer", "speed_batch_table") speedDataFrame.write.mode("append").saveAsTable(batch_table_name) # if S3 vals defined then save also to OBS (s3) s3_full_path = config.get_lambda_config("lambda_speedlayer", "s3_full_path") if s3_full_path and False: speedDataFrame.write.parquet(s3_full_path, mode="append") speedDataFrame.show() # Creates a temporary view using the DataFrame. temp_table_name = config.get_lambda_config("lambda_speedlayer", "speed_temp_table") speedDataFrame.registerTempTable(temp_table_name) if __debug__: speedDataFrame.printSchema() speedDataFrame.head(10) # handling sql alert file alertsqlfile = config.get_lambda_config("lambda_speedlayer", "alert_sql_path") alertsql = load_resource_file(alertsqlfile) # Execute alarm query and get the alam dataset using the temp table alertDataFrame = sqlContext.sql(alertsql) alertDataFrame.show() alertDataFrame.printSchema() # save all values to HBASE # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \ # create HBASE mapper rowRdd = rdd.map( lambda x: json.loads(x))\ .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "driver" if "driver" in str(r["metrics"]) else "car", str(r["metrics"]), str(r["value"]) ] )) table = config.get_lambda_config("lambda_speedlayer", "speed_inbox_table") host = config.get_lambda_config("lambda_speedlayer", "hbase_host") keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" conf = { "hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table, "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable" } rowRdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv) except Exception as streamerror: logging.error("Stream error:", streamerror) print(streamerror) raise
def run(fout, yarn=None, verbose=None): """ Main function to run pyspark job. It requires a schema file, an HDFS directory with data and optional script with mapper/reducer functions. """ # define spark context, it's main object which allow to communicate with spark ctx = spark_context('cms', yarn, verbose) quiet_logs(ctx) sqlContext = HiveContext(ctx) sqlContext.setConf("spark.sql.files.ignoreCorruptFiles","true") sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true") # date, site, dataset, size, replica_date, groupid schema = StructType([ StructField("date", StringType(), True), StructField("site", StringType(), True), StructField("dataset", StringType(), True), StructField("size", DoubleType(), True), StructField("replica_date", StringType(), True), StructField("groupid", StringType(), True) ]) df = sqlContext.read.format('com.databricks.spark.csv')\ .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(PHEDEX_HDFS_URL, schema=schema) # Remove all tape sites is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export') df = df.where(is_tape(df.site) == False) # Remove all non VALID datasets remove_invalid_datasets(df, sqlContext, verbose) # Get accesses data frame accesses_df = get_dataset_access_dates(sqlContext) # extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0]) extract_tier_udf = udf(lambda dataset: dataset.split('/')[3]) days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1) count_udf = udf(lambda list: len(list)) get_t1_size = udf(lambda size, site: size if site.startswith('T1') else 0) get_t2_size = udf(lambda size, site: size if site.startswith('T2') else 0) get_t3_size = udf(lambda size, site: size if site.startswith('T3') else 0) df = df.withColumn('campaign', get_extract_campaign_udf()(df.dataset))\ .withColumn('tier', extract_tier_udf(df.dataset))\ .withColumn('date_min', get_date_to_timestamp_udf()(df.date))\ .withColumn('date_max', get_date_to_timestamp_udf()(df.date))\ .withColumn('size_average', df.size)\ .withColumn('t1_size', get_t1_size(df.size, df.site))\ .withColumn('t2_size', get_t2_size(df.size, df.site))\ .withColumn('t3_size', get_t3_size(df.size, df.site)) df = df.groupBy(['campaign', 'tier'])\ .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg', 'size': 'max', 't1_size': 'avg', 't2_size': 'avg', 't3_size': 'avg'})\ .withColumnRenamed('min(date_min)', 'date_min')\ .withColumnRenamed('max(date_max)', 'date_max')\ .withColumnRenamed('collect_set(date)', 'days_count')\ .withColumnRenamed('avg(size_average)', 'size_average')\ .withColumnRenamed('max(size)', 'size_max')\ .withColumnRenamed('avg(t1_size)', 't1_size')\ .withColumnRenamed('avg(t2_size)', 't2_size')\ .withColumnRenamed('avg(t3_size)', 't3_size')\ df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\ .withColumn('days_count', count_udf(df.days_count))\ .withColumn('date_min', get_timestamp_to_date_udf()(df.date_min))\ .withColumn('date_max', get_timestamp_to_date_udf()(df.date_max)) df = df.withColumn('existence_in_period', df.days_count / df.period_days) df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period) df.show(100, truncate=False) # campaign, tier, date_max, date_min, days_count, size_max, size_average, period_days, existence_in_period, average_size_in_period, t1_size, t2_size, t3_size, last_access_date df = df.join(accesses_df, 'campaign') 
df.show(100, truncate=False) # write out results back to HDFS, the fout parameter defines area on HDFS # it is either absolute path or area under /user/USERNAME if fout: df.write.format("com.databricks.spark.csv")\ .option("header", "true").save(fout) ctx.stop()
adjclose float ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' """) sqlContext.sql(""" use fex """) df = sqlContext.sql(""" SELECT * FROM eod_spx WHERE symbol = "SPX" AND date >= "2010-01-01" AND date <= "2010-06-30" """) sqlContext.sql(""" use fex_test """) df.repartition(1).insertInto("eod_spx", True) if __name__ == "__main__": conf = SparkConf(); conf.set("spark.executor.instances", "4") conf.set("spark.executor.cores", "4") conf.set("spark.executor.memory", "8g") sc = SparkContext(appName=__file__, conf = conf) sqlContext = HiveContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "1") main(sc, sqlContext) sc.stop()
import sys
import string
import datetime
from datetime import date, timedelta

import pytz
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from scp import SCPClient
from subprocess import call

APP_NAME = "con"

sc = SparkContext(appName=APP_NAME)  # let the master come from spark-submit rather than an empty string
sc.setLogLevel("WARN")
sqlContext = HiveContext(sc)
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")

PROD = "12.333.201.21"

## Loop through DATE partitions
yesterday = datetime.datetime.now(pytz.timezone('US/Central')).date() - timedelta(1)
dayte = str(yesterday.strftime("%Y") + '-' + yesterday.strftime("%m") + '-' + yesterday.strftime("%d"))


def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)
from __future__ import print_function
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.readwriter import DataFrameWriter
from pyspark.sql.types import *

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)
    hiveContext.setConf("hive.exec.dynamic.partition", "true")
    hiveContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    hiveContext.setConf("spark.sql.orc.filterPushdown", "true")

    ## Create a DataFrame from the file(s) pointed to by path
    gwcdr = hiveContext.read.json("/user/wgovea/people.json.gz")

    # The inferred schema can be visualized using the printSchema() method.
    gwcdr.printSchema()

    # Register this DataFrame as a table.
    gwcdr.registerTempTable("gwcdr_tmp")

    data = hiveContext.sql(
        "SELECT name,age,country,ts from gwcdr_tmp").write.format(
        "orc").partitionBy("ts").mode("append").insertInto("people")
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import regexp_replace, col, udf, explode
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pyspark.sql.functions as func

from langdetect import detect
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LOOKUP as enlook
from spacy.lang.de import LOOKUP as delook

conf = SparkConf().setAppName('MyFirstStandaloneApp')
sc = SparkContext(conf=conf)
# sqlContext = sql.SQLContext(sc)
hiveContext = HiveContext(sc)
hiveContext.setConf("hive.metastore.uris", "thrift://s12m.westeurope.cloudapp.azure.com:9083")


class WordCount:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def transform(self):
        df2 = self.dataframe.withColumn(
            "_2", regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
        df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))
        language_detect = udf(lambda x: detect(x), returnType=StringType())
        df3 = df.withColumn("lang", language_detect('_2'))
adjclose float ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' """) sqlContext.sql(""" use fex """) df = sqlContext.sql(""" SELECT * FROM eod_spx WHERE symbol = "SPX" AND date >= "2010-01-01" AND date <= "2010-06-30" """) sqlContext.sql(""" use fex_test """) df.repartition(1).insertInto("eod_spx", True) if __name__ == "__main__": conf = SparkConf() conf.set("spark.executor.instances", "4") conf.set("spark.executor.cores", "4") conf.set("spark.executor.memory", "8g") sc = SparkContext(appName=__file__, conf=conf) sqlContext = HiveContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "1") main(sc, sqlContext) sc.stop()
def sql_hive_context_example(spark): # create hive context object. hive_ctx = HiveContext(spark.sparkContext) # createDataFrame l = [('Alice', 18), ('Bob', 20), ('Charley', 22)] df = hive_ctx.createDataFrame(l, ('name', 'age')) print("createDataFrame API finished") # registerDataFrameAsTable hive_ctx.registerDataFrameAsTable(df, "table1") print("registerDataFrameAsTable API finished") # sql tmp_df = hive_ctx.sql("select * from table1") tmp_df.show() print("sql API finished") # table tmp_df = hive_ctx.table("table1") tmp_df.show() print("table API finished") # tableNames table_names = hive_ctx.tableNames() print(table_names) print("tableNames API finished") # tables tables = hive_ctx.tables() print(tables) print("tables API finished") # range tmp_df = hive_ctx.range(1,10,2) tmp_df.show() print("range API finished") # dropTempTable hive_ctx.dropTempTable("table1") table_names = hive_ctx.tableNames() print(table_names) print("dropTempTable API finished") # cacheTable & uncacheTable & clearCache df = hive_ctx.range(1,10,2) hive_ctx.registerDataFrameAsTable(df, "table") hive_ctx.cacheTable("table") hive_ctx.uncacheTable("table") hive_ctx.clearCache() print("cacheTable & uncacheTable & clearCache API finished") # createExternalTable # newSession # registerFunction # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead # registerJavaFunction # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead # setConf & getConf hive_ctx.setConf("key1", "value1") value = hive_ctx.getConf("key1") print(value) print("setConf & getConf API finished") # refreshTable # Exception: An error occurred while calling o26.refreshTable: # Method refreshTable([class java.lang.String]) does not exist print("Finish running HiveContext API")
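# Hypothetical driver for sql_hive_context_example above (assumes pyspark with Hive
# support is available and that HiveContext is imported where the function is defined;
# the app name is illustrative):
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder \
        .appName("hive-context-api-demo") \
        .enableHiveSupport() \
        .getOrCreate()
    sql_hive_context_example(spark)
    spark.stop()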
select * from tmp_table """.format(table_name=table_name, dt=dt, version=version) print(insert_sql) hiveCtx.sql(insert_sql) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-d", "--date", help="work date", default="1") parser.add_argument("-v", "--version", help="version", default="1") args = parser.parse_args() print("%s parameters:%s" % (sys.argv[0], args)) begin_time = time.time() print("%s begin at %s" % (sys.argv[0], str(datetime.datetime.now()))) conf = SparkConf() sc = SparkContext(conf=conf, appName="sp_ind") sc.setLogLevel("WARN") hiveCtx = HiveContext(sc) hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true') hiveCtx.setConf('spark.sql.shuffle.partitions', '1000') hiveCtx.sql('use app') dt = '2020-05-25' version = 'query-similar-month' # Create table. create_table() #negative_sampling(dt, version) postive_sampling(dt, version)
ddl_str = '%s %s' % (col_name, col_dtype) projection_str = ' %s ' % (col_name) ddl_list.append(ddl_str) select_list.append(projection_str) ddl_str = """create external table output_table_omtr( %s ) stored as parquet location 's3://move-dataeng-temp-dev/glue-etl/parquet_block_poc/hit_data_pdt_512mb_ctas/' """ % (','.join(ddl_list)) #--- print 'ddl_str = ', ddl_str print 'select_str = ', ','.join(select_list) sqlContext.sql(ddl_str) df.createOrReplaceTempView("hit_data_big") sqlContext.setConf("hive.exec.dynamic.partition", "true") sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict") print df.count() sqlContext.sql("show tables").show() #----- Hive section ---- hadoopConf = {} iterator = sc._jsc.hadoopConfiguration().iterator() while iterator.hasNext(): prop = iterator.next() hadoopConf[prop.getKey()] = prop.getValue() for item in sorted(hadoopConf.items()): print(item) for item in sorted(sc._conf.getAll()):
def create_hive_context(spark_context):
    hive_context = HiveContext(spark_context)
    hive_context.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')
    hive_context.setConf('hive.exec.max.dynamic.partitions', '17520')
    return hive_context
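# Hypothetical usage sketch for create_hive_context (assumes pyspark is installed and
# HiveContext is imported in this module; the app name is illustrative):
from pyspark import SparkContext

sc = SparkContext(appName="hive-context-demo")
hive_ctx = create_hive_context(sc)
hive_ctx.sql("show databases").show()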
"""
Fails after 2+ hours. Problem seems to be "(Too many open files)".
Likely several thousand files are open at one time.
"""
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
sqlContext = HiveContext(sc)

# snappy compression recommended for Arrow
# Interesting: snappy is slightly smaller than gz for the 10 rows.
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

# Testing
# pems = sqlContext.sql("SELECT * FROM pems LIMIT 10")
# This works
# pems = sqlContext.sql("SELECT * FROM pems WHERE station IN (402265, 402264, 402263, 402261, 402260)")

pems = sqlContext.sql("SELECT * FROM pems ORDER BY station")

# Don't see options about file chunk sizes, probably comes from some
# environment variable

# Later versions:
# pems.write.parquet("pems_sorted", compression = "snappy")
# pems.write.parquet("pems_station", partitionBy="station")
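# A possible follow-up to the "(Too many open files)" note above: repartitioning by the
# partition column keeps each task writing to only a few partition directories at once.
# This is only a sketch, not the author's fix, and assumes a Spark 2.x+ DataFrameWriter;
# the column and output path mirror the pems example above.
pems_by_station = pems.repartition("station")
pems_by_station.write.partitionBy("station").parquet("pems_station")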