import json
import urllib

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    # MONGO_USER, MONGO_PASSWORD, MONGO_HOST, MONGO_PORT, DB_NAME and
    # COLLECTION_NAME are expected to be defined at module level.
    conf = SparkConf().setAppName("transform")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # Build the MongoDB connection URI; the password is URL-encoded so that
    # special characters cannot break the URI.
    conn = "mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}".format(
        mongo_user=MONGO_USER,
        mongo_pass=urllib.quote_plus(MONGO_PASSWORD),
        mongo_host=MONGO_HOST,
        mongo_port=MONGO_PORT,
        mongo_db=DB_NAME,
        mongo_collection=COLLECTION_NAME)
    rdd = sc.mongoRDD(conn)
    # Drop the BSON '_id' field, serialize each document to ASCII-safe JSON,
    # and remove literal '\n' escape sequences so each record is one JSON line.
    new_rdd = rdd.map(lambda x: dict([(i, x[i]) for i in x if i != '_id'])) \
                 .map(lambda x: json.dumps(x, ensure_ascii=False).encode('ascii', 'replace')) \
                 .map(lambda x: "".join(x.split("\\n")))
    df = sqlContext.jsonRDD(new_rdd)
    df.registerTempTable('events_temp')
    # Recreate the Hive tables from scratch on every run.
    sqlContext.sql('DROP TABLE IF EXISTS default.events')
    sqlContext.sql('DROP TABLE IF EXISTS default.clean_table')
    sqlContext.sql('CREATE TABLE events AS SELECT * FROM events_temp')
    sqlContext.sql("CREATE TABLE clean_table AS SELECT "
                   "description AS event_desc, id AS event_id, yes_rsvp_count, "
                   "group.category.name AS cat_name, "
                   "group.category.shortname AS cat_short, "
                   "group.category.id AS cat_id, group.name AS group_name, "
                   "group.topics.name AS topic_name, name AS event_name, "
                   "time AS start_time, utc_offset AS timezone_offset, "
                   "venue.state AS venue_state, venue.city AS venue_city, "
                   "venue.zip AS venue_zip, fee.amount AS fee_amt, "
                   "fee.required AS req_fee FROM events")
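# sc.mongoRDD() is not part of stock PySpark: it is monkey-patched onto
# SparkContext by the pymongo_spark helper that ships with the mongo-hadoop
# connector. A minimal sketch of the setup main() assumes is below; the
# connection constants are placeholders, not values from the original script.
import pymongo_spark
pymongo_spark.activate()  # adds mongoRDD()/saveToMongoDB() to SparkContext/RDD

MONGO_USER = 'user'          # placeholder
MONGO_PASSWORD = 'secret'    # placeholder
MONGO_HOST = 'localhost'     # placeholder
MONGO_PORT = 27017           # placeholder
DB_NAME = 'events_db'        # placeholder
COLLECTION_NAME = 'events'   # placeholder

if __name__ == '__main__':
    main()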
import gc
import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def run(inpath, outpath, mode='append'):
    # check_log, parse_data, rename_column, confirm_row, write_log and now
    # are project helpers defined elsewhere in this module.
    gc.disable()
    print("===== Checking if Log Exists =====")
    check_log(inpath)
    print("===== Pass Log Checking =====")
    # initialize SparkContext
    conf = SparkConf().setAppName("Forgate Log Parser")
    sc = SparkContext(conf=conf)
    sqlCtx = HiveContext(sc)
    start_time = time.time()
    print("===== INPUT FILE PATH: %s =====" % (str(inpath)))
    print("===== OUTPUT FILE PATH: %s =====" % (str(outpath)))
    print("===== %s Reading Data From HDFS" % (now()))
    distFile = sc.textFile(inpath)
    cnt_raw = distFile.count()
    print("===== Count of Input Data: %s =====" % (str(cnt_raw)))
    print("===== %s Parsing Data" % (now()))
    parsedData = parse_data(sc, distFile)
    print("===== Count of Parsed Data: %s =====" % (str(parsedData.count())))
    print("===== %s Saving Data" % (now()))
    jsonData = sqlCtx.jsonRDD(parsedData)
    # Rename 'time' -> 'time_' and 'date' -> 'dt'; 'dt' is then used as the
    # Parquet partition column.
    old_col = ['time', 'date']
    new_col = ['time_', 'dt']
    jsonData = rename_column(jsonData, old_col, new_col)
    jsonData.write.partitionBy('dt').parquet(outpath, mode=mode)
    print("===== %s Checking Data" % (now()))
    confirm_row(sqlCtx, outpath)
    write_log(inpath)
    print("--- Total took %s seconds ---" % (time.time() - start_time))
    sc.stop()
    gc.enable()
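# rename_column() is not shown in this snippet. A minimal hypothetical
# reconstruction, assuming the old/new name lists are aligned pairwise:
def rename_column(df, old_cols, new_cols):
    # Rename each old column to its new name; withColumnRenamed is a
    # no-op when the old column is absent.
    for old, new in zip(old_cols, new_cols):
        df = df.withColumnRenamed(old, new)
    return df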
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_cache")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize(
    ['{"col1": "row1_col1","col2":"row1_col2","col3":"row1_col3"}',
     '{"col1": "row2_col1","col2":"row2_col2","col3":"row2_col3"}',
     '{"col1": "row3_col1","col2":"row3_col2","col3":"row3_col3"}'])
sourceRDD = hc.jsonRDD(source)
sourceRDD.registerTempTable("temp_source")

# Row-conversion example, left commented out in the original:
"""
def convert(row):
    mydict = row.asDict()
    mydict["col1"] = mydict["col1"].upper()
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)
mytable = hc.inferSchema(convertRDD)
"""
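# A hedged sketch of how the conversion could be used end to end after
# uncommenting the block above, assuming Spark 1.x semantics (DataFrame.map
# delegates to the underlying RDD there, and inferSchema was later
# superseded by hc.createDataFrame). The names below are hypothetical.
mytable.registerTempTable("temp_converted")
hc.cacheTable("temp_converted")  # caching, in keeping with the app name
for row in hc.sql("select col1 from temp_converted").collect():
    print row.col1  # ROW1_COL1, ROW2_COL1, ROW3_COL1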
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_json")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize(
    ['{"col1": "row1_col1","col2":"row1_col2","col3":"row1_col3"}',
     '{"col1": "row2_col1","col2":"row2_col2","col3":"row2_col3"}',
     '{"col1": "row3_col1","col2":"row3_col2","col3":"row3_col3"}'])
table = hc.jsonRDD(source)
# registerAsTable() is the deprecated Spark 1.x spelling of registerTempTable().
table.registerTempTable("temp_mytable")
datas = hc.sql("select * from temp_mytable").collect()
sc.stop()

if datas:
    for data in datas:
        print data.col1, data.col2, data.col3
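# For reference, the same query can be expressed without a temp table via
# the DataFrame API (a minimal alternative sketch; select() and show() are
# available on DataFrames from Spark 1.3 onward, and this would have to run
# before sc.stop()):
table.select("col1", "col2", "col3").show()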
header_stores = ['store_id', 'avg_hhi', 'avg_traffic']
df_stores.repartition(1).write.parquet(df_stores_loc, mode='overwrite')
df_stores_join = df_stores.select(header_stores + ['group_val'])

# In[7]:

################# 2.1.3 sales data
## parsing the sales json files and constructing df_sales

# Build the list of dates between the two processed_time_d bounds, then glob
# over store IDs and those dates to select the raw sales files.
sales_dates = [(processed_time_d[0] + timedelta(i + 1)).strftime('%Y_%m_%d')
               for i in range((processed_time_d[1] - processed_time_d[0]).days)]
sales_raw_file_name = (sales_and_pc_raw_d_loc + 'sales_store[0-9]*_{' +
                       ','.join(sales_dates) + '}_00_00_00.json')
# Read each whole file as a single record, strip all whitespace so every
# file collapses to one JSON string, then let jsonRDD infer the schema.
sales_jsonRDD = sc.wholeTextFiles(sales_raw_file_name).map(lambda x: x[1])
sales_js = sales_jsonRDD.map(lambda x: re.sub(r"\s+", "", x, flags=re.UNICODE))
sales_js = sqlContext.jsonRDD(sales_js)
# Explode the nested Transactions array, then the Products array inside
# each transaction, ending with one row per product sold.
sales_js = sales_js.select(col('SalesLogDateTime'), col('StoreID'),
                           explode(col('Transactions')).alias('Transaction'))
sales_js = sales_js.select([
    'SalesLogDateTime', 'StoreID', 'Transaction.Products',
    'Transaction.Subtotal', 'Transaction.Tax', 'Transaction.Total',
    'Transaction.TransactionDateTime'
])
sales_js = sales_js.select(col('Subtotal'), col('StoreID'), col('Tax'),
                           col('SalesLogDateTime'), col('Total'),
                           col('TransactionDateTime'),
                           explode(col('Products')).alias('Product'))
sales_js = sales_js.select([
    'Subtotal', 'StoreID', 'Tax', 'SalesLogDateTime', 'Total',
    'TransactionDateTime', 'Product.Price', 'Product.ProductID'
])
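# With one row per product sold, per-store aggregates become one-liners. A
# hypothetical follow-up sketch (sales_per_store is not a name from the
# original notebook):
from pyspark.sql.functions import countDistinct, sum as sum_

sales_per_store = sales_js.groupBy('StoreID').agg(
    sum_('Price').alias('product_revenue'),
    countDistinct('TransactionDateTime').alias('num_transactions'))
sales_per_store.show()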
# values output: Row(_c0=u'1', _c1=u'2', _c2=u'3.0') -- every element was
# inferred as a string; the element types inside a JSON array must be
# consistent, otherwise an exception can be raised.
"""
source = sc.parallelize(['{"key" : [1, 2, 3.0]}'])
jsonRDD = hc.jsonRDD(source)
jsonRDD.registerTempTable("temp_table")
values = hc.sql("select key[0], key[1], key[2] from temp_table").collect()
# values output: Row(_c0=1.0, _c1=2.0, _c2=3.0) -- every element was
# inferred as a float.
"""

source = sc.parallelize([
    '{"key" : [{"key1" : "value1", "key2" : [1, 2, 3], "key3" : [{"key4" : "value4", "key5" : [4, 5.0, 6]}]}]}'
])
jsonRDD = hc.jsonRDD(source)
jsonRDD.registerTempTable("temp_table")
# Nested structs and arrays are addressed with dotted paths and indexes.
values = hc.sql(
    "select key[0].key1, key[0].key2[0], key[0].key3[0].key4, "
    "key[0].key3[0].key5[1] from temp_table"
).collect()
sc.stop()

for value in values:
    print value
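# When the inferred types are surprising, printing the schema makes them
# explicit. A small sketch (it would have to run before sc.stop()); the
# exact types are an assumption based on Spark 1.x behaviour, where integer
# JSON arrays come out as bigint and mixed int/float arrays widen to double:
df = hc.jsonRDD(source)
df.printSchema()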