from pyspark.sql import HiveContext


def sql_hive_context_example(spark):
    # Create a HiveContext object from the existing SparkContext.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.register` instead.
    # registerJavaFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead.

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
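A minimal driver for the walkthrough above, assuming Spark 2.x (where HiveContext still works but is deprecated in favor of a SparkSession with Hive support); the app name is illustrative:

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Hive support is required for the table-related calls in the example.
    spark = (SparkSession.builder
             .appName("hive_context_example")  # illustrative app name
             .enableHiveSupport()
             .getOrCreate())
    sql_hive_context_example(spark)
    spark.stop()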
import numpy

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row, functions
from pyspark.mllib.clustering import KMeans

import als  # project-local module providing the DataFrame schemas used below


class HqlSpark():
    # Initialize the Spark SQL context.
    def __init__(self, language='cn'):
        self.conf = SparkConf().setAppName("HqlSparkAPP").setSparkHome(
            "/home/spark").set("spark.driver.allowMultipleContexts", "true")
        self.sc = SparkContext(conf=self.conf)
        self.sql = HiveContext(self.sc)
        self.sql.sql('use anserchapp')
        self.curlan = language
        self.curDataWord = None
        # self.loadWord_state = False

    # Load the words for the current language.
    def getAllWord(self, language='cn'):
        if self.curlan == language and self.curDataWord:
            return
        self.curlan = language
        sql_s = 'select * from searchapp_%s limit 100' % self.curlan
        self.curDataWord = self.sql.sql(sql_s)

    def refreshTable(self, tableName):
        self.sql.refreshTable(tableName)

    # Create a table.
    def createTable(self, sql_sentence, tableName):
        # table_name = sql_sentence.split(' ')[2]
        self.sql.sql(sql_sentence)
        self.sql.refreshTable(tableName)
        print('create table success')

    # Insert data into a table.
    def insertData(self, sql_sentence):
        self.sql.sql(sql_sentence)

    # Insert data into a table from a data structure (tuple or list of lists).
    def insertDataFromStruct(self, data, tableName='searchapp_',
                             d_type='cn', state=False):
        # rdd = self.sc.parallelize(data)
        if d_type == '':
            in_data = self.sql.createDataFrame(data, als._categry_shame)
        elif d_type == 'hint':
            in_data = self.sql.createDataFrame(data, als.hintWord_shame)
            d_type = ''
        else:
            in_data = self.sql.createDataFrame(data, als.searchApp_shame)
        if state:
            in_data.saveAsTable(tableName=tableName + d_type,
                                source='metastore_db', mode='append')
        else:
            in_data.saveAsTable(tableName=tableName + d_type,
                                source='metastore_db', mode='overwrite')

    # Drop a table.
    def deleteDataFromTable(self, table='searchapp_', d_type='ch'):
        # Note: the DELETE statement is built but never executed; the
        # temp table is dropped instead.
        sql_sentence = 'delete from ' + table + d_type
        self.sql.dropTempTable(table + d_type)
        self.sql.refreshTable(table + d_type)

    def showTables(self):
        table_list = []
        tables = self.sql.sql('show tables').collect()
        for table in tables:
            table_list.append(table['tableName'])
        return table_list

    def getData(self, sql_hive):
        return self.sql.sql(sql_hive).collect()

    # Find hint words in the hintword table for the given input words.
    def selectHintWord(self, base_wordFr):
        hintWord = self.sql.sql('select word,hintWord from hintword')
        word = hintWord.join(base_wordFr,
                             hintWord.hintWord == base_wordFr.word,
                             'outer').select(hintWord.word).distinct()
        word_news = self.curDataWord.join(
            word, word.word == self.curDataWord.word,
            'outer').select(self.curDataWord.word, 'priority', 'searchApp',
                            'searchCount', 'genre').distinct()
        word_news = word_news.dropna(how='any')
        return word_news

    # Find words in searchapp by appId.
    def selectAppIdWord(self, appIds):
        result = None
        for appId in appIds:
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.searchapp, appId)
            ).select('word', 'priority', 'searchApp', 'searchCount',
                     'genre').distinct()
            result = res if result is None else result.unionAll(res)
        word = result.select('word')
        result = result.dropna(how='any')
        return result, word

    # Find words in searchApp by genre id.
    def selectGenreWord(self, genreIds):
        result = None
        for gId in genreIds:
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.genre, gId)
            ).select('word', 'priority', 'searchApp', 'searchCount',
                     'genre').distinct()
            result = res if result is None else result.unionAll(res)
        return result

    # Collect all words for analysis.
    def getAnalysisWords(self, appIds, genreIds):
        if appIds is None or len(appIds) <= 0:
            return None
        appWord, word = self.selectAppIdWord(appIds)
        genreWord = None
        thinkWord = None
        if genreIds is not None and len(genreIds) > 0:
            genreWord = self.selectGenreWord(genreIds)
        if word and word.count() > 0:
            thinkWord = self.selectHintWord(word)
        if appWord and genreWord and thinkWord:
            appWord = appWord.unionAll(genreWord)
            appWord = appWord.unionAll(thinkWord)
            return appWord.distinct()
        elif appWord and genreWord:
            return appWord.unionAll(genreWord).distinct()
        elif appWord and thinkWord:
            return appWord.unionAll(thinkWord).distinct()
        elif genreWord and thinkWord:
            return genreWord.unionAll(thinkWord).distinct()
        elif appWord:
            return appWord.distinct()
        elif genreWord:
            return genreWord.distinct()
        else:
            return thinkWord.distinct()

    # Build a word-by-genre indicator matrix.
    def buildMatrix(self, words):
        class_all = self.sql.sql(
            "select genreID from category order by genreID desc")
        c_genres = class_all.collect()
        genres = {}
        i = 0
        for c in c_genres:
            genres.setdefault(c.genreID, i)
            i += 1
        datas = words.select('genre').collect()
        mlength = len(c_genres)
        nlength = len(datas)
        Matrix = numpy.zeros((nlength, mlength))
        num = 0
        print(len(Matrix))
        for data in datas:
            for ge in data.genre:
                Matrix[num][genres.get(ge)] = 1
            num += 1
        return Matrix

    # Build the input matrix.
    def getInPut(self, appIds, genreIds):
        words = self.getAnalysisWords(appIds, genreIds)
        return self.buildMatrix(words)

    # k-means analysis.
    def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
        cluster_data = self.sc.parallelize(Matrix)
        trains = KMeans.train(cluster_data, Kcluster, MaxIterations, runs)
        results = trains.predict(cluster_data).collect()
        return results

    # Combine words with their cluster assignments.
    def combine_data(self, words=None, result=None):
        len_re = len(result)
        len_w = words.count()
        if len_re != len_w:
            print('word num:', len_w, 'is not equal to result num:', len_re)
            if len_re < len_w:
                words = self.sql.createDataFrame(words.take(len_re))
            else:
                result = result[0:len_w]
        print(words.count(), len(result))
        result = [str(x) for x in result]
        cluster_re = self.sc.parallelize(result, 1)
        re = words.rdd.map(list).repartition(1).zip(cluster_re).map(
            lambda p: Row(word=p[0][0], priority=int(p[0][1]),
                          searchcount=int(p[0][3]), cluster=p[1]))
        cluster_sha = self.sql.createDataFrame(re)
        # cluster_sha.show()
        return cluster_sha

    # Select representative words per cluster.
    def selectWord(self, cluster_sha, top_K=2):
        df = cluster_sha
        select_par = df.groupBy('cluster').agg({
            'searchcount': 'avg',
            'priority': 'avg'
        }).collect()
        ClusterNum = len(select_par)
        clusterWord = []
        for line in select_par:
            cluster_df = df.filter(df.cluster == line[0]).select(
                'word', 'priority', 'searchcount')
            ClassWord = cluster_df.filter(
                cluster_df.searchcount > line[1]).select('word', 'priority')
            ClassWord = ClassWord.filter(
                ClassWord.priority >= line[2]).select('*').limit(top_K)
            KeyWords = cluster_df.filter(
                cluster_df.searchcount < line[1]).select('word', 'priority')
            KeyWords = KeyWords.filter(
                KeyWords.priority >= line[2]).select('*').limit(top_K)
            cluster = {
                'cluster_id': line[0],
                'classWord': ClassWord.toJSON().collect(),
                'keyWord': KeyWords.toJSON().collect()
            }
            clusterWord.append(cluster)
        result = {'ClusterNum': ClusterNum, 'AllCluster': clusterWord}
        return result
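A hypothetical end-to-end run of HqlSpark, assuming the anserchapp Hive database and its searchapp_cn, hintword, and category tables exist; the app and genre IDs below are placeholders:

hql = HqlSpark(language='cn')
hql.getAllWord('cn')                               # load current-language words
words = hql.getAnalysisWords([414478124], [6002])  # placeholder app/genre IDs
matrix = hql.buildMatrix(words)                    # one row per word, one column per genre
labels = hql.spark_means(matrix, Kcluster=3)       # k-means cluster id per word
clustered = hql.combine_data(words=words, result=labels)
print(hql.selectWord(clustered, top_K=2))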
# Tail of the SELECT that builds the us_census DataFrame; the opening
# lines of the query (and the sqlContext.sql(""" call) precede this excerpt.
    REGION    as region,
    DIVISION  as division,
    STATE     as state,
    COUNTY    as county,
    COUNTYCC  as countycc,
    COUNTYSC  as countysc,
    COUSUB    as cousub,
    COUSUBCC  as cousubcc,
    COUSUBSC  as cousubsc,
    PLACE     as place,
    PLACECC   as placecc,
    AREALAND  as area_land,
    AREAWATR  as area_water,
    FUNCSTAT  as func_stat,
    POP100    as pop_count,
    HU100     as housing_unit,
    INTPTLAT  as latitude,
    INTPTLON  as longitude,
    STATENS   as state_ansi,
    COUNTYNS  as county_ansi,
    COUSUBNS  as county_sub_ansi,
    PLACENS   as place_ansi
FROM us_census_dataframe
""")

us_census.registerTempTable("us_census")
us_census.write.save(output_file, mode='overwrite')
sqlContext.dropTempTable("us_census_dataframe")
print("Completed")
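A quick sanity check one might run after the save above, assuming write.save kept its default Parquet output format:

# Reload the saved dataset and spot-check a few of the renamed columns.
check_df = sqlContext.read.load(output_file)  # Parquet by default
check_df.select('state', 'county', 'pop_count', 'latitude', 'longitude').show(5)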