def calc_python_spark(ask_parm, crowd_id):
    D = ask_parm['D_F']
    F = ask_parm['F_F']
    F_Group = ask_parm['F_Group']
    option_dict, option_list = utils.option_dict_list(F)
    # Extract the dependent variable D alongside the factor parameters.
    collect_list = list(option_list)
    if D not in collect_list:
        collect_list.append(D)

    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    # Crowd membership: one row per (codeID, organID, examID) in the crowd.
    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))
    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    # Standard exam records: data_json carries the per-parameter values.
    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")
    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1], collect_list))
    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, data_json, examTime, rowKey, examID FROM base_standard")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID,
            crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]
    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(stdDataFrame.data_json)

    # Pivot the collected data_json maps into one value list per parameter,
    # padding with None where an exam lacks the parameter.
    rsDataFrameGArrs = {}
    for colKey in collect_list:
        rsDataFrameGArrs[colKey] = []
    for row in rsDataFrame.collect():
        data_json = row.data_json
        for colKey in collect_list:
            if colKey in data_json:
                rsDataFrameGArrs[colKey].append(data_json[colKey])
            else:
                rsDataFrameGArrs[colKey].append(None)

    # One grouped factor column per option, built from its F_Group definition.
    frame_Arrs = []
    for colKey in option_list:
        group_list = F_Group[colKey]
        frame_Arrs.append(get_df_group(colKey, group_list, rsDataFrameGArrs))

    # Assemble the pandas frame and the ANOVA formula D ~ F1 + F2 + ...
    frame = DataFrame()
    frame['D'] = rsDataFrameGArrs[D]
    formula_list = []
    i = 0
    for key in option_list:
        i = i + 1
        formula_list.append('F' + str(i))
        frame['F' + str(i)] = frame_Arrs[i - 1]
    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula

    anova_results = anova_lm(ols(formula, frame).fit())
    print "anova_results", anova_results
    import json
    result = {}
    result['1'] = anova_results.to_json()
    sc.stop()
    return json.dumps(result, ensure_ascii=False)

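# The ANOVA step above is statsmodels' formula API: ols() fits an
# ordinary-least-squares model from a formula string and anova_lm() returns
# the ANOVA table as a pandas DataFrame. A minimal self-contained sketch on
# invented toy data (the values are illustrative only, not pipeline output):
def _sketch_anova_formula():
    import pandas as pd
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    toy = pd.DataFrame({
        'D':  [5.1, 4.8, 6.2, 6.0, 7.1, 6.9],  # dependent variable
        'F1': [0, 0, 1, 1, 2, 2],               # first factor (group codes)
        'F2': [0, 1, 0, 1, 0, 1],               # second factor
    })
    table = anova_lm(ols("D ~ F1 + F2", toy).fit())
    print table            # columns: df, sum_sq, mean_sq, F, PR(>F)
    print table.to_json()  # the same serialization used for result['1']
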
def calc_python_spark(ask_parm, crowd_id):
    group1 = ask_parm["group1"]
    group2 = ask_parm["group2"]
    print 'group1', group1, ', group2', group2
    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict
    # "390" is the grouping parameter; appending it last lets the result
    # loop below stop before it.
    option_list.append("390")
    group1["parmId"] = "390"
    group2["parmId"] = "390"

    sc = Tsparkcore.createSparkContext("crowd_Pair_test")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))
    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")
    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row(kv[1], group1, option_list))
    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, examID, examTime, data_json, group_Type FROM base_standard"
    ).filter("group_Type=true")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID,
            crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]
    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(
        crowdDataFrame.codeID, crowdDataFrame.organID, crowdDataFrame.examID,
        stdDataFrame.examTime, stdDataFrame.data_json)

    # Split into the two groups on parameter "390", ordered by codeID and
    # examTime so the paired samples line up.
    rsDataFrameG1 = rsDataFrame.filter(stdDataFrame.data_json["390"] > 17) \
                               .orderBy(crowdDataFrame.codeID, stdDataFrame.examTime)
    rsDataFrameG2 = rsDataFrame.filter(stdDataFrame.data_json["390"] < 17) \
                               .orderBy(crowdDataFrame.codeID, stdDataFrame.examTime)
    rsDataFrameG1RDD = rsDataFrameG1.map(lambda p: p.data_json)
    rsDataFrameG2RDD = rsDataFrameG2.map(lambda p: p.data_json)
    rsDataFrameG1Arrs = get_key_arrs(rsDataFrameG1RDD, option_list)
    rsDataFrameG2Arrs = get_key_arrs(rsDataFrameG2RDD, option_list)
    print "rsDataFrameG1Arrs ", rsDataFrameG1Arrs
    print "rsDataFrameG2Arrs ", rsDataFrameG2Arrs

    result = {}
    for colKey in option_list:
        if colKey == "390":  # the grouping key itself is not a result column
            break
        name = option_dict[colKey]
        print "name =>", name
        result[name] = get_result_arrs(rsDataFrameG1Arrs[colKey],
                                       rsDataFrameG2Arrs[colKey])
    print result
    sc.stop()
    return json.dumps(result, ensure_ascii=False)

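# get_key_arrs is used above but defined elsewhere in the project. As an
# assumption, it pivots an RDD of data_json dicts into one value list per
# parameter key; the sketch below is a hypothetical reconstruction, not the
# project's actual helper.
def _sketch_get_key_arrs(data_json_rdd, option_list):
    arrs = dict((key, []) for key in option_list)
    for data_json in data_json_rdd.collect():
        for key in option_list:
            # None marks records that lack the key (assumed behaviour)
            arrs[key].append(data_json.get(key))
    return arrs
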
def calc_python_spark(ask_parm, crowd_id):
    # Parameters arrive as "id:name" pairs; keep only the ids.
    ask_parm['D_F'] = ask_parm['D_F'].split(":")[0]
    option_dict, option_list = utils.option_dict_list(ask_parm['F_F'])
    f_ids = [key.split(":")[0] for key in ask_parm['F_F'].split(",")]
    ask_parm['F_F'] = ','.join(f_ids)
    D = ask_parm['D_F']
    F = ask_parm['F_F']
    F_Arrs = F.split(',')
    print "option_dict", option_dict
    option_list.append(D)
    for arr in F.split(","):
        option_list.append(arr)

    # Hard-coded test groups on parameter 390.
    group1 = {'parmId': 390, 'parmVal': "12-14", 'parmType': 1}
    group2 = {'parmId': 390, 'parmVal': "12-14", 'parmType': 1}
    group_all = {"390": [group1, group2, group2]}

    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))
    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")
    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1], option_list))
    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, data_json, examTime, rowKey, examID FROM base_standard")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID,
            crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]
    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(
        crowdDataFrame.codeID, crowdDataFrame.organID, crowdDataFrame.examID,
        stdDataFrame.examTime, stdDataFrame.data_json)

    # Dependent variable from the first group, one factor column per
    # remaining group.
    rsDataFrameG1 = get_df_group(group_all["390"][0], rsDataFrame)
    rsDataFrameG2 = get_df_group(group_all["390"][1], rsDataFrame)
    rsDataFrameG3 = get_df_group(group_all["390"][2], rsDataFrame)

    frame = DataFrame()
    G1 = rsDataFrameG1.select(rsDataFrameG1.data_json["390"]).toPandas()
    G2 = rsDataFrameG2.select(rsDataFrameG2.data_json["390"]).toPandas()
    G3 = rsDataFrameG3.select(rsDataFrameG3.data_json["390"]).toPandas()
    frame['D'] = G1
    frame_Arrs = [G2, G3]

    formula_list = []
    i = 0
    for key in F_Arrs:
        i = i + 1
        formula_list.append('F' + str(i))
        frame['F' + str(i)] = frame_Arrs[i - 1]
    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula

    anova_results = anova_lm(ols(formula, frame).fit())
    print "anova_results", anova_results
    import json
    result = {}
    result['1'] = anova_results.to_json()
    sc.stop()
    return json.dumps(result, ensure_ascii=False)

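# get_df_group is defined elsewhere; judging from the group dicts above
# (parmId, parmVal like "12-14", parmType), it presumably filters
# rsDataFrame down to rows whose data_json[parmId] falls inside the group's
# value range. A hypothetical reconstruction under that assumption:
def _sketch_get_df_group(group, rsDataFrame):
    parm_id = str(group['parmId'])
    low, high = group['parmVal'].split('-')  # e.g. "12-14" -> 12 .. 14
    col = rsDataFrame.data_json[parm_id]
    return rsDataFrame.filter((col >= float(low)) & (col <= float(high)))
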
def calc_python_spark(ask_parm, crowd_id):
    # Hard-coded test parameters: D is the dependent variable id, F the
    # comma-separated factor ids.
    ask_parm['D'] = '9'
    ask_parm['F'] = '390,9'
    ask_parm['A'] = '390,9'
    ask_parm['C'] = '390,9'
    ask_parm['WLS'] = '390,9'
    D = ask_parm['D']
    F_Arrs = ask_parm['F'].split(',')
    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict

    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))
    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")
    output_data_rdd = data_rdd.map(
        lambda kv: Tsparkcore.map_std_Row_Two_Way_anova(kv[1], option_list))
    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, data_json, examTime, rowKey, examID, has_Type FROM base_standard"
    ).filter('has_Type=True')

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID,
            crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]
    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(
        crowdDataFrame.codeID, crowdDataFrame.organID, crowdDataFrame.examID,
        stdDataFrame.examTime, stdDataFrame.data_json)

    # Project D plus one aliased column per factor, then convert to pandas.
    select_cols = [rsDataFrame.data_json[D].alias('D')]
    formula_list = []
    i = 0
    for key in F_Arrs:
        i = i + 1
        formula_list.append('F' + str(i))
        select_cols.append(rsDataFrame.data_json[key].alias('F' + str(i)))
    df_D = rsDataFrame.select(select_cols).toPandas()
    print 'df_D', df_D

    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula
    anova_results = anova_lm(ols(formula, df_D).fit())
    print anova_results

    import json
    result = {}
    # Serialize the ANOVA table (a pandas DataFrame) before json.dumps.
    result['1'] = anova_results.to_json()
    sc.stop()
    return json.dumps(result, ensure_ascii=False)

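# The select-with-alias pattern above (data_json[key].alias('F1'), ...) is
# plain PySpark: indexing a MapType column yields a Column that can be
# renamed. A minimal standalone sketch with invented data:
def _sketch_map_column_select(sqlctx):
    rows = [{'data_json': {'9': 1.5, '390': 18.0}},
            {'data_json': {'9': 2.5, '390': 16.0}}]
    df = sqlctx.createDataFrame(rows)
    # One aliased column per parameter id, ready for toPandas()/ols().
    picked = df.select(df.data_json['9'].alias('D'),
                       df.data_json['390'].alias('F1'))
    picked.show()
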
def calc_python_spark(ask_parm, crowd_id):
    group1 = ask_parm["group1"]
    group2 = ask_parm["group2"]
    print 'group1', group1, ', group2', group2
    print "parmId", type(group1['parmId'])
    print "parmVal", type(group1['parmVal'])
    print "parmType", type(group1['parmType'])
    print "parmId", type(group2['parmId'])
    print "parmVal", type(group2['parmVal'])
    print "parmType", type(group2['parmType'])
    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict

    sc = Tsparkcore.createSparkContext("crowd_Pair_test")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))
    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")
    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1], option_list))
    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, examID, examTime, data_json FROM base_standard")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID,
            crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]
    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(
        crowdDataFrame.codeID, crowdDataFrame.organID, crowdDataFrame.examID,
        stdDataFrame.examTime, stdDataFrame.data_json)

    rsDataFrameG1 = get_df_group(group1, rsDataFrame)
    rsDataFrameG2 = get_df_group(group2, rsDataFrame)

    result = {}
    for colKey in option_list:
        print "colKey", colKey
        name = option_dict[colKey]
        print "name =>", name, ", colKey=>" + colKey
        # Compare the two groups on the current parameter column.
        G1 = rsDataFrameG1.select(rsDataFrameG1.data_json[colKey]).toPandas()
        G2 = rsDataFrameG2.select(rsDataFrameG2.data_json[colKey]).toPandas()
        result[name] = get_result_arrs(G1, G2)

    import json
    print result
    sc.stop()
    return json.dumps(result, ensure_ascii=False)

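# get_result_arrs is defined elsewhere; given the "crowd_Pair_test" context
# it presumably compares the two group samples. The sketch below assumes a
# paired t-test via scipy and is a hypothetical stand-in, not the project's
# actual helper.
def _sketch_get_result_arrs(g1_values, g2_values):
    from scipy import stats
    # A paired test needs equal-length samples; pair up to the shorter one.
    n = min(len(g1_values), len(g2_values))
    t_stat, p_value = stats.ttest_rel(g1_values[:n], g2_values[:n])
    return {'t': float(t_stat), 'p': float(p_value)}
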