def packageRcomItemCF(req_user, datetime):
    '''
    Package and evaluate the exam-question recommendation results.
    @param req_user raw data file (user_id, province, question)
    @param datetime time range
    '''
    prov_set = getProvinceSet()
    # output_file and USER_PATH are module-level settings
    filename = USER_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)
    with open(USER_PATH + req_user, 'r', encoding='gbk') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            # the question column stores a serialized list of question ids
            question_id = eval(reader['question'])
            kn_kpo = evalSubKpo(
                mode=0,
                data=question_id,
                table='question_simhash_20171111'
            )
            if len(kn_kpo) > 0:
                subj_dic = subjDic(kn_kpo)
                for ks, vs in subj_dic.items():
                    ana_icf = ANA_ICF_PATH + datetime + '/' + prov + '/' + ana_iCF_file.format(prov, ks, datetime)
                    ana_fpg = ANA_FPG_PATH + datetime + '/' + prov + '/' + ana_fpg_file.format(prov, ks, datetime)
                    eval_icf = EVA_ICF_PATH + datetime + '/' + prov + '/' + eva_iCF_file.format(prov, ks, datetime)
                    eval_fpg = EVA_FPG_PATH + datetime + '/' + prov + '/' + eva_fpg_file.format(prov, ks, datetime)
                    recom_set = getRecomSet(ana_icf, ana_fpg, vs, user_id)
                    eval_set = getEvalSet(eval_icf, eval_fpg, vs, user_id)
                    if len(recom_set) > 0 and len(eval_set) > 0:
                        Sprecision, Srecall = EvaluatSubKpoint(recom_set, eval_set, ks)
                        Kprecision, Krecall = EvaluatKnowPoint(recom_set, eval_set, ks)
                        print(Sprecision, Srecall)
                        print(Kprecision, Krecall)
                        print("# # # " * 20)
                    else:
                        logging.error("At least one of the paths has no file; check datetime, prov and subj!")
                    logging.info("Finished evaluating user {0}, grade-subject {1}!".format(user_id, ks))
            else:
                logging.error(
                    "The question_id passed in for user {} cannot be found in table question_simhash_20171111!".format(user_id))
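# --- Illustrative sketch (not from the source) ----------------------------------
# EvaluatSubKpoint / EvaluatKnowPoint are defined elsewhere in this repo; the
# helper below is only a minimal sketch of the precision/recall computation that
# evaluating a recommended set against a held-out set typically involves. The
# name _precision_recall_sketch and its signature are assumptions.
def _precision_recall_sketch(recom_set, eval_set):
    """Return (precision, recall) of recom_set measured against eval_set."""
    recom_set, eval_set = set(recom_set), set(eval_set)
    if not recom_set or not eval_set:
        return 0.0, 0.0
    hits = len(recom_set & eval_set)  # items both recommended and actually used
    return hits / len(recom_set), hits / len(eval_set)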
def packageKSimhash(requir_user_file, thsold):
    '''
    Reduce the dimensionality of the prov_sub data, using the user as the centroid.
    @param requir_user_file raw data file (user_id, province, question)
    @param thsold threshold; values below it count as useful, int
    '''
    prov_set = getProvinceSet()
    with open(USER_PATH + requir_user_file, 'r') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            question_id = eval(reader['question'])
            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) > 0:
                # group the question ids by subject: {subject: [question_id, ...]}
                subj_dic = {}
                recom_set = set([])
                for sub_kpoint in question_list:
                    if str(sub_kpoint[1]) not in subj_dic.keys():
                        subj_dic[str(sub_kpoint[1])] = [sub_kpoint[0]]
                    else:
                        subj_dic[str(sub_kpoint[1])].append(sub_kpoint[0])
                for ks, vs in subj_dic.items():
                    ps_file = PROV_SUB_PATH + datetime + '/' + prov + '/' + prov_subj_file.format(prov, ks, datetime)
                    if os.path.exists(ps_file):
                        PATH = K_SIMH_PATH + datetime + '/' + prov
                        mkdir(PATH)
                        if os.path.exists(PATH + '/' + output_file.format(prov, ks, datetime)):
                            os.remove(PATH + '/' + output_file.format(prov, ks, datetime))
                        k_simh_list = []
                        with open(ps_file, 'r') as ksimhash_file:
                            while True:
                                ksimhash = ksimhash_file.readline()
                                if ksimhash:
                                    sim_dis = K_Simhash(ksimhash=eval(ksimhash), vs=vs)
                                    # keep only records whose simhash distance is below the threshold
                                    if sim_dis < thsold:
                                        with open(PATH + '/' + output_file.format(prov, ks, datetime), 'a') as txt_file:
                                            txt_file.writelines(json.dumps(eval(ksimhash)))
                                            txt_file.write('\n')
                                else:
                                    break
                    else:
                        print(u"No generated (province-subject) file found under Prov_Sub_input!")
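# --- Illustrative sketch (not from the source) ----------------------------------
# K_Simhash is imported from elsewhere in this repo. As a rough illustration of a
# simhash-style comparison against the threshold `thsold`, the helper below
# computes the Hamming distance between two integer fingerprints. The name
# _hamming_distance_sketch is an assumption, not the repo's implementation.
def _hamming_distance_sketch(fp_a, fp_b):
    """Number of differing bits between two integer simhash fingerprints."""
    return bin(fp_a ^ fp_b).count('1')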
def packageRcomItemCF(requir_user_file, diff, output_file):
    '''
    Package the exam-question recommendation results.
    @param requir_user_file raw data file (user_id, province, question)
    @param diff difficulty coefficient, int
    @param output_file output file name
    '''
    prov_set = getProvinceSet()
    diff = diff or 1
    filename = USER_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)
    with open(USER_PATH + requir_user_file, 'r', encoding='gbk') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            question_id = eval(reader['question'])
            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) > 0:
                # group the question ids by subject: {subject: [question_id, ...]}
                subj_dic = {}
                recom_set = set([])
                for sub_kpoint in question_list:
                    if str(sub_kpoint[1]) not in subj_dic.keys():
                        subj_dic[str(sub_kpoint[1])] = [sub_kpoint[0]]
                    else:
                        subj_dic[str(sub_kpoint[1])].append(sub_kpoint[0])
                for ks, vs in subj_dic.items():
                    itemcf_file = ITEMCF_PATH + datetime + '/' + prov + '/' + itemCF_file.format(prov, ks, datetime)
                    fpg_file = FPGTH_PATH + datetime + '/' + prov + '/' + fpgth_output_file.format(prov, ks, datetime)
                    # both files are read below, so require both to exist
                    if os.path.exists(itemcf_file) and os.path.exists(fpg_file):
                        logging.info(u"Reading the ItemCF data under Analy for province {0}, grade-subject {1}!".format(prov, ks))
                        recom_set_itemCF = set([])
                        recom_set_fpg = set([])
                        with open(itemcf_file, 'r') as recom_file:
                            while True:
                                recom = recom_file.readline()
                                if recom:
                                    # each line evals to {question_id: {neighbour_id: weight, ...}}
                                    row = eval(recom)
                                    src_question = list(row.keys())[0]
                                    if src_question in vs:
                                        for j, wj in sorted(row[src_question].items(),
                                                            key=itemgetter(1), reverse=True)[:2]:
                                            if j in vs:
                                                continue
                                            recom_set_itemCF.add(j)
                                else:
                                    break
                        logging.info(u"Reading the FPGrowth data under Analy for province {0}, grade-subject {1}!".format(prov, ks))
                        with open(fpg_file, 'r') as recom_file:
                            while True:
                                result = recom_file.readline()
                                if result:
                                    recom_set_fpg.add(getRecomFPGth(result.replace('\n', '').split('--'), set(vs)))
                                    # print(recom_set)
                                else:
                                    break
                        # combine the ItemCF and FPGrowth recommendation results
                        if len(recom_set_itemCF) == 0 or len(recom_set_fpg) == 0:
                            recom_set = recom_set_fpg or recom_set_itemCF
                            if len(recom_set) == 0:
                                logging.info(u"User {0} got no recommendations from FPGrowth or ItemCF for grade-subject {1}!".format(user_id, ks))
                                recom_set = set(vs)
                        else:
                            recom_set = recom_set_fpg & recom_set_itemCF
                            if len(recom_set) <= len(set(vs)):
                                logging.info(u"For user {0}, grade-subject {1}, the intersection did not yield enough items!".format(user_id, ks))
                                recom_set = recom_set_itemCF or recom_set_fpg
                        recom_list = getQuestionId(
                            table='question_simhash_20171111',
                            question_id=question_id,
                            recom_set=recom_set,
                            diff=diff)
                        with open(filename, 'a') as csvfile:
                            writer = csv.writer(csvfile)
                            writer.writerow([user_id, ks, recom_list])
                        logging.info(u"Finished processing user {}!".format(user_id))
                    else:
                        logging.error(u"No ItemCF and FPGrowth output tables found for user {0}, grade-subject {1}".format(user_id, ks))
            else:
                logging.error(
                    u"The question_id passed in for user {} cannot be found in table question_simhash_20171111!".format(user_id))
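# --- Illustrative sketch (not from the source) ----------------------------------
# The combination rules above (fall back to the non-empty set, otherwise take the
# intersection, otherwise fall back again) are easier to read in isolation. The
# helper below is a hypothetical restatement of that logic, not a function used
# by this module.
def _combine_recom_sketch(itemcf_set, fpg_set, seen_questions):
    """Merge ItemCF and FPGrowth candidates the same way packageRcomItemCF does."""
    if not itemcf_set or not fpg_set:
        merged = fpg_set or itemcf_set
        # neither algorithm produced anything: fall back to the user's own questions
        return merged if merged else set(seen_questions)
    merged = fpg_set & itemcf_set
    if len(merged) <= len(set(seen_questions)):
        # the intersection is too small: prefer whichever single-algorithm set is non-empty
        merged = itemcf_set or fpg_set
    return merged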
def packageRcomApri(requir_user_file, diff, output_file):
    '''
    Package the exam-question recommendation results.
    @param requir_user_file raw data file (user_id, province, question)
    @param diff difficulty coefficient, int
    @param output_file output file name
    '''
    prov_set = getProvinceSet()
    diff = diff or 1
    # print(os.path.getsize())
    with open(USER_PATH + requir_user_file, 'r') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            question_id = eval(reader['question'])
            question_list = tableToJson(
                table='question_simhash_20171111',
                question_id=question_id
            )
            if len(question_list) > 0:
                # group the question ids by subject: {subject: [question_id, ...]}
                subj_dic = {}
                recom_set = set([])
                for sub_kpoint in question_list:
                    if str(sub_kpoint[1]) not in subj_dic.keys():
                        subj_dic[str(sub_kpoint[1])] = [sub_kpoint[0]]
                    else:
                        subj_dic[str(sub_kpoint[1])].append(sub_kpoint[0])
                for ks, vs in subj_dic.items():
                    apri_file = APRIORI_PATH + datetime + '/' + prov + '/' + apriori_output_file.format(prov, ks, datetime)
                    if os.path.exists(apri_file):
                        logging.info(u"Reading the Apriori data under Analy for province {0}, subject {1}!".format(prov, ks))
                        with open(apri_file, 'r') as recom_file:
                            results = csv.DictReader(recom_file)
                            for result in results:
                                # the Apriori rows are read under the empty-string column key
                                recom_set.add(getRecomApri(result[''].split('--'), set(vs)))
                    else:
                        print(u"No generated Apriori output table found!")
                del subj_dic
                recom_list = getQuestionId(
                    table='question_simhash_20171111',
                    question_id=question_id,
                    recom_set=recom_set,
                    diff=diff
                )
                with open(RECOM_APRI_PATH + output_file.format(datetime, time.strftime("%Y%m%d")), 'a') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([user_id, recom_list])
                logging.info(u"Finished processing user {} and wrote the Apriori recommendations!".format(user_id))
            else:
                print(u"The question_id passed in for user {} cannot be found in table question_simhash_20171111!".format(user_id))
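# --- Illustrative sketch (not from the source) ----------------------------------
# getRecomApri is defined elsewhere in this repo; it receives an association rule
# whose items are joined with '--'. The helper below is only a hypothetical sketch
# of that idea: if every antecedent item is among the user's own question ids,
# recommend the consequent. Its name and return convention are assumptions.
def _recom_from_rule_sketch(rule_items, user_questions):
    """rule_items: list like ['q1', 'q2', 'q3'] where the last item is the consequent."""
    antecedent, consequent = rule_items[:-1], rule_items[-1]
    if set(antecedent) <= set(user_questions) and consequent not in user_questions:
        return consequent
    return None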
    for subj in sub_dic.keys():
        if len(list(sub_dic.values())[0]) > 3:
            with open(P_S_PATH + '/' + prov_subj_file.format(prov, subj, datetime), 'a') as new_file:
                new_file.writelines(json.dumps(list(sub_dic.values())[0]) + '\n')
        else:
            break
        logging.info(u"Finished parsing the data for province {0}, subject {1} and saved it to the Prov_Sub_input file!".format(prov, subj))
    sub_kpoint_file.close()


if __name__ == '__main__':
    prov_set = getProvinceSet()
    # prov_set = {'青海'}
    subj_set = {str(j) for j in range(1, 11)} | {str(j) for j in range(21, 31)} | {str(j) for j in range(41, 51)}
    pool = Pool(3)
    for prov in prov_set:
        P_S_PATH = PROV_SUB_PATH + datetime + '/' + prov
        mkdir(P_S_PATH)
        logging.info("classifying the data by subject")
        for subj in subj_set:
            logging.info(u"Reading the Sub_kpoint_input file for province {0}, subject {1}".format(prov, subj))
            pool.apply_async(packClassSub, kwds={
                "prov": prov,
                "subj": subj,
def packageRcomFPGth(requir_user_file, output_file, diff):
    '''
    Package the exam-question recommendation results.
    @param requir_user_file raw data file (user_id, province, question)
    @param output_file output file name
    @param diff difficulty coefficient, int
    '''
    prov_set = getProvinceSet()
    diff = diff or 1
    filename = RECOM_FPGTH_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)
    with open(USER_PATH + requir_user_file, 'r') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            question_id = eval(reader['question'])
            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) > 0:
                # group the question ids by subject: {subject: [question_id, ...]}
                subj_dic = {}
                recom_set = set([])
                for sub_kpoint in question_list:
                    if str(sub_kpoint[1]) not in subj_dic.keys():
                        subj_dic[str(sub_kpoint[1])] = [sub_kpoint[0]]
                    else:
                        subj_dic[str(sub_kpoint[1])].append(sub_kpoint[0])
                for ks, vs in subj_dic.items():
                    fpg_file = FPGTH_PATH + datetime + '/' + prov + '/' + fpgth_output_file.format(prov, ks, datetime)
                    if os.path.exists(fpg_file):
                        logging.info(u"Reading the FPGrowth data under Analy for province {0}, subject {1}!".format(prov, ks))
                        with open(fpg_file, 'r') as recom_file:
                            while True:
                                result = recom_file.readline()
                                if result:
                                    recom_set.add(getRecomFPGth(result.replace('\n', '').split('--'), set(vs)))
                                else:
                                    break
                        recom_list = getQuestionId(
                            table='question_simhash_20171111',
                            question_id=question_id,
                            recom_set=recom_set,
                            diff=diff)
                        with open(filename, 'a') as csvfile:
                            writer = csv.writer(csvfile)
                            writer.writerow([user_id, ks, recom_list])
                        logging.info(u"Finished processing user {}!".format(user_id))
                    else:
                        print(u"No generated FPGrowth output table found!")
            else:
                print(u"The question_id passed in for user {} cannot be found in table question_simhash_20171111!".format(user_id))
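# --- Illustrative sketch (not from the source) ----------------------------------
# The packaged result file holds one row per (user, grade-subject): user_id, ks,
# recom_list. The hypothetical reader below shows how such a row-oriented CSV
# could be loaded back; the function name and the literal_eval of the list column
# are assumptions about the on-disk format.
def _load_recom_rows_sketch(path):
    import csv
    from ast import literal_eval
    rows = []
    with open(path, 'r') as f:
        for user_id, ks, recom_list in csv.reader(f):
            rows.append((user_id, ks, literal_eval(recom_list)))
    return rows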
def packageRcomItemCF(requir_user_file, diff, output_file):
    '''
    Package the exam-question recommendation results.
    @param requir_user_file raw data file (user_id, province, question)
    @param diff difficulty coefficient, int
    @param output_file output file name
    '''
    prov_set = getProvinceSet()
    diff = diff or 1
    filename = RECOM_ITEMCF_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)
    with open(USER_PATH + requir_user_file, 'r') as user_file:
        readers = csv.DictReader(user_file)
        for reader in readers:
            prov = reader['province'][:2]
            if prov not in prov_set:
                prov = '全国'
            user_id = reader['user_id']
            question_id = eval(reader['question'])
            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) > 0:
                # group the question ids by subject: {subject: [question_id, ...]}
                subj_dic = {}
                recom_set = set([])
                for sub_kpoint in question_list:
                    if str(sub_kpoint[1]) not in subj_dic.keys():
                        subj_dic[str(sub_kpoint[1])] = [sub_kpoint[0]]
                    else:
                        subj_dic[str(sub_kpoint[1])].append(sub_kpoint[0])
                for ks, vs in subj_dic.items():
                    itemcf_file = ITEMCF_PATH + datetime + '/' + prov + '/' + itemCF_file.format(prov, ks, datetime)
                    if os.path.exists(itemcf_file):
                        logging.info(u"Reading the ItemCF data under Analy for province {0}, subject {1}!".format(prov, ks))
                        recom_set = set([])
                        with open(itemcf_file, 'r') as recom_file:
                            while True:
                                recom = recom_file.readline()
                                if recom:
                                    # each line evals to {question_id: {neighbour_id: weight, ...}}
                                    row = eval(recom)
                                    src_question = list(row.keys())[0]
                                    if src_question in vs:
                                        for j, wj in sorted(row[src_question].items(),
                                                            key=itemgetter(1), reverse=True)[:2]:
                                            if j in vs:
                                                continue
                                            recom_set.add(j)
                                else:
                                    break
                        if len(recom_set) == 0:
                            recom_set = set(vs)
                        recom_list = getQuestionId(
                            table='question_simhash_20171111',
                            question_id=question_id,
                            recom_set=recom_set,
                            diff=diff)
                        with open(filename, 'a') as csvfile:
                            writer = csv.writer(csvfile)
                            writer.writerow([user_id, ks, recom_list])
                        logging.info(u"Finished processing user {}!".format(user_id))
                    else:
                        print(u"No generated ItemCF output table found!")
            else:
                print(u"The question_id passed in for user {} cannot be found in table question_simhash_20171111!".format(user_id))
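# --- Illustrative sketch (not from the source) ----------------------------------
# Each line of the ItemCF file evals to a one-key dict mapping a question id to
# its weighted neighbours, e.g. {'q1': {'q7': 0.83, 'q9': 0.75, 'q2': 0.41}}.
# The hypothetical helper below isolates the "top-2 unseen neighbours" selection
# used above; its name and the example data are assumptions.
def _top_unseen_neighbours_sketch(itemcf_row, seen_questions, top_n=2):
    """Return up to top_n neighbour ids, highest weight first, skipping seen ones."""
    from operator import itemgetter
    source_item = list(itemcf_row.keys())[0]
    ranked = sorted(itemcf_row[source_item].items(), key=itemgetter(1), reverse=True)
    return [j for j, _w in ranked[:top_n] if j not in seen_questions]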