def packageRcomItemCF(req_user, datetime):
    ''' Evaluate the recommendation results for every user listed in a CSV file.

        Each row is assigned a province (the first two characters of the
        province column, or '全国' when that prefix is not in the province
        set). The row's questions are resolved with evalSubKpo and grouped by
        subject with subjDic. For every subject the recommended set and the
        evaluation set are loaded and, when both are non-empty, the
        precision/recall pairs returned by EvaluatSubKpoint and
        EvaluatKnowPoint are printed.

        @param req_user     user CSV file under USER_PATH
                            (columns read: user_id, province, question)
        @param datetime     time-range string used in directory and file names
    '''
    known_provinces = getProvinceSet()

    # Remove a leftover output file of the same name before starting.
    stale_output = USER_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
    if os.path.exists(stale_output):
        os.remove(stale_output)

    with open(USER_PATH + req_user, 'r', encoding='gbk') as user_file:
        for row in csv.DictReader(user_file):
            province = row['province'][:2]
            if province not in known_provinces:
                province = '全国'

            user_id = row['user_id']
            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            questions = eval(row['question'])

            kn_kpo = evalSubKpo(mode=0,
                                data=questions,
                                table='question_simhash_20171111')

            # Nothing was found for this user's questions: report and move on.
            if len(kn_kpo) == 0:
                logging.error(
                    "用户{}传入的question_id在表question_simhash_20171111里查询不到!".format(user_id))
                continue

            for ks, vs in subjDic(kn_kpo).items():
                # All four inputs live under <root>/<datetime>/<province>/.
                sub_dir = datetime + '/' + province + '/'
                ana_icf = ANA_ICF_PATH + sub_dir + ana_iCF_file.format(province, ks, datetime)
                ana_fpg = ANA_FPG_PATH + sub_dir + ana_fpg_file.format(province, ks, datetime)
                eval_icf = EVA_ICF_PATH + sub_dir + eva_iCF_file.format(province, ks, datetime)
                eval_fpg = EVA_FPG_PATH + sub_dir + eva_fpg_file.format(province, ks, datetime)

                recommended = getRecomSet(ana_icf, ana_fpg, vs, user_id)
                expected = getEvalSet(eval_icf, eval_fpg, vs, user_id)

                if len(recommended) > 0 and len(expected) > 0:
                    sub_precision, sub_recall = EvaluatSubKpoint(recommended, expected, ks)
                    know_precision, know_recall = EvaluatKnowPoint(recommended, expected, ks)
                    print(sub_precision, sub_recall)
                    print(know_precision, know_recall)
                    print("# # # " * 20)
                else:
                    logging.error("至少有一个路径下没有文件,可能是datetimes、prov、subj的问题!")

                # Logged for every subject, whether or not it could be scored.
                logging.info("已经评测完用户{0}年级科目{1}!".format(user_id, ks))
예제 #2
0
def packageKSimhash(requir_user_file, thsold):
    ''' Reduce the province/subject data to the records that are close to each user.

        For every user row the questions are looked up with tableToJson and
        grouped by subject key. For each subject, every line of the matching
        province/subject file is compared with the user's values through
        K_Simhash; lines whose distance is strictly below ``thsold`` are
        written as JSON, one per line, to the output file under K_SIMH_PATH.
        An existing output file for the same province/subject is removed
        first, and no output file is created when no line qualifies.

        @param requir_user_file     user CSV file under USER_PATH
                                    (columns read: province, question)
        @param thsold               threshold; a record is kept when its
                                    K_Simhash distance is smaller than this value
    '''
    prov_set = getProvinceSet()
    with open(USER_PATH + requir_user_file, 'r') as user_file:
        for row in csv.DictReader(user_file):
            prov = row['province'][:2]
            if prov not in prov_set:
                prov = '全国'

            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            question_id = eval(row['question'])

            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) == 0:
                continue

            # Group the first field of each result under its (stringified)
            # second field, which is used as the subject key.
            subj_dic = {}
            for sub_kpoint in question_list:
                subj_dic.setdefault(str(sub_kpoint[1]), []).append(sub_kpoint[0])

            for ks, vs in subj_dic.items():
                ps_file = (PROV_SUB_PATH + datetime + '/' + prov + '/' +
                           prov_subj_file.format(prov, ks, datetime))
                if not os.path.exists(ps_file):
                    print(u"没有在Prov_Sub_input中查询到生成的(省份-学科)文件!")
                    continue

                out_dir = K_SIMH_PATH + datetime + '/' + prov
                mkdir(out_dir)

                out_path = out_dir + '/' + output_file.format(prov, ks, datetime)
                if os.path.exists(out_path):
                    os.remove(out_path)

                # The output file is opened lazily, on the first match, so an
                # input with no match still produces no file; it is then kept
                # open instead of being reopened for every matching line.
                txt_file = None
                try:
                    with open(ps_file, 'r') as ksimhash_file:
                        for line in ksimhash_file:
                            # Parse each line once and reuse the value.
                            # NOTE(review): assumes K_Simhash does not mutate
                            # its ``ksimhash`` argument - confirm.
                            record = eval(line)
                            if K_Simhash(ksimhash=record, vs=vs) < thsold:
                                if txt_file is None:
                                    txt_file = open(out_path, 'a')
                                txt_file.write(json.dumps(record) + '\n')
                finally:
                    if txt_file is not None:
                        txt_file.close()
예제 #3
0
def packageRcomItemCF(requir_user_file, diff, output_file):
    ''' Build per-user, per-subject recommendations from ItemCF and FPGrowth output.

        For each user row the questions are resolved with tableToJson and
        grouped by subject key. Per subject, candidates are collected from
        the ItemCF file (the two highest-weighted neighbours of every entry
        whose key is among the user's values) and from the FPGrowth file
        (through getRecomFPGth). The two candidate sets are combined, turned
        into question ids with getQuestionId and appended to the output CSV
        as ``[user_id, subject, recom_list]``.

        Either input file may be missing; the subject is skipped only when
        both are absent.

        @param requir_user_file     user CSV file under USER_PATH
                                    (columns read: user_id, province, question)
        @param diff                 difficulty coefficient, int; a falsy value becomes 1
        @param output_file          output file name template, formatted with the
                                    module-level ``datetime`` value and today's date
    '''
    prov_set = getProvinceSet()
    diff = diff or 1

    # Start from a clean output file.
    filename = USER_PATH + output_file.format(datetime,
                                              time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)

    with open(USER_PATH + requir_user_file, 'r', encoding='gbk') as user_file:
        for row in csv.DictReader(user_file):
            prov = row['province'][:2]
            if prov not in prov_set:
                prov = '全国'

            user_id = row['user_id']
            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            question_id = eval(row['question'])

            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) == 0:
                logging.error(
                    u"用户{}传入的question_id在表question_simhash_20171111里查询不到!".
                    format(user_id))
                continue

            # Group the first field of each result under its (stringified)
            # second field, which is used as the subject key.
            subj_dic = {}
            for sub_kpoint in question_list:
                subj_dic.setdefault(str(sub_kpoint[1]), []).append(sub_kpoint[0])

            for ks, vs in subj_dic.items():
                itemcf_file = (ITEMCF_PATH + datetime + '/' + prov + '/' +
                               itemCF_file.format(prov, ks, datetime))
                fpg_file = (FPGTH_PATH + datetime + '/' + prov + '/' +
                            fpgth_output_file.format(prov, ks, datetime))

                has_itemcf = os.path.exists(itemcf_file)
                has_fpg = os.path.exists(fpg_file)
                if not (has_itemcf or has_fpg):
                    logging.error(
                        u"用户{0}在年级科目{1}下没有查询到ItemCF和FPGrowth输出表".format(
                            user_id, ks))
                    continue

                recom_set_itemCF = set()
                recom_set_fpg = set()

                # Each file is read only if it exists; previously both were
                # opened whenever at least one existed, which raised
                # FileNotFoundError for the missing one.
                if has_itemcf:
                    logging.info(
                        u"正在读取{0}省份年級学科{1}下的Analy下的ItemCF数据!".format(
                            prov, ks))
                    with open(itemcf_file, 'r') as recom_file:
                        for line in recom_file:
                            record = eval(line)  # parsed once per line
                            first_key = list(record.keys())[0]
                            if first_key not in vs:
                                continue
                            # Keep the two neighbours with the largest weight,
                            # ignoring ones the user already has.
                            top_two = sorted(record[first_key].items(),
                                             key=itemgetter(1),
                                             reverse=True)[:2]
                            for j, _wj in top_two:
                                if j not in vs:
                                    recom_set_itemCF.add(j)

                if has_fpg:
                    logging.info(
                        u"正在读取{0}省份年级学科{1}下的Analy下的FPGrowth数据!".format(
                            prov, ks))
                    with open(fpg_file, 'r') as recom_file:
                        for result in recom_file:
                            recom_set_fpg.add(
                                getRecomFPGth(
                                    result.replace('\n', '').split('--'),
                                    set(vs)))

                # Combine the ItemCF and FPGrowth results.
                if len(recom_set_itemCF) == 0 or len(recom_set_fpg) == 0:
                    # At most one algorithm produced something: use it, or
                    # fall back to the user's own values when neither did.
                    recom_set = recom_set_fpg or recom_set_itemCF
                    if len(recom_set) == 0:
                        logging.info(
                            u"用户{0},在年级科目{1}下没有通过FPGrowth和ItemCF得到推荐结果!"
                            .format(user_id, ks))
                        recom_set = set(vs)
                else:
                    # NOTE(review): the log text below says "union" (并集) but
                    # the code takes the intersection - confirm which is meant.
                    recom_set = recom_set_fpg & recom_set_itemCF
                    if len(recom_set) <= len(set(vs)):
                        logging.info(
                            u"用户{0},在年级科目{1}下,取并集没有达到一定数量!".format(
                                user_id, ks))
                        recom_set = recom_set_itemCF or recom_set_fpg

                recom_list = getQuestionId(
                    table='question_simhash_20171111',
                    question_id=question_id,
                    recom_set=recom_set,
                    diff=diff)

                # newline='' lets the csv module control line endings itself.
                with open(filename, 'a', newline='') as csvfile:
                    csv.writer(csvfile).writerow([user_id, ks, recom_list])

                logging.info(u"已经解析完用户{}!".format(user_id))
예제 #4
0
def packageRcomApri(requir_user_file, diff, output_file):
    ''' Build per-user recommendations from the Apriori output files.

        For each user row the questions are resolved with tableToJson and
        grouped by subject key. The Apriori file of every subject is read and
        each row's unnamed ('') column is split on '--' and passed to
        getRecomApri; the results of all subjects are pooled into one set,
        converted to question ids with getQuestionId and appended to the
        output CSV as ``[user_id, recom_list]``.

        @param requir_user_file     user CSV file under USER_PATH
                                    (columns read: user_id, province, question)
        @param diff                 difficulty coefficient, int; a falsy value becomes 1
        @param output_file          output file name template, formatted with the
                                    module-level ``datetime`` value and today's date
    '''
    prov_set = getProvinceSet()
    diff = diff or 1

    with open(USER_PATH + requir_user_file, 'r') as user_file:
        for row in csv.DictReader(user_file):
            prov = row['province'][:2]
            if prov not in prov_set:
                prov = '全国'

            user_id = row['user_id']
            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            question_id = eval(row['question'])

            question_list = tableToJson(
                table='question_simhash_20171111',
                question_id=question_id
            )
            if len(question_list) == 0:
                print(u"用户{}传入的question_id在表**question_simhash_20171111**里查询不到!".format(user_id))
                continue

            # Group the first field of each result under its (stringified)
            # second field, which is used as the subject key.
            subj_dic = {}
            for sub_kpoint in question_list:
                subj_dic.setdefault(str(sub_kpoint[1]), []).append(sub_kpoint[0])

            # Candidates are pooled across all of the user's subjects.
            recom_set = set()
            for ks, vs in subj_dic.items():
                apri_file = (APRIORI_PATH + datetime + '/' + prov + '/' +
                             apriori_output_file.format(prov, ks, datetime))
                if not os.path.exists(apri_file):
                    print(u"没有查询到生成相关的Apriori输出表!")
                    continue

                logging.info(u"正在读取{0}省份下{1}学科的Analy下的Apriori数据!".format(prov, ks))
                with open(apri_file, 'r') as recom_file:
                    for result in csv.DictReader(recom_file):
                        recom_set.add(getRecomApri(result[''].split('--'), set(vs)))

            recom_list = getQuestionId(
                table='question_simhash_20171111',
                question_id=question_id,
                recom_set=recom_set,
                diff=diff
            )

            # newline='' lets the csv module control line endings itself.
            out_path = RECOM_APRI_PATH + output_file.format(datetime, time.strftime("%Y%m%d"))
            with open(out_path, 'a', newline='') as csvfile:
                csv.writer(csvfile).writerow([user_id, recom_list])

            # The template has two placeholders; it used to be formatted with
            # ``prov`` only, which raised IndexError. The subject keys handled
            # for this user fill the second one.
            logging.info(u"已经解析完{0}省份下{1}学科的数据,并存入到Prov_Sub_input文件!".format(
                prov, ','.join(subj_dic)))
예제 #5
0
                for subj in sub_dic.keys():
                    if len(list(sub_dic.values())[0]) > 3:
                        with open(P_S_PATH + '/' + prov_subj_file.format(prov, subj, datetime), 'a') as new_file:
                            new_file.writelines(json.dumps(list(sub_dic.values())[0]) + '\n')

            else:
                break

        logging.info(u"已经解析完{0}省份下{1}学科的数据,并存入到Prov_Sub_input文件!".format(prov, subj))
        new_file.close()
        sub_kpoint_file.close()


if __name__ == '__main__':
    prov_set = getProvinceSet()
    # prov_set = {'青海'}
    subj_set = {str(j) for j in range(1, 11)} | {str(j) for j in range(21, 31)} | {str(j) for j in range(41, 51)}
    pool = Pool(3)

    for prov in prov_set:
        P_S_PATH = PROV_SUB_PATH + datetime + '/' + prov
        mkdir(P_S_PATH)
        logging.info("the classify the subject ")

        for subj in subj_set:
            logging.info(u"正在读取{0}省份下{1}学科的Sub_kpoint_input文件".format(prov, subj))

            pool.apply_async(packClassSub, kwds={
                "prov":prov,
                "subj":subj,
예제 #6
0
def packageRcomFPGth(requir_user_file, output_file, diff):
    ''' Build per-user, per-subject recommendations from the FPGrowth output files.

        For each user row the questions are resolved with tableToJson and
        grouped by subject key. Per subject, every line of the FPGrowth file
        is split on '--' and passed to getRecomFPGth; the collected results
        are converted to question ids with getQuestionId and appended to the
        output CSV as ``[user_id, subject, recom_list]``.

        @param requir_user_file     user CSV file under USER_PATH
                                    (columns read: user_id, province, question)
        @param output_file          output file name template, formatted with the
                                    module-level ``datetime`` value and today's date
        @param diff                 difficulty coefficient, int; a falsy value becomes 1
    '''
    prov_set = getProvinceSet()
    diff = diff or 1

    # Start from a clean output file.
    filename = RECOM_FPGTH_PATH + output_file.format(datetime,
                                                     time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)

    with open(USER_PATH + requir_user_file, 'r') as user_file:
        for row in csv.DictReader(user_file):
            prov = row['province'][:2]
            if prov not in prov_set:
                prov = '全国'

            user_id = row['user_id']
            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            question_id = eval(row['question'])

            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) == 0:
                print(
                    u"用户{}传入的question_id在表**question_simhash_20171111**里查询不到!".
                    format(user_id))
                continue

            # Group the first field of each result under its (stringified)
            # second field, which is used as the subject key.
            subj_dic = {}
            for sub_kpoint in question_list:
                subj_dic.setdefault(str(sub_kpoint[1]), []).append(sub_kpoint[0])

            for ks, vs in subj_dic.items():
                fpg_file = (FPGTH_PATH + datetime + '/' + prov + '/' +
                            fpgth_output_file.format(prov, ks, datetime))
                if not os.path.exists(fpg_file):
                    print(u"没有查询到生成相关的FPGrowth输出表!")
                    continue

                logging.info(
                    u"正在读取{0}省份下{1}学科的Analy下的FPGrowth数据!".format(
                        prov, ks))

                # A fresh set per subject: one row is written per subject, so
                # candidates from a previously processed subject must not
                # carry over into this row.
                recom_set = set()
                with open(fpg_file, 'r') as recom_file:
                    for result in recom_file:
                        recom_set.add(
                            getRecomFPGth(
                                result.replace('\n', '').split('--'),
                                set(vs)))

                recom_list = getQuestionId(
                    table='question_simhash_20171111',
                    question_id=question_id,
                    recom_set=recom_set,
                    diff=diff)

                # newline='' lets the csv module control line endings itself.
                with open(filename, 'a', newline='') as csvfile:
                    csv.writer(csvfile).writerow([user_id, ks, recom_list])

                logging.info(u"已经解析完用户{}!".format(user_id))
예제 #7
0
def packageRcomItemCF(requir_user_file, diff, output_file):
    ''' Build per-user, per-subject recommendations from the ItemCF output files.

        For each user row the questions are resolved with tableToJson and
        grouped by subject key. Per subject, every line of the ItemCF file is
        parsed; when its first key is among the user's values, the two
        highest-weighted neighbours that the user does not already have are
        collected. If nothing is collected the user's own values are used.
        The result is converted to question ids with getQuestionId and
        appended to the output CSV as ``[user_id, subject, recom_list]``.

        @param requir_user_file     user CSV file under USER_PATH
                                    (columns read: user_id, province, question)
        @param diff                 difficulty coefficient, int; a falsy value becomes 1
        @param output_file          output file name template, formatted with the
                                    module-level ``datetime`` value and today's date
    '''
    prov_set = getProvinceSet()
    diff = diff or 1

    # Start from a clean output file.
    filename = RECOM_ITEMCF_PATH + output_file.format(datetime,
                                                      time.strftime("%Y%m%d"))
    if os.path.exists(filename):
        os.remove(filename)

    with open(USER_PATH + requir_user_file, 'r') as user_file:
        for row in csv.DictReader(user_file):
            prov = row['province'][:2]
            if prov not in prov_set:
                prov = '全国'

            user_id = row['user_id']
            # NOTE(review): eval() executes whatever expression the CSV cell
            # holds; consider ast.literal_eval if the file is not fully trusted.
            question_id = eval(row['question'])

            question_list = tableToJson(table='question_simhash_20171111',
                                        question_id=question_id)
            if len(question_list) == 0:
                print(
                    u"用户{}传入的question_id在表**question_simhash_20171111**里查询不到!".
                    format(user_id))
                continue

            # Group the first field of each result under its (stringified)
            # second field, which is used as the subject key.
            subj_dic = {}
            for sub_kpoint in question_list:
                subj_dic.setdefault(str(sub_kpoint[1]), []).append(sub_kpoint[0])

            for ks, vs in subj_dic.items():
                itemcf_file = (ITEMCF_PATH + datetime + '/' + prov + '/' +
                               itemCF_file.format(prov, ks, datetime))
                if not os.path.exists(itemcf_file):
                    print(u"没有查询到生成相关的ItemCF输出表!")
                    continue

                logging.info(
                    u"正在读取{0}省份下{1}学科的Analy下的ItemCF数据!".format(
                        prov, ks))

                recom_set = set()
                with open(itemcf_file, 'r') as recom_file:
                    for line in recom_file:
                        record = eval(line)  # parsed once per line
                        first_key = list(record.keys())[0]
                        if first_key not in vs:
                            continue
                        # Keep the two neighbours with the largest weight,
                        # ignoring ones the user already has.
                        top_two = sorted(record[first_key].items(),
                                         key=itemgetter(1),
                                         reverse=True)[:2]
                        for j, _wj in top_two:
                            if j not in vs:
                                recom_set.add(j)

                # No candidate found: fall back to the user's own values.
                if len(recom_set) == 0:
                    recom_set = set(vs)

                recom_list = getQuestionId(
                    table='question_simhash_20171111',
                    question_id=question_id,
                    recom_set=recom_set,
                    diff=diff)

                # newline='' lets the csv module control line endings itself.
                with open(filename, 'a', newline='') as csvfile:
                    csv.writer(csvfile).writerow([user_id, ks, recom_list])

                logging.info(u"已经解析完用户{}!".format(user_id))