示例#1
0
def get_fshl():
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    # sql_user = """SELECT id,sex FROM account5000"""
    sql_user = DataBaseQuery["dgl_user_info"]
    # sql_course = "select id , system_course_id ,course_name from course_info"
    # sql_course = """select id,classify_id from course5000"""
    sql_course = DataBaseQuery["feature_classify"]
    # sql_user = """select user_id from user_basic_info"""
    # sql_dr = """select * from course_dr5000"""
    sql_dr = DataBaseQuery["course_dr"]

    result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                               sql=sql_dr)
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                 sql=sql_user)
    print(result_dr)

    # 数据加载
    train_data = pd.DataFrame(list(result_dr))
    train_data.columns = ['user_id', 'item_id', 'rating']
    user_data = pd.DataFrame(list(result_user))
    item_data = pd.DataFrame(list(result_course))

    return train_data, user_data, item_data
示例#2
0
def get_bigraph():
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    sql_user = DataBaseQuery["dgl_user_info"]

    sql_course = DataBaseQuery["feature_classify"]

    sql_classify = DataBaseQuery["classify_info"]

    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                 sql=sql_user)
    result_classify = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                     sql=sql_classify)
    # 数据加载
    train_data = pd.read_csv('file_saved/toGcn.csv',
                             header=None,
                             names=['user_id', 'item_id', 'rating'])
    train_data.columns = ['user_id', 'item_id', 'rating']
    user_data = pd.DataFrame(list(result_user))
    item_data = pd.DataFrame(list(result_course))
    classify_data = pd.DataFrame(list(result_classify))
    classify_data.columns = [
        'id', 'course_name', 'classify_name', 'classify_id'
    ]

    return train_data, user_data, item_data, classify_data
示例#3
0
def getDataFromDB():
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    # sql_dr = """SELECT * FROM course_dr5000_changed"""
    sql_dr = DataBaseQuery["online_course_dr_changed"]
    # sql_course = "select id , system_course_id ,course_name from course_info"
    # sql_course = """select id, name from course5000"""
    sql_course = DataBaseQuery["course_info"]
    # sql_user = """select user_id from user_basic_info"""
    # sql_user = """select id from account5000"""
    sql_user = DataBaseQuery["user_id"]

    result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                               sql=sql_dr)
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    dbHandle.changeCloseFlag()
    result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                 sql=sql_user)

    print(
        "len(result_dr) = {}, len(result_user) = {},\
          len(result_course) = {}".format(len(result_dr), len(result_user),
                                          len(result_course)))

    return result_dr, result_course, result_user
示例#4
0
def loadData():
    if FSLflag == False:
        all_data = pd.read_csv(
            '../DGL/ml-100k/u.data',
            sep='\t',
            header=None,
            names=['user_id', 'item_id', 'rating', 'timestamp'])
        # test_data = pd.read_csv('../DGL/ml-100k/ua.test', sep='\t', header=None,
        #                         names=['user_id', 'item_id', 'rating', 'timestamp'])
        user_data = pd.read_csv('../DGL/ml-100k/u.user',
                                sep='|',
                                header=None,
                                encoding='latin1')
        item_data = pd.read_csv('../DGL/ml-100k/u.item',
                                sep='|',
                                header=None,
                                encoding='latin1')
        # test_data = test_data[test_data['user_id'].isin(train_data['user_id']) &
        #                       test_data['item_id'].isin(train_data['item_id'])]
        # u_data = user_data[[0,1,2,3,4]]
        # u_data.columns = ['user_id','age','gender','occupation','zip_code']
        # i_data = item_data
        # i_data.columns = ['item_id','title','release_date','video_release_date','IMDb_URL','unknown','Action','Adventure','Animation','Children',
        #                   'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller',
        #                   'War','Western']

        return all_data, user_data, item_data

    else:
        dbHandle = DatabaseIo()
        if not dbHandle:
            return None
        sql_dr = DataBaseQuery["course_dr"]
        sql_course = DataBaseQuery["course_info"]
        sql_user = DataBaseQuery["user_id"]
        sql_classify = DataBaseQuery["classify_info"]
        result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_dr)
        result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                       sql=sql_course)
        result_classify = dbHandle.doSql(
            execType=DataBaseOperateType.SearchMany, sql=sql_classify)
        dbHandle.changeCloseFlag()
        result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                     sql=sql_user)
        drList = formatDataByType(SetType.SetType_List, result_dr)
        all_data = pd.DataFrame(list(drList))
        all_data.columns = ['user_id', 'item_id', 'rating']
        user_data = pd.DataFrame(list(result_user))
        item_data = pd.DataFrame(list(result_course))

        classify_data = pd.DataFrame(list(result_classify))
        classify_data.columns = [
            'id', 'course_name', 'classify_name', 'classify_id'
        ]

        return all_data, user_data, item_data, classify_data
示例#5
0
def get_course_info_with_image():
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None
    # sql_course = "select id, name, image, description from course5000"
    sql_course = DataBaseQuery["interface_image"]
    # sql_classify = "select id, course_name, classify_name from course_classify5000"
    sql_classify = DataBaseQuery["classify_info"]
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    result_course_classify = dbHandle.doSql(
        execType=DataBaseOperateType.SearchMany, sql=sql_classify)
    dbHandle.changeCloseFlag()
    courseList = formatDataByType(SetType.SetType_List, result_course)
    classifyList = formatDataByType(SetType.SetType_List,
                                    result_course_classify)
    return courseList, classifyList
示例#6
0
def high_score_courses():
    logging.warning(u"运行日志:获取高评分课程列表")

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    sql_clean_highscorecourse = DataBaseQuery["highscore_clean_course"]
    #sql_clean_highscorecourse = 'truncate table  high_score_course;'

    sql_insert_highscorecourse = DataBaseQuery["highscore_insert_course"]
    #sql_insert_highscorecourse = "insert into high_score_course(course_id, score) values (%s, %s)"

    courses = get_user_course()
    course_score_sum = {}
    highscoreCourseList = list()
    for row in courses:
        if int(row[1]) not in course_score_sum:
            # course_score_sum[int(row[1])] = [int(row[3]), 1]
            course_score_sum[row[1]] = [int(row[3]), 1]
        else:
            # course_score_sum[int(row[1])] = [(course_score_sum[int(row[1])][0] + int(row[3])), (course_score_sum[int(row[1])][1] + 1)]
            course_score_sum[row[1]] = [
                (course_score_sum[row[1]][0] + int(row[3])),
                (course_score_sum[row[1]][1] + 1)
            ]
    course_score = {}
    for key in course_score_sum:
        course_score[key] = course_score_sum[key][0] / course_score_sum[key][1]
    result = sort_by_value(course_score)
    # print(result)
    for row in result:
        course_id, score = row
        highscoreCourseList.append(tuple([course_id, score]))

    insertTuple = tuple(highscoreCourseList)
    dbHandle.doSql(DataBaseOperateType.InsertOne, sql_clean_highscorecourse)
    dbHandle.changeCloseFlag()
    dbHandle.doSql(DataBaseOperateType.InsertMany, sql_insert_highscorecourse,
                   insertTuple)

    print("highscorecourse success")
    # result_dataframe = DataFrame(result)
    # # result_dataframe.to_csv('highScore.csv', index=None, header=None)
    # result_dataframe.to_csv('file_saved/highScore.csv', index=None, header=None)
    return result
def get_target_course_id(course_id):
    sql = 'SELECT DISTINCT id from course_classify5000 WHERE classify_id in (SELECT classify_id FROM course_classify5000 WHERE id=' + course_id + ')'

    dbHandle = DatabaseIo()

    course_ids = dbHandle.doSql(DataBaseOperateType.SearchMany, sql)

    return course_ids
def get_target_records(course_id_set):
    sql = 'SELECT * from account_course5000 WHERE course_id in ' + course_id_set

    dbHandle = DatabaseIo()

    records = dbHandle.doSql(DataBaseOperateType.SearchMany, sql)

    return records
示例#9
0
def popular_courses():
    logging.warning(u"运行日志:获取热门课程列表")

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    sql_clean_popularcourse = DataBaseQuery["popular_clean_course"]
    #sql_clean_popularcourse = 'truncate table popular_course;'

    sql_insert_popularcourse = DataBaseQuery["popular_insert_course"]
    #sql_insert_popularcourse = "insert into popular_course(course_id, course_click_times) values (%s, %s)"

    courses = get_user_course()
    course_click_times = {}
    popularCourseList = list()
    for row in courses:
        # if int(row[1]) not in course_click_times:
        #     course_click_times[int(row[1])] = int(row[2])
        if row[1] not in course_click_times:
            course_click_times[row[1]] = int(row[2])
        else:
            # course_click_times[int(row[1])] = course_click_times[int(row[1])] + int(row[2])
            course_click_times[row[1]] = course_click_times[row[1]] + int(
                row[2])
    result = sort_by_value(course_click_times)
    # result_dataframe = DataFrame(result)
    # print(result_dataframe)
    for row in result:
        course_id, click_times = row
        popularCourseList.append(tuple([course_id, click_times]))

    insertTuple = tuple(popularCourseList)
    # print(insertTuple)
    dbHandle.doSql(DataBaseOperateType.InsertOne, sql_clean_popularcourse)
    dbHandle.changeCloseFlag()
    dbHandle.doSql(DataBaseOperateType.InsertMany, sql_insert_popularcourse,
                   insertTuple)
    print("popularcourse success")
    # result_dataframe = DataFrame(result)
    # # result_dataframe.to_csv('popular.csv', index=None, header=None)
    # result_dataframe.to_csv('file_saved/popular.csv', index=None, header=None)
    # print result_dataframe
    return result
示例#10
0
def updateCourseDr():
    logging.warning(u"运行日志:开始数据预处理")
    print("run coursePredeal...")

    # sql_select_course = '''select account_id,course_id,
    #         (CASE
    #         WHEN duration>10 THEN 1
    #         ELSE 0 END)as time,
    #         CASE collect_status
    #         WHEN 'YES' THEN 1 ELSE 0 END AS collect,
    #         CASE commit_status
    #         WHEN 'YES' THEN 1.5 ELSE 0 END AS commit_status,
    #         CASE
    #         WHEN score>50 THEN 1 ELSE 0 END AS score
    #         FROM account_course5000'''
    sql_select_course = DataBaseQuery["predeal_user_course"]
    # sql_insert_course_dr = '''INSERT INTO course_dr(user_id, course_index, recommend_value)
    #                     VALUES (%s, %s, %s)'''
    sql_insert_course_dr = DataBaseQuery["predeal_insert_course_dr"]
    # sql_clean_course_dr = 'truncate table course_dr;'
    sql_clean_course_dr = DataBaseQuery["predeal_clean_course_dr"]

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    result = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                            sql=sql_select_course)

    userCourseList = list()
    for row in result:
        user_id, course_id, time, collect, commit, score = row
        count = (3 * time + 2 * commit + collect + score) / 8
        userCourseList.append(tuple([user_id, course_id, count]))

    insertTuple = tuple(userCourseList)

    dbHandle.doSql(DataBaseOperateType.InsertOne, sql_clean_course_dr)
    dbHandle.changeCloseFlag()
    dbHandle.doSql(DataBaseOperateType.InsertMany, sql_insert_course_dr,
                   insertTuple)
    print("coursePredeal success")
示例#11
0
def get_couse_info():
    # logging.warning(u"运行日志:获取课程信息")
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None
    # sql_course = "select id, name from course5000"
    sql_course = DataBaseQuery["course_info"]
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    dbHandle.changeCloseFlag()
    courseList = formatDataByType(SetType.SetType_List, result_course)
    return courseList
示例#12
0
def get_user_course():
    # logging.warning(u"运行日志:获取用户与课程的交互数据")
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    # sql_user_course = "select account_id, course_id, click_times, score from account_course5000"
    sql_user_course = DataBaseQuery["user_course"]
    result_user_course = dbHandle.doSql(
        execType=DataBaseOperateType.SearchMany, sql=sql_user_course)

    # sql_user_course_changed = "select account_id, course_id, click_times, score from account_course5000"
    sql_user_course_changed = DataBaseQuery["user_course_changed"]
    result_user_course_changed = dbHandle.doSql(
        execType=DataBaseOperateType.SearchMany, sql=sql_user_course_changed)
    dbHandle.changeCloseFlag()

    result_user_course = list(result_user_course) + list(
        result_user_course_changed)

    return result_user_course
示例#13
0
def get_all_users():
    # logging.warning(u"运行日志:获取所有用户的id")
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    # sql_user = "******"
    sql_user = DataBaseQuery["user_id"]
    result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                 sql=sql_user)
    dbHandle.changeCloseFlag()
    return result_user
示例#14
0
def get_couse_info_with_video():
    # logging.warning(u"运行日志:获取课程信息")
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None
    # 先用图片替代模拟视频
    # sql_course = "select id, name, image, description from course5000"
    sql_course = DataBaseQuery["interface_video"]
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    dbHandle.changeCloseFlag()
    courseList = formatDataByType(SetType.SetType_List, result_course)
    return courseList
def getDataFromDB():
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    sql_dr = """select *          from course_dr"""
    sql_course = "select id , system_course_id ,course_name from course_info"
    sql_user = """select user_id from user_basic_info"""

    result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                               sql=sql_dr)
    result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_course)
    dbHandle.changeCloseFlag()
    result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                 sql=sql_user)

    print("len(result_dr) = {}, len(result_user) = {},\
          len(result_course) = {}".format(len(result_dr), len(result_user),
                                          len(result_course)))

    return result_dr, result_course, result_user
示例#16
0
def getChangedData(d):

    sql_select_course = '''select account_id,course_id
            FROM account_course5000
            WHERE duration > 10
            AND UNIX_TIMESTAMP(update_time) > UNIX_TIMESTAMP('{0}')
            '''.format(d)

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    result = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                            sql=sql_select_course)
    df = pd.DataFrame(list(result), columns=['uid', 'cid'])
    return df
def get_original_course_id(threshold):
    m = datetime.datetime.now().month
    if m == 1:
        m = 12
    else:
        m -= threshold
    if m < 10:
        m = '0' + str(m)
    else:
        m = str(m)
    time_line = str(datetime.datetime.now().year) + '-' + str(m)

    sql = 'SELECT DISTINCT course_id FROM account_course5000 WHERE date_format(update_time,' \
          ' \'%Y-%m-%d %H:%i:%s\') LIKE \'' + time_line + '____________\''

    dbHandle = DatabaseIo()

    course_ids = dbHandle.doSql(DataBaseOperateType.SearchMany, sql)

    return course_ids
def differFusion(d):
    logging.warning(u"运行日志:在线模块类别特征融合")
    print("run differFusion...")
    # sql_select_differ = '''select course_differ
    #                                 FROM course_info
    #                                 WHERE id = '{0}'
    #                                 '''
    # sql_select_differ = '''select classify_id
    #                                     FROM course_classify5000
    #                                     WHERE id = '{0}'
    #                                     '''
    sql_select_differ = DataBaseQuery["online_select_differ"]
    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    changedData = getChangedData(d)
    group = changedData.groupby(['uid'])
    grouped_cData = pd.DataFrame({
        'cid': group['cid'].apply(list)
    }).reset_index()
    gclist = grouped_cData['cid'].values.tolist()
    ulist = grouped_cData['uid'].values.tolist()
    #增量数据中每个用户所选的课程的类别列表
    uclist = []
    for i in range(len(ulist)):
        temp = []
        temp.append(ulist[i])
        for j in range(len(gclist[i])):
            result = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                    sql=sql_select_differ.format(gclist[i][j]))
            # gclist[i][j] = int(transformCourseType(result[0][0]))
            gclist[i][j] = int(result[0][0])
        temp.append(list(set(gclist[i])))
        uclist.append(temp)

    # print uclist
    # allData = pd.read_csv('../resultToRoc.csv')
    allData = pd.read_csv('../file_saved/resultToRoc.csv')
    auid = allData['uid'].values.tolist()
    acid = allData['cid'].values.tolist()
    avalue = allData['score'].values.tolist()
    print uclist
    # 从全量数据和增量数据中匹配交互过相同类别课程的目标用户,并对该目标用户在全量数据中与增量数据中类别相匹配的课程的推荐值增加0.2
    # 如果对一个目标用户,增量数据里面有全量数据中没有交互过的类别,那不算它,只给交互过的类别增加0.2
    for i in range(len(uclist)):
        for j in range(len(auid)):
            if auid[j] == long(uclist[i][0]):
                result = dbHandle.doSql(
                    execType=DataBaseOperateType.SearchMany,
                    sql=sql_select_differ.format(acid[j]))
                # checkcid = int(transformCourseType(result[0][0]))
                checkcid = int(result[0][0])
                if checkcid in uclist[i][1]:
                    avalue[j] += 0.2
                    # print checkcid,acid[j],avalue[j]

    differData = pd.DataFrame({0: auid, 1: acid, 2: avalue})

    # differData.to_csv('differData.csv',index=None,header=None)
    differData.to_csv('../file_saved/differData.csv', index=None, header=None)
    print("differFusion success")
    return differData
示例#19
0
def updateCourseDrChanged(d):
    logging.warning(u"运行日志:在线模块数据处理")

    # sql_select_course = '''select user_id,course_id,
    #     (CASE
    #     WHEN learning_time>10 THEN 1
    #     ELSE 0 END)as time,
    #     CASE collect_status
    #     WHEN 'YES' THEN 1 ELSE 0 END AS collect,
    #     CASE commit_status
    #     WHEN 'YES' THEN 1.5 ELSE 0 END AS commit_status,
    #     CASE
    #     WHEN score>50 THEN 1 ELSE 0 END AS score
    #     FROM user_course_changed
    #     WHERE learning_time > 10
    #     AND UNIX_TIMESTAMP(update_time) > UNIX_TIMESTAMP('{0}')'''.format(d)
    #
    # sql_insert_course_dr_changed = '''INSERT INTO course_dr_changed(user_id, course_id, recommend_value)
    #                     VALUES (%s, %s, %s)'''
    # sql_clean_course_dr_changed = 'truncate table course_dr_changed;'

    # sql_select_course = '''select account_id,course_id,
    #         (CASE
    #         WHEN duration>10 THEN 1
    #         ELSE 0 END)as time,
    #         CASE collect_status
    #         WHEN 'YES' THEN 1 ELSE 0 END AS collect,
    #         CASE commit_status
    #         WHEN 'YES' THEN 1.5 ELSE 0 END AS commit_status,
    #         CASE
    #         WHEN score>50 THEN 1 ELSE 0 END AS score
    #         FROM account_course5000
    #         WHERE duration > 10
    #         AND UNIX_TIMESTAMP(update_time) > UNIX_TIMESTAMP('{0}')'''.format(d)
    sql_select_course = DataBaseQuery["online_predeal_select_course"].format(d)

    sql_insert_course_dr = '''INSERT INTO course_dr5000_changed(user_id, course_index, recommend_value)
                        VALUES (%s, %s, %s)'''
    # sql_insert_course_dr = DataBaseQuery["online_predeal_insert_course_dr"]

    sql_clean_course_dr = 'truncate table course_dr5000_changed;'
    # sql_clean_course_dr = DataBaseQuery["online_clean_course_dr"]

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    result = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                            sql=sql_select_course)

    userCourseList = list()
    for row in result:
        user_id, course_id, time, collect, commit, score = row
        count = (3 * time + 2 * commit + collect + score) / 8
        userCourseList.append(tuple([user_id, course_id, count]))

    insertTuple = tuple(userCourseList)

    dbHandle.doSql(DataBaseOperateType.InsertOne, sql_clean_course_dr)
    dbHandle.changeCloseFlag()
    dbHandle.doSql(DataBaseOperateType.InsertMany, sql_insert_course_dr,
                   insertTuple)
示例#20
0
def fusion():

    logging.warning(u"运行日志:在线模块数据融合")
    print("run fusion...")
    # differAllData = pd.read_csv('dislikeData.csv',names=['uid', 'cid', 'score'])
    # changedData = pd.read_csv('changedBigraph.csv',names=['uid', 'cid', 'score'])
    differAllData = pd.read_csv('../file_saved/dislikeData.csv',
                                names=['uid', 'cid', 'score'])
    changedData = pd.read_csv('../file_saved/changedBigraph.csv',
                              names=['uid', 'cid', 'score'])

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    #找出两个df中uid和cid对相同的行,存到mergeData中
    mergeData = pd.merge(differAllData,
                         changedData,
                         on=['uid', 'cid'],
                         how='inner')
    print mergeData

    if not mergeData.empty:
        muid = mergeData['uid'].values.tolist()
        mcid = mergeData['cid'].values.tolist()
        #ct_list储存mergeData中对应用户-课程的点击数
        ct_list = []
        for i in range(len(muid)):
            # sql_select_click_times = '''select click_times
            #         FROM account_course5000
            #         WHERE user_id = '{0}'
            #         AND course_id = '{1}'
            #         '''
            sql_select_click_times = DataBaseQuery["online_select_click_times"]

            result = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                    sql=sql_select_click_times.format(
                                        muid[i], mcid[i]))
            if not result:
                ct_list.append(0)
            else:
                ct_list.append(float(result[0][0]))

        cmax = max(ct_list)
        #norm_ctlist为ct_list进行规范化后的结果,作为惩罚因子加到对应的推荐值上
        norm_ctlist = []
        for i in ct_list:
            norm_ctlist.append(1 - (i / (cmax + 1)))

        #dvalue融合了课程类别统计的推荐值,cvalue是增量数据进行二部图运算得到的推荐值
        # fvalue为dvalue、cvalue和惩罚因子norm_ctlist融合后的推荐值
        dvalue = mergeData['score_x'].values.tolist()
        cvalue = mergeData['score_y'].values.tolist()
        fvalue = map(lambda (a, b, c): a * 0.3 + b + c,
                     zip(cvalue, dvalue, norm_ctlist))

        #mergeData增量数据变为进行融合后的结果集
        mergeData.drop(['score_x', 'score_y'], axis=1, inplace=True)
        mergeData['score'] = fvalue

        #fusionData为增量数据和全量数据的融合数据
        #先在融合了类别因素的全量数据中添加mergeData,就会出现重复的用户-课程对,利用drop_duplicates去重并保留添加的mergeData数据
        fusionData = differAllData.append(mergeData)
        fusionData.drop_duplicates(subset=['uid', 'cid'],
                                   keep='last',
                                   inplace=True)
        fusionData.sort_values(by=['uid'], ascending=True, inplace=True)
        fusionData.reset_index(inplace=True)
        fusionData.drop(['index'], axis=1, inplace=True)
        print("fusion success")
    else:
        #增量数据与离线全量数据中没有相同的用户-课程对,不进行融合,只根据类别加分
        fusionData = differAllData
        print("no changed data..")

    # baseData = pd.read_csv('../resultToRoc.csv')
    baseData = pd.read_csv('../file_saved/resultToRoc.csv')
    updatedBaseData = baseData.append(fusionData)
    updatedBaseData.drop_duplicates(subset=['uid', 'cid'],
                                    keep='last',
                                    inplace=True)
    # updatedBaseData.to_csv('../resultToRoc.csv',index=None)
    updatedBaseData.to_csv('../file_saved/resultToRoc.csv', index=None)
    # updatedBaseData.to_csv('online.csv', header=None, index=None)
    updatedBaseData.to_csv('../file_saved/online.csv', header=None, index=None)

    return fusionData
示例#21
0
def getdata(FSLflag):
    print("get data")
    if FSLflag == False:
        train_data = pd.read_csv(
            '../DGL/ml-100k/ua.base',
            sep='\t',
            header=None,
            names=['user_id', 'item_id', 'rating', 'timestamp'])
        test_data = pd.read_csv(
            '../DGL/ml-100k/ua.test',
            sep='\t',
            header=None,
            names=['user_id', 'item_id', 'rating', 'timestamp'])
        user_data = pd.read_csv('../DGL/ml-100k/u.user',
                                sep='|',
                                header=None,
                                encoding='latin1')
        item_data = pd.read_csv('../DGL/ml-100k/u.item',
                                sep='|',
                                header=None,
                                encoding='latin1')
        test_data = test_data[
            test_data['user_id'].isin(train_data['user_id'])
            & test_data['item_id'].isin(train_data['item_id'])]

        train_data = train_data.values.tolist()
        test_data = test_data.values.tolist()
        user_data = user_data.values.tolist()
        item_data = item_data.values.tolist()
        # print item_data
        # print item_data
        return train_data, test_data, user_data, item_data
    else:
        dbHandle = DatabaseIo()
        if not dbHandle:
            return None
        sql_dr = DataBaseQuery["course_dr"]
        sql_course = DataBaseQuery["course_info"]
        sql_user = DataBaseQuery["user_id"]
        sql_classify = DataBaseQuery["classify_info"]
        result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                   sql=sql_dr)
        result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                       sql=sql_course)
        result_classify = dbHandle.doSql(
            execType=DataBaseOperateType.SearchMany, sql=sql_classify)
        dbHandle.changeCloseFlag()
        result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
                                     sql=sql_user)
        drList = formatDataByType(SetType.SetType_List, result_dr)
        user_data = formatDataByType(SetType.SetType_Set, result_user)
        item_data = formatDataByType(SetType.SetType_List, result_course)
        classify_data = formatDataByType(SetType.SetType_List, result_classify)
        dr_length = len(drList)
        testIDs = random.sample(range(1, dr_length), int(dr_length / 10))
        data = pd.DataFrame(drList)
        train_data = []
        test_data = []
        for index, row in data.iterrows():
            if (index + 1) in testIDs:
                test_data.append(row)
            else:
                train_data.append(row)
        return train_data, test_data, user_data, item_data, classify_data
示例#22
0
def fusion(y):
    logging.warning(u"运行日志:混合个性化推荐课程、热门课程、高评分课程")

    sql_select_popularcourse = DataBaseQuery["popular_select_course"]
    # sql_select_popularcourse = '''select course_id,click_times from popular_course

    sql_select_highscorecourse = DataBaseQuery["highscore_select_course"]
    # sql_select_highscorecourse = "select course_id,score from high_score_course "

    dbHandle = DatabaseIo()
    if not dbHandle:
        return None

    popular_course_demo = dbHandle.doSql(
        execType=DataBaseOperateType.SearchMany, sql=sql_select_popularcourse)
    high_score_course_demo = dbHandle.doSql(
        execType=DataBaseOperateType.SearchMany,
        sql=sql_select_highscorecourse)

    popular_course = list(popular_course_demo)
    high_score_course = list(high_score_course_demo)
    # popular_course = popular_courses()
    # high_score_course = high_score_courses()

    recommend_num = get_course_num(y)
    online_result = get_online_result()
    result = []
    for row in recommend_num:
        temp_courses = {}
        online_count = 0
        popular_count = 0
        high_score_count = 0

        for online_row in online_result:
            if online_count == row[1][2]:
                break
            # elif long(online_row[0]) == row[0]:
            elif online_row[0] == row[0]:
                # if long(online_row[1]) not in temp_courses.keys():
                if online_row[1] not in temp_courses.keys():
                    online_count = online_count + 1
                    # temp_courses[int(online_row[1])] = online_row[2]
                    temp_courses[online_row[1]] = online_row[2]

        popular_num = int((y - len(temp_courses)) / 2)
        for i in range(len(popular_course)):
            if popular_count == popular_num:
                break
            elif popular_course[i][0] not in temp_courses.keys():

                popular_count = popular_count + 1
                temp_courses[popular_course[i][0]] = -2

        high_score_num = y - len(temp_courses)
        for i in range(len(high_score_course)):
            if high_score_count == high_score_num:
                break
            elif high_score_course[i][0] not in temp_courses.keys():
                high_score_count = high_score_count + 1
                temp_courses[high_score_course[i][0]] = -1

        for key in temp_courses:
            temp = [row[0], key, float(temp_courses[key])]
            result.append(temp)
    result = sorted(result, key=lambda k: k[2], reverse=True)
    result_dataframe = DataFrame(result)
    # result_dataframe.to_csv('outputFusion.csv', index=None, header=None)
    result_dataframe.to_csv('file_saved/outputFusion2.csv',
                            index=None,
                            header=None)
    return result_dataframe, popular_course