def test():
    """Run the CF recommender over the whole test set and dump results.

    For every row of ``public.test_set``, recommends the top 20
    attractions via ``recommend_by_user`` and writes one tab-separated
    line: user id, visited route (all but the last stop), the
    ground-truth last stop, and the recommendation dict as JSON.

    :return: None (writes ``recommend_20.txt``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Build the user->item and item->user count matrices once up front.
    user_dict, item_user = form_data()
    print("初始化矩阵完成")
    print(time.time())
    cursor.execute("select * from public.test_set")
    rows = cursor.fetchall()
    # `with` guarantees the file is closed even if a recommendation fails.
    with open("recommend_20.txt", "w", encoding="utf-8") as f:
        for i, row in enumerate(rows, start=1):
            if i % 100 == 0:
                print("完成:" + str(i))
            user = row[7]        # user id column
            classroute = row[2]  # ordered list of visited attraction ids
            rec_dict = recommend_by_user(user_dict, item_user, user, 20)
            # The final stop of the route is the ground truth to predict.
            f.write("%s\t%s\t%s\t%s\n" % (user, classroute[:-1],
                                          AName(classroute[-1]),
                                          json.dumps(rec_dict)))
    cursor.close()
    conn.close()
def test(algorithm, lam=0.5):
    """Run a graph-based recommender over the test set and dump results.

    :param algorithm: recommender callable taking ``(graph, user_id)``;
        the hybrid algorithm named ``hunhe`` additionally receives ``lam``.
    :param lam: mixing weight, only used when ``algorithm`` is ``hunhe``.
    :return: None (writes ``<algorithm>.txt`` or ``hunhe_<lam>.txt``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Read the held-out test routes.
    cursor.execute(r"select * from public.test_set")
    rows = cursor.fetchall()
    # Build the bipartite visitor/attraction graph.
    graph = init_graph()
    is_hybrid = algorithm.__name__ == "hunhe"
    # The hybrid run encodes its lambda in the output file name.
    if is_hybrid:
        out_name = "hunhe_" + str(lam) + ".txt"
    else:
        out_name = algorithm.__name__ + ".txt"
    with open(out_name, "w", encoding="utf-8") as f:
        for row in rows:
            user_id = row[7]
            classroute = row[2]
            if is_hybrid:
                result = algorithm(graph, user_id, lam)
            else:
                result = algorithm(graph, user_id)
            f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1],
                                          classroute[-1], str(result)))
    cursor.close()
    conn.close()
def gen_nodes_list():
    """Return the ordered node list: all visitor ids, then all attractions.

    The order matters: ``init_matrix`` uses positions in this list as
    matrix row/column indices.

    :return: list of VName-prefixed visitor ids followed by
        AName-prefixed attraction ids.
    """
    conn = get_conn()
    cursor = conn.cursor()
    nodes_list = list()
    # Visitor nodes first.
    cursor.execute("select * from public.route_0320")
    for row in cursor.fetchall():
        nodes_list.append(VName(row[7]))
    # Attraction nodes after.
    cursor.execute("select * from public.node_1023")
    for row in cursor.fetchall():
        nodes_list.append(AName(row[0]))
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
    return nodes_list
def init_graph():
    """Build the visitor/attraction bipartite graph from ml_graph_set.

    Each (user, attraction) row adds or strengthens an edge; the edge
    weight counts how many times the visitor hit the attraction.

    :return: a ``networkx.Graph`` with ``bipartite`` node attributes
        (0 = visitor, 1 = attraction).
    """
    conn = get_conn()
    cursor = conn.cursor()
    graph = networkx.Graph()
    # Visitor side of the bipartite graph.
    for node in get_v_nodes():
        graph.add_node(node, bipartite=0)
    # Attraction side.
    for node in get_a_nodes():
        graph.add_node(node, bipartite=1)
    cursor.execute(r"select * from public.ml_graph_set")
    for row in cursor.fetchall():
        u = VName(row[0])
        a = AName(row[1])  # row[2] (attraction name) is not needed here
        if (u, a) in graph.edges:
            graph[u][a]["weight"] += 1
        else:
            graph.add_edge(u, a, weight=1)
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
    return graph
def init_set():
    """Split ``public.route`` 90/10 into md_train_set / md_test_set rows.

    Training users contribute every stop of their route to md_train_set.
    Test users contribute all but the final stop to md_train_set and the
    held-out final stop to md_test_set.

    :return: None (commits inserts).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Attraction id -> attraction name.
    node_dict = get_node_id_dict()
    cursor.execute(r"select count(*) from public.route")
    number = cursor.fetchone()[0]
    print(number)
    cursor.execute(r"select * from public.route order by id")
    rows = cursor.fetchall()
    train_number = int(0.9 * number)
    # Random 90% of the users form the training set.
    train_set = random.sample(rows, train_number)
    train_sql = """ insert into public.md_train_set(user_id, attraction, attractionstr, visittime) values(%s, %s, %s, %s); """
    test_sql = """ insert into public.md_test_set(user_id, attraction, attractionstr, visittime) values(%s, %s, %s, %s); """
    for row in rows:
        classroute = row[2]
        routetime = row[4]
        user_id = row[7]  # renamed from `id` (shadowed the builtin)
        if id_in_set(train_set, user_id):
            stops = len(classroute)  # whole route goes to training
        else:
            stops = len(classroute) - 1  # hold out the last stop
        for i in range(stops):
            cursor.execute(train_sql, (user_id, classroute[i],
                                       node_dict[classroute[i]], routetime[i]))
        if stops != len(classroute):
            # Held-out final stop becomes the test example for this user.
            cursor.execute(test_sql, (user_id, classroute[-1],
                                      node_dict[classroute[-1]], routetime[-1]))
    conn.commit()
    cursor.close()
    conn.close()
def init_matrix():
    """Build the complex adjacency matrix from train and test routes.

    Visitor->attraction entries are stored as ``+1j`` and the transposed
    attraction->visitor entries as ``-1j``. Test routes contribute every
    stop except the last one (the stop to be predicted).

    :return: an ``n x n`` complex ``numpy.matrix`` indexed in the order
        produced by ``gen_nodes_list``.
    """
    conn = get_conn()
    cursor = conn.cursor()
    nodes_list = gen_nodes_list()
    n = len(nodes_list)
    # O(1) name->index lookups instead of repeated list.index (O(n) each).
    index_of = {name: i for i, name in enumerate(nodes_list)}
    # np.complex was removed in NumPy 1.24; plain complex literals suffice.
    matrix = np.mat(np.full((n, n), 0j))

    def _fill(route, user_key):
        # Mark each user<->attraction pair with conjugate imaginary units.
        user_index = index_of[VName(user_key)]
        for att_id in route:
            att_index = index_of[AName(att_id)]
            matrix[user_index, att_index] = 1j
            matrix[att_index, user_index] = -1j

    # Training routes: use the whole route.
    # (The original re-ran the train_set query a second time and threw the
    # rows away; that dead duplicate is removed here.)
    cursor.execute("select * from public.train_set")
    for row in cursor.fetchall():
        _fill(row[2], row[7])
    # Test routes: hold out the final stop, which is what we predict.
    cursor.execute("select * from public.test_set")
    for row in cursor.fetchall():
        _fill(row[2][:-1], row[7])
    cursor.close()
    conn.close()
    return matrix
def get_a_nodes():
    """Return the set of all attraction node names (AName-prefixed)."""
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select num from public.node_1023")
    # Build the set directly from the result rows.
    attraction_nodes = {AName(record[0]) for record in cursor.fetchall()}
    cursor.close()
    conn.close()
    return attraction_nodes
def get_v_nodes():
    """Return the set of all visitor node names (VName-prefixed)."""
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select id from public.route_0320")
    # Build the set directly from the result rows.
    visitor_nodes = {VName(record[0]) for record in cursor.fetchall()}
    cursor.close()
    conn.close()
    return visitor_nodes
def predict(extract_fun):
    """Predict link existence for every ml_test_set row with a saved SVM.

    :param extract_fun: feature extractor; must be named
        ``extract_direct`` or ``extract_indirect`` (the model file and
        feature call are selected by name).
    :raises ValueError: if ``extract_fun`` has an unknown name.
    :return: None (writes ``predict_<name>.txt``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Restore the model persisted by train().
    clf = joblib.load("model_" + extract_fun.__name__ + ".pkl")
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    # Project the bipartite graph onto each node class.
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("投影完成")
    print(time.time())
    cursor.execute("select * from public.ml_test_set")
    rows = cursor.fetchall()
    with open("predict_" + extract_fun.__name__ + ".txt", "w",
              encoding="utf-8") as f:
        for row in rows:
            user_id = VName(row[0])
            att_id = AName(row[1])
            is_link = row[4]
            if extract_fun.__name__ == "extract_direct":
                feature = extract_direct(graph, user_id, att_id)
            elif extract_fun.__name__ == "extract_indirect":
                feature = extract_indirect(graph, prjv_graph, prja_graph,
                                           user_id, att_id)
            else:
                # Originally `feature` stayed unbound here and crashed with
                # a confusing NameError; fail fast with a clear message.
                raise ValueError("unknown extractor: " + extract_fun.__name__)
            result = clf.predict([feature])[0]
            f.write("%s\t%s\t%s\t%s\n" % (user_id, att_id, is_link, result))
    cursor.close()
    conn.close()
def init_graph():
    """Build the visitor/attraction bipartite graph from train+test routes.

    Edge weights count repeat visits. Test routes contribute every stop
    except the final one (held out for prediction).

    :return: a ``networkx.Graph`` with ``bipartite`` node attributes
        (0 = visitor, 1 = attraction).
    """
    conn = get_conn()
    cursor = conn.cursor()
    graph = networkx.Graph()
    # Visitor side of the bipartite graph.
    for node in get_v_nodes():
        graph.add_node(node, bipartite=0)
    # Attraction side.
    for node in get_a_nodes():
        graph.add_node(node, bipartite=1)

    def _add_edges(user_id, attractions):
        # Increment the weight for repeat visits, else create the edge.
        for atrraction_id in attractions:
            u, a = VName(user_id), AName(atrraction_id)
            if (u, a) in graph.edges:
                graph[u][a]["weight"] += 1
            else:
                graph.add_edge(u, a, weight=1)

    # Training routes: the whole route forms edges.
    cursor.execute("select * from public.train_set")
    for row in cursor.fetchall():
        _add_edges(row[7], row[2])
    # Test routes: hold out the final stop.
    cursor.execute("select * from public.test_set")
    for row in cursor.fetchall():
        _add_edges(row[7], row[2][:-1])
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
    return graph
def get_node_loc_dict():
    """Map attraction node name -> (longitude, latitude).

    :return: dict keyed by AName-prefixed attraction id with
        ``(lon, lat)`` tuples (columns 2 and 3 of node_1023).
    """
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select * from public.node_1023")
    node_loc_dict = dict()
    for row in cursor.fetchall():
        lon = row[2]
        lat = row[3]
        node_loc_dict[AName(row[0])] = (lon, lat)
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
    return node_loc_dict
def init_set():
    """Split route_0320 rows 90/10 into the train_set and test_set tables.

    A random 90% sample of rows is copied verbatim into train_set; the
    remainder goes to test_set (same column layout in both tables).

    :return: None (commits inserts).
    """
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute(r"select count(*) from public.route_0320")
    number = cursor.fetchone()[0]
    print(number)
    cursor.execute(r"select * from public.route_0320 order by id")
    rows = cursor.fetchall()
    # Random 90% of the rows become the training set.
    train_set = random.sample(rows, int(0.9 * number))
    # SQL hoisted out of the loop; the unused test_number was dropped.
    train_sql = """ insert into public.train_set(id_base64,route, classroute, classroutestr, routetime, starttime, endtime, id, route_length) values(%s, %s, %s, %s, %s, %s, %s, %s, %s); """
    test_sql = """ insert into public.test_set(id_base64,route, classroute, classroutestr, routetime, starttime, endtime, id, route_length) values(%s, %s, %s, %s, %s, %s, %s, %s, %s); """
    for row in rows:
        # row[7] is the user id used for set membership.
        sql = train_sql if id_in_set(train_set, row[7]) else test_sql
        cursor.execute(sql, row)
    conn.commit()
    cursor.close()
    conn.close()
def cal_train_distance(save_dis_file="train_distance.csv"):
    """Compute distance features for every ml_train_set row; save as CSV.

    For each (user, attraction) training example, looks up the user's
    cleaned route and derives min/mean/max/last distance features via
    ``get_dis_feature``.

    :param save_dis_file: output CSV path.
    :return: None (writes the CSV file).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Attraction node -> (lon, lat).
    node_loc_dict = get_node_loc_dict()
    cursor.execute("select * from public.ml_train_set")
    rows = cursor.fetchall()
    distance = []
    title = ["min_d", "mean_d", "max_d", "last_d"]
    for row in rows:
        att_id = AName(row[1])
        # Parameterized query instead of str.format (SQL-injection safe).
        cursor.execute("select * from public.route_0320 where id=%s",
                       (row[0],))
        item = cursor.fetchone()
        classroute = item[2][:-1]  # drop the held-out last stop
        distance.append(get_dis_feature(node_loc_dict, classroute, att_id))
    pd.DataFrame(distance, columns=title).to_csv(save_dis_file,
                                                 encoding="utf-8")
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
def init_set():
    """Populate the ml_graph_set / ml_train_set / ml_test_set tables.

    From train_set: 80% of users go entirely into ml_graph_set; a random
    20% sample puts all stops but the last into ml_graph_set, the last
    stop into ml_train_set as a positive example, plus two random
    unvisited attractions as negatives. test_set rows are treated the
    same way, with the held-out stop and negatives going to ml_test_set.

    :return: None (commits inserts).
    """
    conn = get_conn()
    cursor = conn.cursor()
    # Attraction id -> attraction name, and the candidate pool.
    node_dict = get_node_id_dict()
    all_nodes = list(node_dict.keys())
    graph_sql = """ insert into public.ml_graph_set(user_id, attraction, attractionstr, visittime, islink) values(%s, %s, %s, %s, %s); """
    train_sql = """ insert into public.ml_train_set(user_id, attraction, attractionstr, visittime, islink) values(%s, %s, %s, %s, %s); """
    test_sql = """ insert into public.ml_test_set(user_id, attraction, attractionstr, visittime, islink) values(%s, %s, %s, %s, %s); """

    def _negatives(classroute):
        # Two random attractions the user never visited.
        # random.sample on a set raises TypeError on Python >= 3.11,
        # so materialize the difference as a list first.
        return random.sample(list(set(all_nodes) - set(classroute)), 2)

    cursor.execute(r"select count(*) from public.train_set")
    number = cursor.fetchone()[0]
    print(number)
    cursor.execute("select * from public.train_set")
    rows = cursor.fetchall()
    train_number = int(0.2 * number)
    train_set = random.sample(rows, train_number)
    for row in rows:
        classroute = row[2]
        routetime = row[4]
        user_id = row[7]  # renamed from `id` (shadowed the builtin)
        if not id_in_set(train_set, user_id):
            # Not sampled: the whole route becomes graph edges.
            for i in range(len(classroute)):
                cursor.execute(graph_sql, (user_id, classroute[i],
                                           node_dict[classroute[i]],
                                           routetime[i], True))
        else:
            # Sampled: hold out the final stop as the training example.
            for i in range(len(classroute) - 1):
                cursor.execute(graph_sql, (user_id, classroute[i],
                                           node_dict[classroute[i]],
                                           routetime[i], True))
            cursor.execute(train_sql, (user_id, classroute[-1],
                                       node_dict[classroute[-1]],
                                       routetime[-1], True))
            # Negatives go to the same table as the positive (the
            # original reused insert_sql implicitly here).
            for neg in _negatives(classroute):
                cursor.execute(train_sql,
                               (user_id, neg, node_dict[neg], None, False))
    conn.commit()
    # test_set rows: same scheme, but held-out stop + negatives go to
    # ml_test_set.
    cursor.execute("select * from public.test_set")
    rows = cursor.fetchall()
    for row in rows:
        classroute = row[2]
        routetime = row[4]
        user_id = row[7]
        for i in range(len(classroute) - 1):
            cursor.execute(graph_sql, (user_id, classroute[i],
                                       node_dict[classroute[i]],
                                       routetime[i], True))
        cursor.execute(test_sql, (user_id, classroute[-1],
                                  node_dict[classroute[-1]],
                                  routetime[-1], True))
        for neg in _negatives(classroute):
            cursor.execute(test_sql,
                           (user_id, neg, node_dict[neg], None, False))
    conn.commit()
    cursor.close()
    conn.close()
def recommend_list(route_length=3, coeffs=None):
    """Recommend attractions via odd powers of the complex adjacency matrix.

    :param route_length: path length to use (3, 5 or 7).
    :param coeffs: optional divisors for the higher-order matrix terms;
        falls back to the hard-coded factorial-style defaults when
        empty/None. (Was a mutable default ``[]``, shared across calls.)
    :return: dict keyed by user id with the recommendation details, or
        None when ``route_length`` is unsupported.
    """
    if coeffs is None:
        coeffs = []
    nodes_list = gen_nodes_list()
    matrix = init_matrix()
    conn = get_conn()
    cursor = conn.cursor()
    if route_length == 3:
        final_matrix = matrix ** 3
    elif route_length == 5:
        divisor = coeffs[0] if coeffs else 20
        final_matrix = (matrix ** 3) + (matrix ** 5) / divisor
    elif route_length == 7:
        if coeffs:
            final_matrix = ((matrix ** 3) + (matrix ** 5) / coeffs[0]
                            + (matrix ** 7) / coeffs[1])
        else:
            final_matrix = ((matrix ** 3) + (matrix ** 5) / 120
                            + (matrix ** 7) / 5040)
    else:
        print("暂时没有对应的公式")
        return
    cursor.execute("select * from public.test_set")
    rows = cursor.fetchall()
    file_name = "cn_route_" + str(route_length) + ".txt"
    # All candidate attraction ids.
    a_nodes = list(get_node_id_dict().keys())
    # O(1) name->index lookups instead of list.index in the inner loop.
    index_of = {name: i for i, name in enumerate(nodes_list)}
    record_result = dict()
    # `with` fixes the original leak: the file was never closed.
    with open(file_name, "w", encoding="utf-8") as f:
        for row in rows:
            classroute = row[2]
            user_id = VName(row[7])
            user_index = index_of[user_id]
            # Candidates: attractions not already in the visited prefix.
            left_nodes = list(set(a_nodes) - set(classroute[:-1]))
            result_dict = dict()
            for node in left_nodes:
                node = AName(node)
                imag_coeff = final_matrix[user_index, index_of[node]].imag
                # A zero score carries no signal; skip it.
                if imag_coeff != 0.0:
                    result_dict[node] = imag_coeff
            result_dict = dict(sorted(result_dict.items(),
                                      key=lambda x: x[1], reverse=True))
            f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1],
                                          AName(classroute[-1]),
                                          json.dumps(result_dict)))
            record_result[user_id] = {"user_id": user_id,
                                      "classroute": classroute[-1],
                                      "answer": AName(classroute[-1]),
                                      "recommend": result_dict}
    # Also close DB resources (leaked in the original).
    cursor.close()
    conn.close()
    return record_result
def write_train_feature(func, have_sd=0):
    """Extract features for every ml_train_set row and write them to CSV.

    :param func: ``extract_direct`` or ``extract_indirect`` (selected by
        name, as in the rest of this module).
    :param have_sd: for extract_indirect, whether the projected
        shortest-distance columns are included.
    :return: None (writes ``<name>[_has_sd]_train.csv``).
    """
    if have_sd:
        file_name = func.__name__ + "_has_sd_train.csv"
    else:
        file_name = func.__name__ + "_train.csv"
    conn = get_conn()
    cursor = conn.cursor()
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    # Project the bipartite graph onto each node class.
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("网络投影完成")
    print(time.time())
    cursor.execute("select * from public.ml_train_set")
    rows = cursor.fetchall()
    print("len_rows:" + str(len(rows)))
    # Column titles built from a shared base (was three duplicated lists).
    base = ["snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    if func.__name__ == "extract_direct":
        title = ["label"] + base
    elif func.__name__ == "extract_indirect":
        title = ["label"] + base + ["prj_cnv", "prj_cna", "prj_jcv",
                                    "prj_jca", "prj_aav", "prj_aaa",
                                    "prj_pav", "prj_paa"]
        if have_sd:
            title += ["prj_sdv", "prj_sda"]
    else:
        print("函数错误")
        return
    train_f = []
    for i, row in enumerate(rows, start=1):
        print(i)
        user_id = VName(row[0])
        att_id = AName(row[1])
        if func.__name__ == "extract_direct":
            feature = func(graph, user_id, att_id)
        else:
            # Only the two known extractors reach this point (validated
            # above when building the titles).
            feature = func(graph, prjv_graph, prja_graph, user_id, att_id,
                           have_sd)
        # row[4] (islink) becomes the binary label column.
        train_f.append([1 if row[4] else 0] + list(feature))
    pd.DataFrame(train_f, columns=title).to_csv(file_name, encoding="utf-8")
    print("训练特征保存完成")
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
def write_test_feature(func, has_sd=0):
    """Extract features for each candidate attraction of every positive
    ml_test_set row and write them to CSV.

    :param func: ``extract_direct`` or ``extract_indirect``.
    :param has_sd: include projected shortest-distance columns
        (extract_indirect only).
    :return: None (writes ``<name>[_has_sd]_test.csv``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    if has_sd:
        file_name = func.__name__ + "_has_sd_test.csv"
    else:
        file_name = func.__name__ + "_test.csv"
    # Column titles built from a shared base (was three duplicated lists).
    base = ["snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    if func.__name__ == "extract_direct":
        title = ["anode"] + base
    elif func.__name__ == "extract_indirect":
        title = ["anode"] + base + ["prj_cnv", "prj_cna", "prj_jcv",
                                    "prj_jca", "prj_aav", "prj_aaa",
                                    "prj_pav", "prj_paa"]
        if has_sd:
            title += ["prj_sdv", "prj_sda"]
    else:
        print("函数错误")
        return
    cursor.execute("select * from public.ml_test_set")
    rows = cursor.fetchall()
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("投影完成")
    print(time.time())
    # All candidate attraction ids (rebinding a_nodes, as the original did).
    a_nodes = list(get_node_id_dict().keys())
    test_f = []
    i = 0
    for row in rows:
        print(i)
        i += 1
        user_id = VName(row[0])
        if not row[4]:
            # Negative rows carry no route to expand; skip them.
            continue
        # Parameterized query instead of str.format (SQL-injection safe).
        cursor.execute("select classroute from public.route_0320 where id=%s",
                       (row[0],))
        classroute = cursor.fetchone()[0]
        # Candidates: everything not in the visited prefix.
        left_set = set(a_nodes) - set(classroute[0:-1])
        for anode in left_set:
            anode = AName(anode)
            if func.__name__ == "extract_direct":
                feature = func(graph, user_id, anode)
            else:
                feature = func(graph, prjv_graph, prja_graph, user_id,
                               anode, has_sd)
            test_f.append([anode] + list(feature))
    pd.DataFrame(test_f, columns=title).to_csv(file_name, encoding="utf-8")
    print("测试特征保存完成")
    # The original leaked the cursor and connection; close them here.
    cursor.close()
    conn.close()
def correct_data():
    """Clean raw routes and copy them into route_0320.

    For each row of public.route, removes repeat visits to the same
    attraction unless the repeat happened on a different day (checked
    with is_same_day), then inserts the cleaned row into route_0320.
    """
    conn = get_conn()
    cursor = conn.cursor()
    select_sql = r"select * from public.route"
    cursor.execute(select_sql)
    rows = cursor.fetchall()
    for row in rows:
        # Unpack the positional row into a named dict for readability.
        line = dict()
        line["id_base64"] = row[0]
        line["route"] = row[1]
        line["classroute"] = row[2]
        line["classroutestr"] = row[3]
        line["routetime"] = row[4]
        line["starttime"] = row[5]
        line["endtime"] = row[6]
        line["id"] = row[7]
        line["route_length"] = row[8]
        # Skip rows whose classified route is empty.
        if not line["classroute"]:
            continue
        # Seed the cleaned route with the first stop; the three lists are
        # kept in lockstep (attraction id, name, visit time).
        classroute = [line["classroute"][0]]
        classroutestr = [line["classroutestr"][0]]
        routetime = [line["routetime"][0]]
        for i in range(len(line["classroute"]) - 1):
            if line["classroute"][i + 1] not in classroute:
                # New attraction: keep it.
                classroute.append(line["classroute"][i + 1])
                classroutestr.append((line["classroutestr"][i + 1]))
                routetime.append(line["routetime"][i + 1])
            else:
                # Repeat visit: find the index of the earlier occurrence
                # (last match wins if there were several).
                for j in range(len(classroute)):
                    if classroute[j] == line["classroute"][i + 1]:
                        index = j
                if is_same_day(routetime[index], line["routetime"][i + 1]):
                    # Same-day repeat: drop it as a duplicate.
                    continue
                else:
                    # Different-day repeat: treat it as a genuine revisit.
                    classroute.append(line["classroute"][i + 1])
                    classroutestr.append((line["classroutestr"][i + 1]))
                    routetime.append(line["routetime"][i + 1])
        line["classroute"] = classroute
        line["classroutestr"] = classroutestr
        line["routetime"] = routetime
        insert_sql = """ insert into public.route_0320(id_base64,route, classroute, classroutestr, routetime, starttime, endtime, id, route_length) values(%s, %s, %s, %s, %s, %s, %s, %s, %s); """
        cursor.execute(
            insert_sql,
            (line["id_base64"], line["route"], line["classroute"],
             line["classroutestr"], line["routetime"], line["starttime"],
             line["endtime"], line["id"], line["route_length"]))
    conn.commit()
    cursor.close()
    conn.close()
#! usr/bin/env python3
# -*- coding:utf-8 -*-
import sys

sys.path.append("../")

from postgresql import get_conn, get_node_id_dict

if __name__ == "__main__":
    # Smoke-test: connect and print the attraction id -> name mapping.
    conn = get_conn()
    cursor = conn.cursor()
    print(get_node_id_dict())
    # Close the resources the original left open.
    cursor.close()
    conn.close()
def form_data():
    """Build the user->item and item->user visit-count dictionaries.

    Training routes are counted in full; test routes omit their final
    stop (the one to be predicted). The two accumulation passes were
    previously duplicated verbatim; they now share one helper.

    :return: ``(user_dict, item_user)`` nested count dicts.
    """
    conn = get_conn()
    cursor = conn.cursor()
    item_user = dict()
    user_dict = dict()

    def _accumulate(rows, drop_last):
        # Count attraction visits per user and users per attraction.
        for row in rows:
            user_id = row[7]
            classroute = row[2]
            # NOTE: a user present in both tables gets its entry reset
            # here, matching the original behaviour.
            user_dict[user_id] = dict()
            route = classroute[:-1] if drop_last else classroute
            for att in route:
                user_dict[user_id][att] = user_dict[user_id].get(att, 0) + 1
                item_user.setdefault(att, {})
                item_user[att][user_id] = item_user[att].get(user_id, 0) + 1

    cursor.execute("select * from public.train_set")
    _accumulate(cursor.fetchall(), drop_last=False)
    # Test routes: do not count the held-out final stop.
    cursor.execute("select * from public.test_set")
    _accumulate(cursor.fetchall(), drop_last=True)
    cursor.close()
    conn.close()
    return user_dict, item_user
def train(extract_fun):
    """Train a linear SVM on ml_train_set features and persist it.

    Features come from ``extract_fun`` (extract_direct/extract_indirect,
    selected by name); labels are +1 for real links and -1 for negative
    examples. The raw X/Y lists are dumped to ``param_<name>.txt`` and
    the fitted model to ``model_<name>.pkl``.

    :param extract_fun: feature extractor function.
    :return: None (writes the param dump and the pickled model).
    """
    conn = get_conn()
    cursor = conn.cursor()
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    # Project the bipartite graph onto each node class.
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("网络投影完成")
    print(time.time())
    cursor.execute("select * from public.ml_train_set")
    rows = cursor.fetchall()
    print(len(rows))
    X_list = list()
    Y_list = list()
    for i, row in enumerate(rows, start=1):
        user_id = VName(row[0])
        att_id = AName(row[1])
        print(i)
        if extract_fun.__name__ == "extract_direct":
            feature = extract_direct(graph, user_id, att_id)
        elif extract_fun.__name__ == "extract_indirect":
            feature = extract_indirect(graph, prjv_graph, prja_graph,
                                       user_id, att_id)
        else:
            print("wrong function")
            break
        X_list.append(feature)
        # islink column -> binary SVM labels (+1 link, -1 non-link).
        Y_list.append(1 if row[4] else -1)
    print("生成训练数据")
    print(time.time())
    cursor.close()
    conn.close()
    # Keep a copy of the training data for later parameter tuning;
    # `with` guarantees the file is closed even on error.
    with open("param_" + extract_fun.__name__ + ".txt", "w",
              encoding="utf-8") as f:
        f.write(json.dumps(X_list) + "\n")
        f.write(json.dumps(Y_list) + "\n")
    print("训练数据保存成功")
    print(time.time())
    clf = svm.SVC(kernel="linear")
    clf.fit(X_list, Y_list)
    print("训练数据结束")
    print(time.time())
    joblib.dump(clf, "model_" + extract_fun.__name__ + ".pkl")
    print("保存模型")
    print(time.time())
def recommend_test(extract_fun, tuned_params):
    """Re-fit an SVM with GridSearchCV-tuned parameters and write
    per-user recommendation lists to verify the tuning.

    :param extract_fun: ``extract_direct`` or ``extract_indirect``.
    :param tuned_params: dict with "kernel", "C" and "gamma".
    :return: None (writes ``recommend_<name>_<kernel>_C<C>_gamma<g>.txt``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select * from public.ml_test_set")
    rows = cursor.fetchall()
    # Build a fresh model with the tuned hyper-parameters.
    clf = svm.SVC(kernel=tuned_params["kernel"], C=tuned_params["C"],
                  gamma=tuned_params["gamma"])
    # Reload the training data dumped by train(); one JSON list per line.
    # (The original also called str.split() on each line and discarded
    # the result — dead no-ops, removed.)
    with open("param_" + extract_fun.__name__ + ".txt", "r",
              encoding="utf-8") as f:
        x_list = json.loads(f.readline())
        y_list = json.loads(f.readline())
    clf.fit(x_list, y_list)
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("投影完成")
    print(time.time())
    # All candidate attraction ids.
    a_nodes = list(get_node_id_dict().keys())
    out_name = ("recommend_" + extract_fun.__name__ + "_"
                + tuned_params["kernel"] + "_C" + str(tuned_params["C"])
                + "_gamma" + str(tuned_params["gamma"]) + ".txt")
    with open(out_name, "w", encoding="utf-8") as f:
        for i, row in enumerate(rows, start=1):
            print(i)
            user_id = VName(row[0])
            att_id = AName(row[1])
            if not row[4]:
                # Negative test rows are skipped.
                continue
            # Parameterized query instead of str.format (injection safe).
            cursor.execute(
                "select classroute from public.route_0320 where id=%s",
                (row[0],))
            classroute = cursor.fetchone()[0]
            # Candidates: everything not in the visited prefix.
            left_set = set(a_nodes) - set(classroute[0:-1])
            recommendation = dict()
            for anode in left_set:
                anode = AName(anode)
                if extract_fun.__name__ == "extract_direct":
                    feature = extract_direct(graph, user_id, anode)
                elif extract_fun.__name__ == "extract_indirect":
                    feature = extract_indirect(graph, prjv_graph,
                                               prja_graph, user_id, anode)
                result = clf.predict([feature])[0]
                dis = abs(clf.decision_function([feature]))
                # Keep only predicted links, ranked by decision distance.
                if result == 1:
                    recommendation[anode] = dis[0]
            recommendation = dict(sorted(recommendation.items(),
                                         key=lambda x: x[1], reverse=True))
            f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                          json.dumps(recommendation)))
    cursor.close()
    conn.close()
def recommend_list(extract_fun):
    """Write per-user recommendation lists using the saved SVM model.

    :param extract_fun: ``extract_direct`` or ``extract_indirect``
        (selects both the model file and the feature call).
    :return: None (writes ``recommend_<name>.txt``).
    """
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select * from public.ml_test_set")
    rows = cursor.fetchall()
    # Restore the model persisted by train().
    clf = joblib.load("model_" + extract_fun.__name__ + ".pkl")
    graph = init_graph()
    print("构建二分网络完成")
    print(time.time())
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("投影完成")
    print(time.time())
    # All candidate attraction ids.
    a_nodes = list(get_node_id_dict().keys())
    with open("recommend_" + extract_fun.__name__ + ".txt", "w",
              encoding="utf-8") as f:
        for row in rows:
            user_id = VName(row[0])
            att_id = AName(row[1])
            if not row[4]:
                # Negative test rows are skipped.
                continue
            # Parameterized query instead of str.format (injection safe).
            cursor.execute(
                "select classroute from public.route_0320 where id=%s",
                (row[0],))
            classroute = cursor.fetchone()[0]
            # Candidates: everything not in the visited prefix.
            left_set = set(a_nodes) - set(classroute[0:-1])
            recommendation = dict()
            for anode in left_set:
                anode = AName(anode)
                if extract_fun.__name__ == "extract_direct":
                    feature = extract_direct(graph, user_id, anode)
                elif extract_fun.__name__ == "extract_indirect":
                    feature = extract_indirect(graph, prjv_graph,
                                               prja_graph, user_id, anode)
                result = clf.predict([feature])[0]
                dis = abs(clf.decision_function([feature]))
                # Keep only predicted links, ranked by decision distance.
                if result == 1:
                    recommendation[anode] = dis[0]
            recommendation = dict(sorted(recommendation.items(),
                                         key=lambda x: x[1], reverse=True))
            f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                          json.dumps(recommendation)))
    cursor.close()
    conn.close()
def recommend(test_file, model_path, save_file):
    """Rank candidate attractions per positive test row with XGBoost.

    Relies on the rows of ``test_file`` having been generated in the
    same order this function walks ml_test_set, so that consecutive
    slices of the prediction vector line up with each user's candidate
    set — TODO confirm against write_test_feature.

    :param test_file: CSV of test features (one "anode" column + features).
    :param model_path: saved XGBoost booster file.
    :param save_file: output text file path.
    :return: None (writes ``save_file``).
    """
    # Load the trained booster and score the whole feature CSV at once.
    model = xgb.Booster()
    model.load_model(model_path)
    test_data = pd.read_csv(test_file)
    anode_list = list(test_data["anode"])
    xgb_test = xgb.DMatrix(test_data.drop("anode", axis=1))
    preds = model.predict(xgb_test)
    conn = get_conn()
    cursor = conn.cursor()
    cursor.execute("select * from public.ml_test_set ")
    rows = cursor.fetchall()
    # All candidate attraction ids.
    a_nodes = list(get_node_id_dict().keys())
    index = 0
    with open(save_file, "w", encoding="utf-8") as f:
        for row in rows:
            user_id = VName(row[0])
            att_id = AName(row[1])
            if not row[4]:
                # Negative test rows are skipped.
                continue
            # Parameterized query instead of str.format (injection safe).
            cursor.execute(
                "select classroute from public.route_0320 where id=%s",
                (row[0],))
            classroute = cursor.fetchone()[0]
            left_set = set(a_nodes) - set(classroute[0:-1])
            n = len(left_set)
            # Consume the next n predictions for this user's candidates.
            recommend_list = dict()
            for i in range(n):
                recommend_list[anode_list[index + i]] = preds[index + i]
            index += n
            recommend_list = dict(sorted(recommend_list.items(),
                                         key=lambda x: x[1], reverse=True))
            f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                          json.dumps(recommend_list,
                                                     cls=MyEncoder)))
    cursor.close()
    conn.close()