def __init__(self):
    """Create the hotel and PMS DAOs and set the hotel name and thresholds."""
    # Both DAOs are opened with the same connection values from dao_setting.
    conn_args = (dao_setting["host"], dao_setting["db"],
                 dao_setting["user"], dao_setting["password"])
    self.hotel_dao = HotelDAO(*conn_args)
    self.pms_dao = PmsDAO(*conn_args)
    self.hotelname = '南京天丰大酒店'
    # Threshold for room recommendation: all historical comments are first
    # aggregated into the room table (one score per item) and rooms are then
    # recommended according to that score.
    self.keyvalue = 0.1
    # Thresholds for quality inspection, applied to the viewpoints crawled
    # in real time into the remark table.
    self.redvalue = 0.03
    self.yellowvalue = 0.1
def __init__(self):
    """Initialise the base service, the Ctrip and hotel DAOs and the buffers."""
    HotelService.__init__(self)
    # Connection values shared by both DAOs.
    conn_args = (dao_setting["host"], dao_setting["db"],
                 dao_setting["user"], dao_setting["password"])
    # Ctrip DAO
    self.xiechengDao = xiechengDAO(*conn_args)
    # Hotel DAO
    self.hotel_dao = HotelDAO(*conn_args)
    # Rows scraped from the list pages
    self.listPageInfo = []
    # Hotel detail data
    self.hotelItem = {}
    # Hotel comment data
    self.commList = []
    # Bed / price information
    self.bed = {}
    # Name of the OTA handled by this service
    self.__ota_info = "携程"
def __init__(self):
    """Announce the Elong crawl and set up DAOs, NLP helper and buffers."""
    print('开始爬取艺龙')
    HotelService.__init__(self)
    # Connection values shared by both DAOs.
    conn_args = (dao_setting["host"], dao_setting["db"],
                 dao_setting["user"], dao_setting["password"])
    self.hotel_dao = HotelDAO(*conn_args)
    self.dao = ElongDAO(*conn_args)
    self.hotelNLP = HotelNLP()
    # Buffers filled while crawling: list pages, hotel details, comments, prices.
    self.listPageInfo = []
    self.hotelItem = {}
    self.commList = []
    self.priceList = []
    self.ifCrawlHotelInfo = True
    # Name of the OTA handled by this service
    self.__ota_info = "艺龙"
def __init__(self):
    """Initialise the base service, the hotel and Tuniu DAOs and the buffers."""
    HotelService.__init__(self)
    # Connection values shared by both DAOs.
    conn_args = (dao_setting["host"], dao_setting["db"],
                 dao_setting["user"], dao_setting["password"])
    # Hotel DAO
    self.hotel_dao = HotelDAO(*conn_args)
    # Tuniu DAO
    self.dao = TuniuDAO(*conn_args)
    # Natural-language processing helper
    self.hotelNLP = HotelNLP()
    # Rows scraped from the list pages
    self.listPageInfo = []
    # Hotel detail data
    self.hotelItem = {}
    # Hotel comment data
    self.commList = []
    # Hotel price data
    self.priceList = []
    self.ifCrawlHotelInfo = True
    # Name of the OTA handled by this service
    self.__ota_info = "途牛"
__author__ = 'DreamCathcer'

import uuid
import re
import traceback

from dao.hotel.HotelDAO import HotelDAO
from dao.hotel.xiechengdao.xiecheng import xiechengDAO
from dao.hotel.TuniuDao import TuniuDAO
from setting import local_hotel_setting

# Database configuration: every DAO below is opened with the same
# host / db / user / password values taken from local_hotel_setting.
dao_setting = local_hotel_setting
hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])
tuniu_dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])
xiecheng_dao = xiechengDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])

# Disabled code kept from the original (it continues past this point):
# tuniu_comm = tuniu_dao.get_remarks()
#
# hotel_comm = []
# i = 0
# for comm in tuniu_comm:
#     i+=1
#     print i
#     baseinfo = hotel_dao.get_baseinfo_by_hotelname(comm[10].encode('utf-8'), '南京')
#     for info in baseinfo:
#         if info[3].encode('utf-8') == '途牛':
#             hotel_comm.append({
#                 "guid":uuid.uuid1(),
def __init__(self):
    """Open the hotel DAO and the Tuniu DAO with the shared connection values."""
    conn_args = tuple(
        dao_setting[key] for key in ("host", "db", "user", "password"))
    self.hotel_dao = HotelDAO(*conn_args)
    self.dao = TuniuDAO(*conn_args)
class HotelSentimentProcessor(object):
    """Batch jobs that compute NLP annotations for the stored hotel remarks.

    Each job reads every remark through ``hotel_dao.get_remarks()``, computes
    a value per remark and writes the results back in batches.
    """

    # Number of records buffered before each bulk update.
    _BATCH_SIZE = 10000

    def __init__(self):
        """Open the hotel DAO and the Tuniu DAO from ``dao_setting``."""
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])
        self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"],
                            dao_setting["user"], dao_setting["password"])

    def set_sentiment_and_viewpoint(self):
        """Fill in sentiment value and viewpoint for remarks that lack them.

        A remark is processed when column 8 or column 9 is None. A failure of
        either NLP call is printed and leaves that field as None.
        """
        self.hotelnlp = HotelNLP()
        self.thu = thulac("")
        comm_list = self.hotel_dao.get_remarks()
        print(len(comm_list))
        pending = []
        batch_no = 0
        for comm in comm_list:
            if comm[8] is not None and comm[9] is not None:
                continue
            sentiment_value = None
            viewpoint = None
            # Strip "@" characters before handing the text to the NLP helper.
            remark = re.sub(u"\\@", u"", comm[2])
            try:
                sentiment_value = self.hotelnlp.sentiment(
                    remark.encode("utf-8"))
                sentiment_value = round(sentiment_value * 1000) / 1000
                print(sentiment_value)
            except Exception:
                print(comm[2])
                traceback.print_exc()
            try:
                viewpoint = self.hotelnlp.viewpoint(remark.encode("utf-8"),
                                                    decoding="utf-8")
                viewpoint = json.dumps(viewpoint, ensure_ascii=False)
            except Exception:
                print(remark)
                traceback.print_exc()
            pending.append({
                "guid": comm[0],
                "senti_value": sentiment_value,
                "viewpoint": viewpoint,
            })
            if len(pending) >= self._BATCH_SIZE:
                batch_no += 1
                print("update %d time" % batch_no)
                self.hotel_dao.update_remarks(pending)
                pending = []
        # Write the final partial batch; previously anything left over after
        # the last full batch of 10000 was never saved.
        if pending:
            batch_no += 1
            print("update %d time" % batch_no)
            self.hotel_dao.update_remarks(pending)

    def count_word_frq(self):
        """Count, per remark, the words that the segmenter tags as ``a``.

        The counts are stored as a JSON object through
        ``hotel_dao.update_hotel_comm_word_freq``. A remark whose segmentation
        fails is printed and stored with an empty count.
        """
        self.thu = thulac("-input cs.txt")
        comm_list = self.hotel_dao.get_remarks()
        pending = []
        batch_no = 0
        for comm in comm_list:
            a_dict = {}
            try:
                # Each token is split on "_" into [word, tag].
                cut_comm = [token.split("_")
                            for token in self.thu.cut(comm[2].encode("utf-8"))]
            except Exception:
                cut_comm = []
                print(comm[2])
                traceback.print_exc()
            for word in cut_comm:
                if word[1].decode("utf-8") == "a":
                    adjective = word[0].decode("utf-8")
                    a_dict[adjective] = a_dict.get(adjective, 0) + 1
            pending.append({
                "guid": comm[0],
                "word_freq": json.dumps(a_dict, ensure_ascii=False),
            })
            if len(pending) >= self._BATCH_SIZE:
                batch_no += 1
                print("update %d time" % batch_no)
                self.hotel_dao.update_hotel_comm_word_freq(pending)
                pending = []
        # Write the final partial batch (it used to be dropped).
        if pending:
            batch_no += 1
            print("update %d time" % batch_no)
            self.hotel_dao.update_hotel_comm_word_freq(pending)
def __init__(self):
    """Open the hotel DAO using the connection values in ``dao_setting``."""
    host, db, user, password = (
        dao_setting[key] for key in ("host", "db", "user", "password"))
    self.hotel_dao = HotelDAO(host, db, user, password)
class HotelDataService(object):
    """Query helpers that reshape ``HotelDAO`` rows into dicts, lists and HTML."""

    def __init__(self):
        """Open the hotel DAO using the connection values in ``dao_setting``."""
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])

    def check_user(self, user_name, password):
        """Check a login and return the data attached to the user.

        Returns None when ``hotel_dao.get_user`` yields no rows or when
        ``password`` differs from the stored one. Otherwise returns a dict
        with keys ``baseinfo`` (one entry per row), ``location`` and ``user``
        (both taken from the last row).
        """
        userdata = self.hotel_dao.get_user(user_name)
        if len(userdata) < 1:
            return None
        result_data = {"baseinfo": [], 'location': {}, 'user': {}}
        location = result_data['location']
        user = result_data['user']
        for line in userdata:
            location['location_id'] = line[0]
            location['x'] = line[1]
            location['y'] = line[2]
            location['hotel_name'] = line[3]
            location['city'] = line[4]
            location['address'] = line[5]
            result_data['baseinfo'].append({
                "id": line[6],
                "url": line[7],
                "OTA": line[9],
                "comm_num": line[10],
                "img": line[13]
            })
            user['id'] = line[15]
            user['user_name'] = line[16]
            user['password'] = line[17]
            user['corporation'] = line[19]
            user['img'] = line[20]
        # NOTE(review): the password is compared as stored; confirm whether
        # the table holds plain text or a hash computed by the caller.
        if password != user['password']:
            return None
        return result_data

    def get_baseinfo_by_location_id(self, location_id_str):
        """Return one list of ``{id, location_id, ota}`` dicts per location id.

        ``location_id_str`` is a comma-separated list of location ids.
        """
        result_data = []
        for location_id in location_id_str.split(','):
            baseinfo = self.hotel_dao.get_baseinfo_by_location_id(location_id)
            result_data.append([
                {"id": info[0], "location_id": info[2], "ota": info[3]}
                for info in baseinfo
            ])
        return result_data

    def get_comm_type_score_statics(self, baseinfo_id, ota):
        """Return the comment statistics for one hotel entry, or None.

        Score statistics are used for u'携程' and type statistics for
        u'途牛'; any other ``ota`` value yields None.
        """
        if ota == u'携程':
            return self.hotel_dao.get_comm_score_statics(baseinfo_id)
        if ota == u'途牛':
            return self.hotel_dao.get_comm_type_statics(baseinfo_id)
        return None

    def get_comm_viewpoints(self, hotel_name_str):
        """Return the entity viewpoints of each hotel in a comma-separated list."""
        return [
            {"hotel_name": hotel_name,
             "viewpoint": self.get_comm_viewpoint(hotel_name)}
            for hotel_name in hotel_name_str.split(',')
        ]

    def get_comm_viewpoint(self, hotel_name):
        """Aggregate the per-comment viewpoints of one hotel.

        Column 9 of every comment is parsed as a JSON object. A key seen
        again is replaced by the mean of the running value and the new one.
        Comments whose viewpoint cannot be processed are reported with a
        traceback and skipped.
        """
        viewpoint_statics = {}
        comments = self.hotel_dao.get_remarks_by_hotel_name(
            hotel_name.encode('utf-8'))
        for comment in comments:
            try:
                viewpoint = json.loads(comment[9])
                for key in viewpoint:
                    if key in viewpoint_statics:
                        viewpoint_statics[key] = (
                            viewpoint_statics[key] + viewpoint[key]) / 2
                    else:
                        viewpoint_statics[key] = viewpoint[key]
            except Exception:
                traceback.print_exc()
                continue
        return viewpoint_statics

    def get_comm_adjective_statics(self, baseinfo_id):
        """Sum the adjective counts over all comments of one hotel entry.

        Column 10 of every comment is parsed as a JSON object of counts.
        Returns a list of ``(adjective, total)`` pairs sorted by total,
        largest first. Comments that cannot be processed are skipped.
        """
        adjective_statics = {}
        comments = self.hotel_dao.get_remarks_by_baseinfo_id(baseinfo_id)
        for comment in comments:
            try:
                adjectives = json.loads(comment[10])
                for key in adjectives:
                    if key in adjective_statics:
                        adjective_statics[key] = (
                            adjective_statics[key] + adjectives[key])
                    else:
                        adjective_statics[key] = adjectives[key]
            except Exception:
                continue
        return sorted(adjective_statics.items(),
                      key=lambda pair: pair[1], reverse=True)

    def get_user_flow_to_html(self, hotel_name, base_info_id_str, page,
                              count=10, start_time=None, end_time=None,
                              ring_str=None):
        """Render one page of the merged trace points as HTML table rows.

        Trace points of all ids in ``base_info_id_str`` (comma separated) are
        merged by coordinate key, points whose name occurs in ``hotel_name``
        are dropped, and the rest is sorted by visit count, largest first.
        Returns ``{"pageNum": ..., "html": ...}``.
        """
        merged = {}
        for baseinfo_id in base_info_id_str.split(','):
            if baseinfo_id == "":
                continue
            data = self.get_trace(baseinfo_id, start_time, end_time, ring_str)
            for key, point in data["point"].items():
                if point["name"] in hotel_name:
                    continue
                if key not in merged:
                    merged[key] = point
                else:
                    merged[key]["value"] += point["value"]
        ranked = sorted(merged.items(),
                        key=lambda pair: pair[1]["value"], reverse=True)
        # NOTE(review): the price column is a random number, not stored data.
        rows = [
            "<tr><td name='name'>%s</td><td name='count'>%s</td><td name='price'>%s</td></tr>" % (
                hotel[1]["name"], hotel[1]["value"], random.randint(340, 500))
            for hotel in ranked[count * (page - 1):count * page]
        ]
        return {"pageNum": len(ranked) // count + 1, "html": "".join(rows)}

    def get_user_trace(self, baseinfo_id_str, start_time=None, end_time=None,
                       ring_str=None):
        """Return the trace data of every non-empty id in a comma-separated list."""
        return [
            self.get_trace(baseinfo_id, start_time, end_time, ring_str)
            for baseinfo_id in baseinfo_id_str.split(',')
            if baseinfo_id != ""
        ]

    def get_trace(self, baseinfo_id, start_time=None, end_time=None,
                  ring_str=None):
        """Build trace lines and trace points from the comments of a hotel's users.

        ``ring_str``, when given, is a JSON ring passed to ``Polygon``; only
        points that intersect that polygon are kept. ``start_time`` and
        ``end_time`` are accepted but not used here.

        Returns ``{"line": [[start, end], ...], "point": {key: {...}}}`` where
        each point carries ``name``, ``geoCoord`` and a visit count ``value``.
        """
        result_data = {"line": [], "point": {}}
        # The polygon depends only on ring_str, so it is built once up front.
        polygon = None
        if ring_str is not None:
            polygon = Polygon(json.loads(ring_str))
        user_list = self.hotel_dao.get_hotel_trace_users(baseinfo_id)
        for user in user_list:
            # Comments of this user; columns 13/14/15 are read as the place
            # name and its two coordinates.
            remarks = self.hotel_dao.get_remarks_by_username(user[0])
            # Trace lines between consecutive comments.
            for i in range(0, len(remarks) - 1):
                here = remarks[i]
                there = remarks[i + 1]
                if polygon is not None:
                    p1 = Point(here[14], here[15])
                    p2 = Point(there[14], there[15])
                    # Skip the segment unless both ends lie in the region.
                    if not polygon.intersects(p1) or not polygon.intersects(p2):
                        continue
                if here[15] != there[15]:
                    start_point = {
                        "name": here[13],
                        "geoCoord": [here[14], here[15]]
                    }
                    end_point = {
                        "name": there[13],
                        "geoCoord": [there[14], there[15]]
                    }
                    result_data["line"].append([start_point, end_point])
            # Trace points with visit counts.
            for remark in remarks:
                if polygon is not None:
                    if not polygon.intersects(Point(remark[14], remark[15])):
                        continue
                coord_str = str(remark[15]) + "," + str(remark[14])
                if coord_str not in result_data["point"]:
                    result_data["point"][coord_str] = {
                        "name": remark[13],
                        "geoCoord": [remark[14], remark[15]],
                        "value": 1
                    }
                else:
                    result_data["point"][coord_str]["value"] += 1
        return result_data

    def get_comm_by_text(self, hotel_name, page, text=None, count=20,
                         ota=None):
        """Return one page of comments with the viewpoint words highlighted.

        Every viewpoint word in column 9 of a comment is wrapped in a tooltip
        link showing its score; the word equal to ``text`` is also coloured
        red. ``pageNum`` is derived from ``count``, the same page size used
        for slicing (it used to be hard-coded to 20).
        """
        comments = self.hotel_dao.get_remarks_by_text(
            hotel_name.encode("utf-8"), text, ota)
        comments_changed = []
        text = text.decode("utf-8") if text is not None else None
        for comm in comments[count * (page - 1):count * page]:
            comm = list(comm)
            viewpoint = json.loads(comm[9])
            for feature in viewpoint:
                opening = ('<a title="' + '%.2f' % viewpoint[feature] +
                           '" data-toggle="tooltip" href="#"><b>')
                if feature != text:
                    markup = opening + feature + '</b></a>'
                else:
                    markup = (opening + ' <span style="color:red">' +
                              feature + '</span></b></a>')
                # Literal replacement: the word must not be read as a regex.
                comm[2] = comm[2].replace(feature, markup)
            # NOTE(review): this re-reads column 9 (the viewpoints), although
            # the adjectives are read from column 10 elsewhere in this class;
            # confirm which column was intended.
            for adjective in json.loads(comm[9]):
                comm[2] = comm[2].replace(adjective,
                                          '<b>' + adjective + '</b>')
            comments_changed.append(comm)
        return {
            "pageNum": len(comments) // count + 1,
            "comments_info": comments_changed
        }

    def get_location(self, location_id):
        """Return the location record for an id.

        The result is ``{'data': {...}, 'status': 200}`` when exactly one row
        is found, otherwise ``{'status': 0}``.
        """
        data = self.hotel_dao.get_hotel_name_by_location_id(location_id)
        if len(data) != 1:
            return {'status': 0}
        location = data[0]
        return {
            'data': {
                'location_id': location[0],
                'x': location[1],
                'y': location[2],
                'hotel_name': location[3],
                'address': location[5]
            },
            'status': 200
        }
def __init__(self):
    """Set the OTA name and create the Ctrip API client and the hotel DAO."""
    # No city is selected until the caller sets one.
    self._city = None
    # Name of the OTA handled by this object
    self.__ota_info = "携程"
    self.xiecheng_api_client = XieChengAPIClient()
    conn_args = tuple(
        dao_setting[key] for key in ("host", "db", "user", "password"))
    self.hotel_dao = HotelDAO(*conn_args)
class HotelSentimentProcessor(object):
    """Batch jobs that compute NLP annotations for the stored hotel remarks.

    Each job reads every remark through ``hotel_dao.get_remarks()``, computes
    a value per remark and writes the results back in batches.
    """

    # Number of records buffered before each bulk update.
    _BATCH_SIZE = 10000

    def __init__(self):
        """Open the hotel DAO and the Tuniu DAO from ``dao_setting``."""
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])
        self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"],
                            dao_setting["user"], dao_setting["password"])

    def set_sentiment_and_viewpoint(self):
        """Fill in sentiment value and viewpoint for remarks that lack them.

        A remark is processed when column 8 or column 9 is None. A failure of
        either NLP call is printed and leaves that field as None.
        """
        self.hotelnlp = HotelNLP()
        self.thu = thulac("")
        comm_list = self.hotel_dao.get_remarks()
        print(len(comm_list))
        pending = []
        batch_no = 0
        for comm in comm_list:
            if comm[8] is not None and comm[9] is not None:
                continue
            sentiment_value = None
            viewpoint = None
            # Strip "@" characters before handing the text to the NLP helper.
            remark = re.sub(u"\\@", u"", comm[2])
            try:
                sentiment_value = self.hotelnlp.sentiment(
                    remark.encode("utf-8"))
                sentiment_value = round(sentiment_value * 1000) / 1000
                print(sentiment_value)
            except Exception:
                print(comm[2])
                traceback.print_exc()
            try:
                viewpoint = self.hotelnlp.viewpoint(remark.encode("utf-8"),
                                                    decoding="utf-8")
                viewpoint = json.dumps(viewpoint, ensure_ascii=False)
            except Exception:
                print(remark)
                traceback.print_exc()
            pending.append({
                "guid": comm[0],
                "senti_value": sentiment_value,
                "viewpoint": viewpoint,
            })
            if len(pending) >= self._BATCH_SIZE:
                batch_no += 1
                print("update %d time" % batch_no)
                self.hotel_dao.update_remarks(pending)
                pending = []
        # Write the final partial batch; previously anything left over after
        # the last full batch of 10000 was never saved.
        if pending:
            batch_no += 1
            print("update %d time" % batch_no)
            self.hotel_dao.update_remarks(pending)

    def count_word_frq(self):
        """Count, per remark, the words that the segmenter tags as ``a``.

        The counts are stored as a JSON object through
        ``hotel_dao.update_hotel_comm_word_freq``. A remark whose segmentation
        fails is printed and stored with an empty count.
        """
        self.thu = thulac("-input cs.txt")
        comm_list = self.hotel_dao.get_remarks()
        pending = []
        batch_no = 0
        for comm in comm_list:
            a_dict = {}
            try:
                # Each token is split on "_" into [word, tag].
                cut_comm = [token.split("_")
                            for token in self.thu.cut(comm[2].encode("utf-8"))]
            except Exception:
                cut_comm = []
                print(comm[2])
                traceback.print_exc()
            for word in cut_comm:
                if word[1].decode("utf-8") == "a":
                    adjective = word[0].decode("utf-8")
                    a_dict[adjective] = a_dict.get(adjective, 0) + 1
            pending.append({
                "guid": comm[0],
                "word_freq": json.dumps(a_dict, ensure_ascii=False),
            })
            if len(pending) >= self._BATCH_SIZE:
                batch_no += 1
                print("update %d time" % batch_no)
                self.hotel_dao.update_hotel_comm_word_freq(pending)
                pending = []
        # Write the final partial batch (it used to be dropped).
        if pending:
            batch_no += 1
            print("update %d time" % batch_no)
            self.hotel_dao.update_hotel_comm_word_freq(pending)
class XiechengDriverService(HotelService):
    """Browser-driven crawler for the Ctrip hotel list of Nanjing."""

    def __init__(self):
        """Initialise the base service, both DAOs and the crawl buffers."""
        HotelService.__init__(self)
        # Ctrip DAO
        self.xiechengDao = xiechengDAO(dao_setting["host"], dao_setting["db"],
                                       dao_setting["user"],
                                       dao_setting["password"])
        # Hotel DAO
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])
        # Rows scraped from the list pages
        self.listPageInfo = []
        # Hotel detail data
        self.hotelItem = {}
        # Hotel comment data
        self.commList = []
        # Bed / price information
        self.bed = {}
        # Name of the OTA handled by this service
        self.__ota_info = "携程"

    def crawlListPage(self):
        """Walk through the list pages and collect every hotel row.

        Returns True when the page counter has dropped to 1 or below, and
        False when the loop gave up earlier.
        """
        self.openPage(
            "http://hotels.ctrip.com/hotel/nanjing12#ctm_ref=hod_hp_sb_lst")
        self.driver.implicitly_wait(10)
        # Attempts spent on the current page
        loopNum = 0
        # Whether the current page has already been parsed
        ifHandle = False
        # Number of pages still to visit
        pageNum = 140
        while pageNum >= 1:
            loopNum = loopNum + 1
            # Scroll to the end and one page up so the page footer is loaded.
            self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            self.driver.find_element_by_tag_name("body").send_keys(
                Keys.PAGE_UP)
            # The page counts as loaded once its source contains u"收藏".
            if u"收藏" in self.driver.page_source:
                # Parse each page only once.
                if not ifHandle:
                    self.__crawllianjie(self.driver.page_source)
                    print(u"获取酒店数为:%d" % len(self.listPageInfo))
                    ifHandle = True
                # Move on to the next page.
                try:
                    if u"下一页" in self.driver.page_source:
                        self.driver.find_element_by_partial_link_text(
                            u"下一页").click()
                        pageNum = pageNum - 1
                        # The new page has not been parsed yet.
                        ifHandle = False
                        loopNum = 0
                        time.sleep(random.uniform(3, 6))
                        print(u"页数:" + str(pageNum))
                except Exception:
                    print("error happen at clicking of nextpage")
            # A non-zero counter means the page was not advanced.
            if loopNum != 0:
                # After 15 attempts the page is treated as failed to load.
                if loopNum < 15:
                    time.sleep(3)
                    continue
                else:
                    break
        return pageNum <= 1

    def __crawllianjie(self, page_sourse):
        """Parse one list page and append its hotels to ``listPageInfo``."""
        response = HtmlResponse(url="my HTML string", body=page_sourse,
                                encoding="utf-8")
        hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
        for hotel in hotel_list:
            url = hotel.xpath(
                "li[@class='searchresult_info_name']/h2/a/@href").extract()[0]
            address = hotel.xpath(
                "li[@class='searchresult_info_name']/p[@class='searchresult_htladdress']/text()"
            ).extract()[0]
            commnum = hotel.xpath(
                "li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()"
            ).extract()
            # Keep only the digits of the comment counter; default to 0.
            if len(commnum):
                commnum = re.sub(r'\D', '', commnum[0])
                commnum = commnum if len(commnum) > 0 else 0
            else:
                commnum = 0
            name = hotel.xpath(
                "li[@class='searchresult_info_name']/h2/a/text()").extract()[0]
            self.listPageInfo.append({
                "guid": uuid.uuid1(),
                "url": url,
                "hotel_name": name,
                "OTA": self.__ota_info,
                "comm_num": int(commnum),
                "address": address
            })

    def saveListPageInfo(self):
        """Store the crawled list-page rows in the location and baseinfo tables.

        Hotels whose name is not yet in the location table are geocoded and
        inserted. Existing baseinfo rows are updated and flagged as current,
        rows not seen in this crawl stay flagged as outdated, and unseen
        hotels get a new baseinfo row.
        """
        # NOTE(review): the API key is hard-coded in source; consider moving
        # it to configuration.
        baidu_api_service = BaiduMapAPIService("MviPFAcx5I6f1FkRQlq6iTxc")
        old_location_info = self.hotel_dao.get_locations(self._city)
        old_baseinfo = [
            list(row)
            for row in self.hotel_dao.get_baseinfo(self._city, self.__ota_info)
        ]
        # Assume every stored row is outdated until the crawl confirms it.
        for row in old_baseinfo:
            row[5] = 1
        new_locations = []
        new_baseinfo = []
        update_baseinfo = []
        for item in self.listPageInfo:
            location_id = None
            # Is this hotel already in the location table?
            for location in old_location_info:
                if item["hotel_name"] == location[3]:
                    location_id = location[0]
                    break
            # If not, geocode the address and queue a new location row.
            if location_id is None:
                location_id = uuid.uuid1()
                geocoding_info = None
                # Retry until the geocoding call stops raising. Catching
                # Exception (not everything) keeps Ctrl-C working.
                while 1:
                    try:
                        geocoding_info = baidu_api_service.doGeocoding(
                            item["address"], city=self._city)
                        break
                    except Exception:
                        time.sleep(0.5)
                        continue
                if "result" not in geocoding_info:
                    print(item["hotel_name"] + "error")
                    continue
                trans_location = CoordTransor.bd09togcj02(
                    bd_lon=geocoding_info["result"]["location"]["lng"],
                    bd_lat=geocoding_info["result"]["location"]["lat"])
                print(trans_location)
                new_locations.append({
                    "guid": location_id,
                    "x": trans_location[1],
                    "y": trans_location[0],
                    "hotel_name": item["hotel_name"],
                    "city": self._city,
                    "address": item["address"]
                })
            # Update the baseinfo row of this location, or queue a new one.
            if_exist = False
            for baseinfo in old_baseinfo:
                if location_id == baseinfo[2]:
                    if_exist = True
                    # Compute the increment BEFORE overwriting the stored
                    # count; doing it afterwards always produced 0.
                    increment = item["comm_num"] - int(baseinfo[4] or 0)
                    baseinfo[1] = item["url"]
                    baseinfo[4] = item["comm_num"]
                    baseinfo[5] = 0
                    baseinfo[6] = increment if increment > 0 else 0
                    break
            if not if_exist:
                new_baseinfo.append({
                    "guid": item["guid"],
                    "url": item["url"],
                    "location_id": location_id,
                    "OTA": self.__ota_info,
                    "comm_num": item["comm_num"],
                    "if_overtime": 0,
                    "incre_num": item["comm_num"],
                })
        for baseinfo in old_baseinfo:
            update_baseinfo.append({
                "guid": baseinfo[0],
                "url": baseinfo[1],
                "location_id": baseinfo[2],
                "OTA": baseinfo[3],
                "comm_num": baseinfo[4],
                "if_overtime": baseinfo[5],
                "incre_num": baseinfo[6]
            })
        print(len(new_locations))
        print(len(new_baseinfo))
        print(len(update_baseinfo))
        self.hotel_dao.save_locations(new_locations)
        self.hotel_dao.save_baseinfo(new_baseinfo)
        self.hotel_dao.update_baseinfo(update_baseinfo)
        #self.dao.saveListPageInfo(self.listPageInfo)

    def depose(self):
        """Close the browser driver."""
        self.driver.close()
# -*- coding:utf-8 -*- __author__ = 'DreamCathcer' import uuid import re import traceback from dao.hotel.HotelDAO import HotelDAO from dao.hotel.xiechengdao.xiecheng import xiechengDAO from dao.hotel.TuniuDao import TuniuDAO from setting import local_hotel_setting # 配置数据库 dao_setting = local_hotel_setting hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) tuniu_dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) xiecheng_dao = xiechengDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # tuniu_comm = tuniu_dao.get_remarks() # # hotel_comm = [] # i = 0 # for comm in tuniu_comm: # i+=1 # print i # baseinfo = hotel_dao.get_baseinfo_by_hotelname(comm[10].encode('utf-8'), '南京') # for info in baseinfo: # if info[3].encode('utf-8') == '途牛':
class PmsService(object):
    """Services behind the PMS front end plus table-maintenance jobs.

    The original grouping markers are kept in the method docstrings:
    "0" marks jobs that update or configure the database, "1" marks services
    called by the PMS front end.
    """

    def __init__(self):
        """Create the hotel and PMS DAOs and set the hotel name and thresholds."""
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])
        self.pms_dao = PmsDAO(dao_setting["host"], dao_setting["db"],
                              dao_setting["user"], dao_setting["password"])
        self.hotelname = '南京天丰大酒店'
        # Threshold for room recommendation: all historical comments are
        # first aggregated into the room table (one score per item) and rooms
        # are then recommended according to that score.
        self.keyvalue = 0.1
        # Thresholds for quality inspection, applied to the viewpoints
        # crawled in real time into the remark table.
        self.redvalue = 0.03
        self.yellowvalue = 0.1

    def _remark_condition_template(self):
        """Return the remark filter for this hotel with a ``%s`` room-id slot."""
        hotelid = self.pms_dao.query_hotelid_from_hotelname(self.hotelname)
        # NOTE(review): the condition is assembled by string concatenation
        # because the DAO takes a raw condition string; room ids that come
        # from the front end are not escaped here.
        return "(roomid='%s')AND(" + " OR ".join(
            ["baseinfo_id= '" + row[0] + "'" for row in hotelid]) + ")"

    def _highlight(self, comment, point, score):
        """Wrap every occurrence of ``point`` in a tooltip link showing ``score``.

        The word is coloured red below ``redvalue`` and yellow below
        ``yellowvalue``. It is replaced literally, never read as a regex.
        """
        opening = ('<a title="' + '%.2f' % score +
                   '" data-toggle="tooltip" href="#"><b>')
        if score < self.redvalue:
            markup = (opening + ' <span style="color:red">' + point +
                      '</span></b></a>')
        elif score < self.yellowvalue:
            markup = (opening + ' <span style="color:yellow">' + point +
                      '</span></b></a>')
        else:
            markup = opening + point + '</b></a>'
        return comment.replace(point, markup)

    def user_login(self, username, password, usertype):
        """(1) Return True when a stored user row matches the credentials.

        Both the password (column 2) and the user type (column 3) must
        match. The previous ``or`` accepted a login when only one matched.
        """
        result = self.pms_dao.user_login(username)
        print(type(result))
        for row in result:
            if row[2] == password and row[3] == usertype:
                return True
        return False

    def get_hotel_roominfos(self):
        """(1) Return all records of the ``roominfo`` table."""
        return self.hotel_dao.get_records('roominfo')

    def get_hotel_guestinfos(self):
        """(1) Return all records of the ``guestinfo`` table."""
        return self.hotel_dao.get_records('guestinfo')

    def room_recommend(self, guestname, roomtype):
        """(1) Rank rooms for a guest from the guest's own negative viewpoints.

        For every entity the guest scored below ``keyvalue``, rooms are
        ordered by their stored score for that entity (best first) and the
        rank positions are summed per room. The ranking is only printed;
        ``roomtype`` is not used yet.
        """
        # Lowest sentiment value the guest gave to each entity.
        view_points = {}
        guest_remarks = self.pms_dao.get_guest_remark(guestname)
        for remark in guest_remarks:
            if remark[9] is None:
                continue
            for point, value in json.loads(remark[9]).items():
                if point not in view_points or view_points[point] > value:
                    view_points[point] = value
        # Entities the guest rated below the threshold.
        review_points = [
            {'point': point, 'value': []}
            for point, value in view_points.items()
            if value < self.keyvalue
        ]
        # Parse each room's viewpoint JSON (column 5) once. Checking the
        # parsed keys replaces the old substring test on the raw JSON text,
        # which could pass and then fail with KeyError.
        rooms = self.pms_dao.get_records('roominfo')
        room_scores = [(room[0], json.loads(room[5]))
                       for room in rooms if room[5]]
        for point in review_points:
            for room_id, scores in room_scores:
                if point['point'] in scores:
                    point['value'].append((room_id, scores[point['point']]))
            point['value'].sort(key=lambda x: x[1], reverse=True)
        # Sum the rank positions of every room over all entities.
        toppoint = {}
        for point in review_points:
            for rank, entry in enumerate(point['value']):
                room = str(entry[0])
                toppoint[room] = toppoint.get(room, 0) + rank
        toppoint = sorted(toppoint.items(), key=lambda x: x[1])
        print(toppoint)
        # TODO: the ranking is complete; it still has to be combined with
        # the room type to produce the recommendation.

    def add_record(self, roomid, guestid, intime, outtime, charge):
        """(1) Add a stay record and assign the guest to the room.

        Returns False without changes when the guest already occupies a
        room, otherwise True.
        """
        if self.check_user_roominfo(guestid):
            return False
        #intime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.pms_dao.add_record(roomid, guestid, intime, outtime, charge)
        # Store the guest id on the room.
        self.pms_dao.alter_roominfo(roomid, guestid)
        return True

    def query_floorstate(self, floornum, time):
        """(1, quality inspection) Classify the viewpoints of each room on a floor.

        Returns one entry per room:
        ``[room[0], [red, yellow, green], [room[1], room[3], room[4]]]``.
        The three lists hold the entity names scoring below ``redvalue``,
        below ``yellowvalue`` and everything else. ``time`` is not used.
        """
        if len(floornum) == 1:
            floornum = "0" + floornum
        remarklist = []
        condition = self._remark_condition_template()
        rooms = self.pms_dao.get_hotel_rooms(floornum)
        for room in rooms:
            redpoint = []
            yellowpoint = []
            greenpoint = []
            viewpointlist = self.pms_dao.get_condition_column(
                'viewpoint', 'remark', condition % (room[0],))
            for item in viewpointlist:
                for raw in item:
                    if raw is None:
                        continue
                    for k, v in json.loads(raw).items():
                        if v < self.redvalue:
                            bucket = redpoint
                        elif v < self.yellowvalue:
                            bucket = yellowpoint
                        else:
                            bucket = greenpoint
                        if k not in bucket:
                            bucket.append(k)
            remarklist.append([room[0],
                               [redpoint, yellowpoint, greenpoint],
                               [room[1], room[3], room[4]]])
        return remarklist

    def get_room_remark(self, roomid):
        """(1, quality inspection) Return the highlighted comments of one room."""
        commentlist = []
        condition = self._remark_condition_template()
        roomremark = self.pms_dao.get_condition_column(
            '*', 'remark', condition % (roomid,))
        for remark in roomremark:
            comment = remark[2]
            # A remark without viewpoints is returned unchanged.
            viewpoint = json.loads(remark[9]) if remark[9] is not None else {}
            for point in viewpoint:
                if point in comment:
                    comment = self._highlight(comment, point, viewpoint[point])
            commentlist.append(comment)
        return commentlist

    def get_remark_by_points(self, points, floornum):
        """(1, quality inspection) Return comments on a floor that mention given entities.

        ``points`` is a comma-separated list of entity names. Every match
        yields ``[room number, highlighted comment]``.
        """
        points = points.split(',')
        if len(floornum) == 1:
            floornum = "0" + floornum
        remarklist = []
        condition = self._remark_condition_template()
        rooms = self.pms_dao.get_hotel_rooms(floornum)
        for room in rooms:
            viewpointlist = self.pms_dao.get_condition_column(
                'viewpoint,remark,roomid', 'remark', condition % (room[0],))
            for viewpoint in viewpointlist:
                if viewpoint[0] is None:
                    continue
                # Entities of this comment and their scores
                sqlpoint = json.loads(viewpoint[0])
                comment = viewpoint[1]
                for point in points:
                    # Entities arrive from the front end as UTF-8 bytes.
                    if isinstance(point, bytes):
                        point = point.decode("utf-8")
                    if point in sqlpoint:
                        roomnum = self.pms_dao.get_condition_column(
                            'roomnum', 'roominfo',
                            "roomid='%s'" % (viewpoint[2],))
                        comment = self._highlight(comment, point,
                                                  sqlpoint[point])
                        remarklist.append([roomnum[0][0], comment])
        return remarklist

    def check_user_roominfo(self, guestid):
        """(1) Return True when a ``roominfo`` record has ``guestid`` in column 4."""
        result = self.hotel_dao.get_records('roominfo')
        for row in result:
            if row[4] == guestid:
                return True
        return False

    def insert_roomid(self):
        """(0) Assign a random room id between 1 and 216 to every remark."""
        guidlist = self.pms_dao.get_column_table('guid', 'remark')
        roomids = list(range(1, 217))
        records = []
        for row in guidlist:
            records.append({
                'guid': row[0],
                'roomid': random.sample(roomids, 1)[0]
            })
        self.pms_dao.update_roomid(records)

    def update_guest(self):
        """(0) Save one ``guestinfo`` record per user name in the remark table."""
        usernamelist = self.pms_dao.get_column_table('username', 'remark')
        records = [{'guestname': name[0]} for name in usernamelist]
        self.pms_dao.save_records('guestinfo', records)

    def update_roominfo(self):
        """(0) Recompute the aggregated viewpoint JSON of every room.

        An entity seen again is replaced by the mean of the running value and
        the new one.
        """
        condition = self._remark_condition_template()
        roomlist = self.pms_dao.get_column_table('roomid', 'roominfo')
        for room in roomlist:
            print(room)
            commlist = {}
            viewpointlist = self.pms_dao.get_condition_column(
                'viewpoint', 'remark', condition % (room[0],))
            for item in viewpointlist:
                for raw in item:
                    if raw is None:
                        continue
                    dicti = json.loads(raw)
                    for k in dicti:
                        if k in commlist:
                            commlist[k] = (commlist[k] + dicti[k]) / 2
                        else:
                            commlist[k] = dicti[k]
            remarklist = json.dumps(commlist, ensure_ascii=False)
            self.pms_dao.updata_roominfo_viewpoint(remarklist, room[0])
class TuniuCatcher(object):
    """Fetch the Tuniu hotel list through the API client and store it."""

    def __init__(self):
        """Set the OTA name and create the Tuniu API client and the hotel DAO."""
        self._city = None
        self.__ota_info = "途牛"
        self.tuniu_api_client = TuniuAPIClient()
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"],
                                  dao_setting["user"], dao_setting["password"])

    def setCity(self, city):
        """Select the city that later calls work on."""
        self._city = city

    def getHotelList(self, city_code):
        """Download all list pages of a city and return the hotel entries.

        Returns None (after printing a hint) when no city has been set.
        A failing page is reported and requested again.
        """
        if self._city is None:
            print("未设置城市,请先使用setCity方法")
            return
        hotel_list = []
        page_index = 1
        page_amount = 10000
        while page_index <= page_amount - 1:
            try:
                page_data = self.tuniu_api_client.get_hotel_list(page_index,
                                                                 city_code)
                # The reported hotel total fluctuates between calls, so the
                # smallest page count seen so far is the one that applies.
                reported_pages = page_data["data"]["total"] // 20
                if page_amount > reported_pages:
                    page_amount = reported_pages
                print("page_amount=%d" % page_amount)
                hotel_list.extend(page_data["data"]["list"])
                print("Page_%d Success" % page_index)
                time.sleep(5)
                page_index += 1
            except Exception:
                # NOTE(review): a page that keeps failing is retried forever
                # and without delay; consider bounding the retries.
                print("Page_%d Fail" % page_index)
                continue
        return hotel_list

    def saveHolteList(self, hotel_list):
        """Store API hotel entries in the location and baseinfo tables.

        Hotels whose name is not yet in the location table get a new location
        row. Existing baseinfo rows are updated and flagged as current, rows
        not present in ``hotel_list`` stay flagged as outdated, and unseen
        hotels get a new baseinfo row.
        """
        old_location_info = self.hotel_dao.get_locations(self._city)
        old_baseinfo = [
            list(row)
            for row in self.hotel_dao.get_baseinfo(self._city, self.__ota_info)
        ]
        # Assume every stored row is outdated until the new data confirms it.
        for row in old_baseinfo:
            row[5] = 1
        new_locations = []
        new_baseinfo = []
        update_baseinfo = []
        for item in hotel_list:
            location_id = None
            # Is this hotel already in the location table?
            for location in old_location_info:
                if item["name"] == location[3]:
                    location_id = location[0]
                    break
            # If not, queue a new location row.
            if location_id is None:
                location_id = uuid.uuid1()
                trans_location = CoordTransor.gcj02towgs84(
                    lng=float(item["pos"]["lng"]),
                    lat=float(item["pos"]["lat"]))
                new_locations.append({
                    "guid": location_id,
                    "x": trans_location[1],
                    "y": trans_location[0],
                    "hotel_name": item["name"],
                    "city": self._city,
                    "address": item["address"]
                })
            # Update the baseinfo row of this location, or queue a new one.
            if_exist = False
            for baseinfo in old_baseinfo:
                if location_id == baseinfo[2]:
                    if_exist = True
                    # Compute the increment BEFORE overwriting the stored
                    # count; doing it afterwards always produced 0.
                    increment = (int(item["remarkCount"]) -
                                 int(baseinfo[4] or 0))
                    baseinfo[1] = item["url"]
                    baseinfo[4] = item["remarkCount"]
                    baseinfo[5] = 0
                    baseinfo[6] = increment if increment > 0 else 0
                    baseinfo[7] = item["snapshot"]
                    baseinfo[8] = item["id"]
                    break
            if not if_exist:
                new_baseinfo.append({
                    "guid": uuid.uuid1(),
                    "url": item["url"],
                    "location_id": location_id,
                    "OTA": self.__ota_info,
                    "comm_num": item["remarkCount"],
                    "if_overtime": 0,
                    "incre_num": item["remarkCount"],
                    "img": item["snapshot"],
                    "id_in_ota": item["id"]
                })
        for baseinfo in old_baseinfo:
            update_baseinfo.append({
                "guid": baseinfo[0],
                "url": baseinfo[1],
                "location_id": baseinfo[2],
                "OTA": baseinfo[3],
                "comm_num": baseinfo[4],
                "if_overtime": baseinfo[5],
                "incre_num": baseinfo[6],
                "img": baseinfo[7],
                "id_in_ota": baseinfo[8]
            })
        print("%d %d %d" % (len(new_locations), len(new_baseinfo),
                            len(update_baseinfo)))
        self.hotel_dao.save_locations(new_locations)
        self.hotel_dao.save_baseinfo(new_baseinfo)
        self.hotel_dao.update_baseinfo(update_baseinfo)
def __init__(self):
    """Set the OTA name and create the Tuniu API client and the hotel DAO."""
    # No city is selected until the caller sets one.
    self._city = None
    # Name of the OTA handled by this object
    self.__ota_info = "途牛"
    self.tuniu_api_client = TuniuAPIClient()
    conn_args = tuple(
        dao_setting[key] for key in ("host", "db", "user", "password"))
    self.hotel_dao = HotelDAO(*conn_args)
class HotelDataService(object):
    """Query service for hotel data: login check, comment statistics,
    user traces and comment search, all read through ``HotelDAO``."""

    def __init__(self):
        """Connect the DAO with the values in ``dao_setting``."""
        self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])

    @staticmethod
    def _page_count(total, per_page):
        """Return how many pages ``total`` items need; never less than 1."""
        if per_page <= 0:
            return 1
        return max(1, (total + per_page - 1) // per_page)

    @staticmethod
    def _in_region(polygon, x, y):
        """Return True when no region filter is set or the point is inside it."""
        return polygon is None or polygon.intersects(Point(x, y))

    def check_user(self, user_name, password):
        """Validate a login and return the user's hotel data.

        :return: ``None`` when the user is unknown or the password differs;
            otherwise a dict with ``baseinfo`` (one entry per returned row),
            ``location`` and ``user`` (both taken from the last row).
        """
        userdata = self.hotel_dao.get_user(user_name)
        if not userdata:
            return None
        result_data = {"baseinfo": [], 'location': {}, 'user': {}}
        location = result_data['location']
        user = result_data['user']
        for line in userdata:
            location['location_id'] = line[0]
            location['x'] = line[1]
            location['y'] = line[2]
            location['hotel_name'] = line[3]
            location['city'] = line[4]
            location['address'] = line[5]
            result_data['baseinfo'].append(
                {"id": line[6], "url": line[7], "OTA": line[9], "comm_num": line[10], "img": line[13]}
            )
            user['id'] = line[15]
            user['user_name'] = line[16]
            user['password'] = line[17]
            user['corporation'] = line[19]
            user['img'] = line[20]
        # NOTE(review): the password is compared (and returned) in plain text;
        # presumably the stored value is not hashed -- confirm and migrate.
        if password != user['password']:
            return None
        return result_data

    def get_baseinfo_by_location_id(self, location_id_str):
        """Return, for each comma-separated location id, the list of its
        baseinfo rows as ``{"id", "location_id", "ota"}`` dicts."""
        result_data = []
        for location_id in location_id_str.split(','):
            baseinfo = self.hotel_dao.get_baseinfo_by_location_id(location_id)
            result_data.append([
                {"id": info[0], "location_id": info[2], "ota": info[3]}
                for info in baseinfo
            ])
        return result_data

    def get_comm_type_score_statics(self, baseinfo_id, ota):
        """Return the OTA-specific comment statistics, or ``None`` for an
        OTA other than 携程 / 途牛."""
        result_data = None
        if ota == u'携程':
            result_data = self.hotel_dao.get_comm_score_statics(baseinfo_id)
        elif ota == u'途牛':
            result_data = self.hotel_dao.get_comm_type_statics(baseinfo_id)
        return result_data

    def get_comm_viewpoints(self, hotel_name_str):
        """Return the entity viewpoints of each comma-separated hotel name."""
        return [
            {"hotel_name": hotel_name, "viewpoint": self.get_comm_viewpoint(hotel_name)}
            for hotel_name in hotel_name_str.split(',')
        ]

    def get_comm_viewpoint(self, hotel_name):
        """Aggregate a hotel's comments into per-entity viewpoint scores.

        Each comment carries a JSON object (column 9) mapping an entity to a
        score; the result maps every entity to the arithmetic mean of its
        scores.  Comments that cannot be parsed are logged and skipped.
        """
        totals = {}
        counts = {}
        comments = self.hotel_dao.get_remarks_by_hotel_name(hotel_name.encode('utf-8'))
        for comment in comments:
            try:
                viewpoint = json.loads(comment[9])
                for key in viewpoint:
                    # Sum and count separately: a running (old + new) / 2
                    # would weight the latest comment by one half.
                    totals[key] = totals.get(key, 0.0) + viewpoint[key]
                    counts[key] = counts.get(key, 0) + 1
            except Exception:
                traceback.print_exc()
                continue
        return dict((key, totals[key] / counts[key]) for key in totals)

    def get_comm_adjective_statics(self, baseinfo_id):
        """Sum adjective counts over a hotel's comments.

        :return: list of ``(adjective, total)`` pairs, highest total first.
            Comments whose adjective JSON (column 10) is unusable are skipped.
        """
        adjective_statics = {}
        comments = self.hotel_dao.get_remarks_by_baseinfo_id(baseinfo_id)
        for comment in comments:
            try:
                adjectives = json.loads(comment[10])
                for key in adjectives:
                    if key in adjective_statics:
                        adjective_statics[key] = adjective_statics[key] + adjectives[key]
                    else:
                        adjective_statics[key] = adjectives[key]
            except Exception:
                continue
        return sorted(adjective_statics.items(), key=lambda pair: pair[1], reverse=True)

    def get_user_flow_to_html(self, hotel_name, base_info_id_str, page, count=10, start_time=None, end_time=None, ring_str=None):
        """Render one page of the hotels that this hotel's guests also visited.

        Trace points of all given baseinfo ids are merged by coordinate,
        points whose name occurs in ``hotel_name`` are dropped, and the rest
        are ranked by visit count.

        :return: ``{"pageNum": <number of pages>, "html": <table rows>}``.
        """
        merged = {}
        for baseinfo_id in base_info_id_str.split(','):
            if baseinfo_id == "":
                continue
            points = self.get_trace(baseinfo_id, start_time, end_time, ring_str)["point"]
            for key in points:
                point = points[key]
                if point["name"] in hotel_name:
                    continue
                if key not in merged:
                    merged[key] = point
                else:
                    merged[key]["value"] += point["value"]
        ranked = sorted(merged.items(), key=lambda pair: pair[1]["value"], reverse=True)
        # NOTE(review): the price column is a random placeholder, not real
        # data, and names are inserted without HTML escaping.
        rows = [
            "<tr><td name='name'>%s</td><td name='count'>%s</td><td name='price'>%s</td></tr>"
            % (point["name"], point["value"], random.randint(340, 500))
            for _key, point in ranked[count * (page - 1):count * page]
        ]
        return {"pageNum": self._page_count(len(ranked), count), "html": "".join(rows)}

    def get_user_trace(self, baseinfo_id_str, start_time=None, end_time=None, ring_str=None):
        """Return the trace (see ``get_trace``) of every non-empty id in the
        comma-separated ``baseinfo_id_str``."""
        return [
            self.get_trace(baseinfo_id, start_time, end_time, ring_str)
            for baseinfo_id in baseinfo_id_str.split(',')
            if baseinfo_id != ""
        ]

    def get_trace(self, baseinfo_id, start_time=None, end_time=None, ring_str=None):
        """Build the movement trace of the users who commented on a hotel.

        :param baseinfo_id: hotel whose commenting users are traced.
        :param start_time: accepted for interface compatibility; not used.
        :param end_time: accepted for interface compatibility; not used.
        :param ring_str: optional JSON ring; when given, only points that
            intersect the polygon built from it are kept.
        :return: ``{"line": [[start, end], ...], "point": {coord: {...}}}``
            where every point carries ``name``, ``geoCoord`` and a visit
            counter ``value``.
        """
        # Parse the optional region once instead of inside both loops.
        polygon = Polygon(json.loads(ring_str)) if ring_str is not None else None
        lines = []
        points = {}
        for user in self.hotel_dao.get_hotel_trace_users(baseinfo_id):
            # Remarks carry the hotel name (13) and its coordinates (14, 15).
            remarks = self.hotel_dao.get_remarks_by_username(user[0])

            # Trace lines between consecutive remarks.
            for i in range(len(remarks) - 1):
                current = remarks[i]
                following = remarks[i + 1]
                if not (self._in_region(polygon, current[14], current[15])
                        and self._in_region(polygon, following[14], following[15])):
                    continue
                # Compare both coordinates: two places can share one of them
                # and still be different locations.
                if (current[14], current[15]) != (following[14], following[15]):
                    start_point = {"name": current[13], "geoCoord": [current[14], current[15]]}
                    end_point = {"name": following[13], "geoCoord": [following[14], following[15]]}
                    lines.append([start_point, end_point])

            # Trace points, counted per coordinate.
            for remark in remarks:
                if not self._in_region(polygon, remark[14], remark[15]):
                    continue
                coord_str = str(remark[15]) + "," + str(remark[14])
                if coord_str not in points:
                    points[coord_str] = {
                        "name": remark[13],
                        "geoCoord": [remark[14], remark[15]],
                        "value": 1
                    }
                else:
                    points[coord_str]["value"] += 1
        return {"line": lines, "point": points}

    def get_comm_by_text(self, hotel_name, page, text=None, count=20, ota=None):
        """Return one page of a hotel's comments with entities highlighted.

        Every entity of a comment's viewpoint JSON (column 9) is wrapped in a
        tooltip link showing its score; the entity equal to ``text`` is
        additionally coloured red.

        :return: ``{"pageNum": <number of pages>, "comments_info": [rows]}``
            where each row is the comment as a list with column 2 annotated.
        """
        comments = self.hotel_dao.get_remarks_by_text(hotel_name.encode("utf-8"), text, ota)
        # Accept both byte strings and already-decoded text.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        comments_changed = []
        for comm in comments[count * (page - 1):count * page]:
            comm = list(comm)
            try:
                viewpoint = json.loads(comm[9])
            except (TypeError, ValueError):
                # A comment without usable viewpoint JSON is returned as-is
                # instead of failing the whole page.
                viewpoint = {}
            if not isinstance(viewpoint, dict):
                viewpoint = {}
            # Literal replacement: entity text is not a regular expression.
            for feature in viewpoint:
                if not feature:
                    continue
                if feature != text:
                    markup = '<a title="' + '%.2f' % viewpoint[feature] + '" data-toggle="tooltip" href="#"><b>' + feature + '</b></a>'
                else:
                    markup = '<a title="' + '%.2f' % viewpoint[feature] + '" data-toggle="tooltip" href="#"><b> <span style="color:red">' + feature + '</span></b></a>'
                comm[2] = comm[2].replace(feature, markup)
            # NOTE(review): this pass iterates the viewpoint entities again
            # (column 9); presumably the adjectives of column 10 were meant --
            # confirm the row schema before changing it.
            for adjective in viewpoint:
                if not adjective:
                    continue
                comm[2] = comm[2].replace(adjective, '<b>' + adjective + '</b>')
            comments_changed.append(comm)
        # Page count follows ``count`` (the slice size), not a fixed 20.
        return {"pageNum": self._page_count(len(comments), count), "comments_info": comments_changed}

    def get_location(self, location_id):
        """Return the location row of ``location_id``.

        :return: ``{'data': {...}, 'status': 200}`` when exactly one row is
            found, otherwise ``{'status': 0}``.
        """
        data = self.hotel_dao.get_hotel_name_by_location_id(location_id)
        if len(data) != 1:
            return {'status': 0}
        location = data[0]
        return {
            'data': {
                'location_id': location[0],
                'x': location[1],
                'y': location[2],
                'hotel_name': location[3],
                'address': location[5]
            },
            'status': 200
        }