class KeywordsHandler(object): def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) self.thu = thulac("-input cs.txt") def extract_keyword(self): sents = [] comm_list = self.dao.get_hotel_comments() # 从语料中读取每一行并切分成子句 for comm in comm_list: sents.extend(normal.get_sentences(comm[2])) print "length of sentences:%d"%len(sents) # 每个子句进行词性判读 pos_sents = [] for sent in sents: pos_sents.append(pseg.cut(sent)) print "length of pos_sents:%d"%len(pos_sents) # 分拣出名词,并进行统计 print "counting" noun_dict = {} for pos_sent in pos_sents: for key,type in pos_sent: if type == "n": if key not in noun_dict: noun_dict[key] = 1 else: noun_dict[key] = noun_dict[key] + 1 a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True) return a def extract_keyword_by_thulac(self): sents = [] comm_list = self.dao.get_hotel_comments() # 从语料中读取每一行并切分成子句 for comm in comm_list: sents.extend(normal.get_sentences(comm[2])) print "length of sentences:%d"%len(sents) # 每个子句进行词性判读 pos_sents = [] for sent in sents: try: pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8")))) except: print sent continue print "length of pos_sents:%d"%len(pos_sents) # 分拣出名词,并进行统计 print "counting" noun_dict = {} for pos_sent in pos_sents: for word in pos_sent: if word[1] == "n": if word[0] not in noun_dict: noun_dict[word[0]] = 1 else: noun_dict[word[0]] = noun_dict[word[0]] + 1 a = sorted(noun_dict.iteritems(),key=lambda asd:asd[1],reverse=True) return a
def __init__(self): HotelService.__init__(self) # 酒店dao self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # 途牛dao self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # 自然语言处理 self.hotelNLP = HotelNLP() # 存放列表页数据 self.listPageInfo = [] # 存放酒店详情数据 self.hotelItem = {} # 存放酒店评论数据 self.commList = [] # 存放酒店价格数据 self.priceList = [] self.ifCrawlHotelInfo = True self.__ota_info = "途牛"
# -*- coding:utf-8 -*- __author__ = 'DreamCathcer' import traceback from dao.hotel.TuniuDao import TuniuDAO from setting import local_hotel_setting from service.map.baidu.APIService import BaiduMapAPIService # 配置数据库 dao_setting = local_hotel_setting dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) map_service = BaiduMapAPIService("MviPFAcx5I6f1FkRQlq6iTxc") hotellist = dao.get_hotelinfo() # 酒店地理编码容器 hotel_location = [] # 遍历酒店信息,取出酒店名称进行地理编码 for i in range(0, len(hotellist)): geocoding_info = map_service.doGeocoding(hotellist[i][1]) try: geocoding_info = { "hotel_name": hotellist[i][1], "x": geocoding_info["result"]["location"]["lng"], "y": geocoding_info["result"]["location"]["lat"] }
# -*- coding:utf-8 -*- __author__ = 'DreamCathcer' import traceback from dao.hotel.TuniuDao import TuniuDAO from setting import local_hotel_setting from service.map.baidu.APIService import BaiduMapAPIService # 配置数据库 dao_setting = local_hotel_setting dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) map_service = BaiduMapAPIService("MviPFAcx5I6f1FkRQlq6iTxc") hotellist = dao.get_hotelinfo() # 酒店地理编码容器 hotel_location = [] # 遍历酒店信息,取出酒店名称进行地理编码 for i in range(0, len(hotellist)): geocoding_info = map_service.doGeocoding(hotellist[i][1]) try: geocoding_info = {"hotel_name":hotellist[i][1], "x":geocoding_info["result"]["location"]["lng"], "y":geocoding_info["result"]["location"]["lat"]} except: traceback.print_exc() continue hotel_location.append(geocoding_info) print "%d done"%i
def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) self.thu = thulac("-input cs.txt")
class KeywordsHandler(object): def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) self.thu = thulac("-input cs.txt") def extract_keyword(self): sents = [] comm_list = self.dao.get_hotel_comments() # 从语料中读取每一行并切分成子句 for comm in comm_list: sents.extend(normal.get_sentences(comm[2])) print "length of sentences:%d" % len(sents) # 每个子句进行词性判读 pos_sents = [] for sent in sents: pos_sents.append(pseg.cut(sent)) print "length of pos_sents:%d" % len(pos_sents) # 分拣出名词,并进行统计 print "counting" noun_dict = {} for pos_sent in pos_sents: for key, type in pos_sent: if type == "n": if key not in noun_dict: noun_dict[key] = 1 else: noun_dict[key] = noun_dict[key] + 1 a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True) return a def extract_keyword_by_thulac(self): sents = [] comm_list = self.dao.get_hotel_comments() # 从语料中读取每一行并切分成子句 for comm in comm_list: sents.extend(normal.get_sentences(comm[2])) print "length of sentences:%d" % len(sents) # 每个子句进行词性判读 pos_sents = [] for sent in sents: try: pos_sents.append( map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8")))) except: print sent continue print "length of pos_sents:%d" % len(pos_sents) # 分拣出名词,并进行统计 print "counting" noun_dict = {} for pos_sent in pos_sents: for word in pos_sent: if word[1] == "n": if word[0] not in noun_dict: noun_dict[word[0]] = 1 else: noun_dict[word[0]] = noun_dict[word[0]] + 1 a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True) return a
def __init__(self): self.hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])
import uuid import re import traceback from dao.hotel.HotelDAO import HotelDAO from dao.hotel.xiechengdao.xiecheng import xiechengDAO from dao.hotel.TuniuDao import TuniuDAO from setting import local_hotel_setting # 配置数据库 dao_setting = local_hotel_setting hotel_dao = HotelDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) tuniu_dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) xiecheng_dao = xiechengDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # tuniu_comm = tuniu_dao.get_remarks() # # hotel_comm = [] # i = 0 # for comm in tuniu_comm: # i+=1 # print i # baseinfo = hotel_dao.get_baseinfo_by_hotelname(comm[10].encode('utf-8'), '南京') # for info in baseinfo: # if info[3].encode('utf-8') == '途牛': # hotel_comm.append({ # "guid":uuid.uuid1(),
def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # 存储床价信息 self.bed = {}
class TuniuDataService(object): def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # 存储床价信息 self.bed = {} ''' 返回各房型剩余房间数信息 ''' def getbed_roomnum(self, hotelname): roomshengyv = [] hotel_List = hotelname.split(',') hotel1 = [["3-21", "4-14", "4-17", "4-19", "4-21"], ["大床房", "豪华套间", "双人标间", "商务大床房", "精选大床房"], [28, 39, 55, 16, 71], [33, 65, 21, 56, 85], [13, 16, 59, 41, 26], [56, 90, 12, 52, 32], [42, 16, 59, 36, 21]] hotel2 = [["3-21", "4-14", "4-17", "4-19", "4-21"], ["标准双人房", "大床房", "精选大床房", "商务大床房", "豪华家庭房"], [55, 13, 62, 42, 190], [63, 26, 75, 41, 64], [20, 62, 61, 42, 71], [13, 65, 26, 58, 49], [74, 52, 68, 23, 62]] hotel3 = [["3-21", "4-14", "4-17", "4-19", "4-21"], ["精选大床房", "特价房", "豪华套间", "豪华大床房", "大床房"], [28, 56, 23, 91, 42], [62, 52, 12, 36, 55], [100, 56, 23, 81, 20], [63, 59, 74, 52, 12], [63, 69, 65, 21, 35]] hotel4 = [["3-21", "4-14", "4-17", "4-19", "4-21"], ["豪华家庭房", "大床房", "商务大床房", "豪华套间", "精选大床房"], [55, 21, 25, 69, 54], [74, 51, 23, 65, 85], [51, 20, 52, 36, 57], [85, 42, 16, 32, 54], [63, 56, 85, 41, 21]] if len(hotel_List) == 1: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) return roomshengyv elif len(hotel_List) == 2: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) return roomshengyv elif len(hotel_List) == 3: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) roomshengyv.append(hotel_List[2]) roomshengyv.append(hotel3) return roomshengyv else: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) roomshengyv.append(hotel_List[2]) roomshengyv.append(hotel3) roomshengyv.append(hotel_List[3]) roomshengyv.append(hotel4) return roomshengyv ''' 从数据库获取酒店名称所对应房间信息 ''' # 从数据库获取酒店名称所对应房间信息 def getbed_info(self, hotelname): bed = [] hotel_list = hotelname.split(',') for hotel in hotel_list: bedinfo = self.get_bedcommpent(hotel) bed.append(hotel) bed.append(bedinfo) return bed def get_bedcommpent(self, hotel): dataNum = self.getdatanum(hotel) bedInfo = [] bedtype = [] # 所有床型 bedtypeinfo = [] #去掉了重复的床型 # 添加日期对应,价格信息 bedprais = {} bedlist = self.dao._returnbed(hotel) for bedList in bedlist: if bedList[1] in bedtype: pass else: bedtypeinfo.append(bedList[1]) for bedtyp in bedtypeinfo: for bedList in bedlist: if bedList[1] == bedtyp: bedpraise = re.sub('\D', '', str(bedList[7])) bedprais[bedList[8]] = bedpraise if bedList[1] not in bedInfo: if len(bedprais) == dataNum: Bedtypeinfo = sorted(bedprais.iteritems(), key=lambda d: d[0].split('-')) bedInfo.append(bedList[1]) bedInfo.append(Bedtypeinfo) return bedInfo # 获得日期总数 def getdatanum(self, hotel): datanum = [] bedlist = self.dao._returnbed(hotel) for bedList in bedlist: if bedList[8] not in datanum: datanum.append(bedList[8]) return len(datanum) ''' 获取到酒店评论类型的评论数 ''' def get_comm_type_num(self, hotel_name): return self.dao.get_comm_type_num(hotel_name) ''' 酒店评论观点统计 ''' def get_comm_viewpoints(self, hotel_name): hotel_list = hotel_name.split(',') viewpoints = [] for hotel in hotel_list: viewpoint = self.get_comm_viewpoint(hotel) viewpoints.append({"hotel_name": hotel, "viewpoint": viewpoint}) return viewpoints def get_comm_viewpoint(self, hotel_name): viewpoint_statics = {} comments = self.dao.get_hotel_comments_by_name(hotel_name) for comment in comments: # 反序列化字符串 viewpoint = json.loads(comment[8]) # 遍历key值 for key in viewpoint: if key in viewpoint_statics: viewpoint_statics[key] = (viewpoint_statics[key] + viewpoint[key]) / 2 else: viewpoint_statics[key] = viewpoint[key] return viewpoint_statics ''' 酒店评论形容词统计 ''' def get_comm_adjective_statics(self, hotel_name): adjective_statics = {} comments = self.dao.get_hotel_comments_by_name(hotel_name) for comment in comments: # 反序列化字符串 viewpoint = json.loads(comment[9]) # 遍历key值 for key in viewpoint: if key in adjective_statics: adjective_statics[ key] = adjective_statics[key] + viewpoint[key] else: adjective_statics[key] = viewpoint[key] adjective_statics = sorted(adjective_statics.iteritems(), key=lambda asd: asd[1], reverse=True) return adjective_statics ''' 根据文本获取相关评论 ''' def get_comm_by_text(self, hotel_name, text, page, count=20): comments = self.dao.get_hotel_comments_by_text(hotel_name, text) comments_changed = [] # 遍历评论,加粗其中关于酒店实体的文字 for comm in comments[count * (page - 1):count * page]: comm = list(comm) viewpoint = json.loads(comm[8]) for feature in viewpoint: if feature != text: comm[2] = re.sub( feature, '<a title="' + '%.2f' % viewpoint[feature] + '" data-toggle="tooltip" href="#"><b>' + feature + '</b></a>', comm[2]) else: comm[2] = re.sub( feature, '<a title="' + '%.2f' % viewpoint[feature] + '" data-toggle="tooltip" href="#"><b> <span style="color:red">' + feature + '</span></b></a>', comm[2]) for adjective in json.loads(comm[9]): comm[2] = re.sub(adjective, '<b>' + adjective + '</b>', comm[2]) comments_changed.append(comm) return { "pageNum": len(comments) / 20 + 1, "comments_info": comments_changed }
class TuniuDataService(object): def __init__(self): self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"]) # 存储床价信息 self.bed = {} ''' 返回各房型剩余房间数信息 ''' def getbed_roomnum(self,hotelname): roomshengyv = [] hotel_List = hotelname.split(',') hotel1 = [["3-21","4-14","4-17","4-19","4-21"],["大床房","豪华套间","双人标间","商务大床房","精选大床房"],[28,39,55,16,71],[33,65,21,56,85],[13,16,59,41,26],[56,90,12,52,32],[42,16,59,36,21]] hotel2 = [["3-21","4-14","4-17","4-19","4-21"],["标准双人房","大床房","精选大床房","商务大床房","豪华家庭房"],[55,13,62,42,190],[63,26,75,41,64],[20,62,61,42,71],[13,65,26,58,49],[74,52,68,23,62]] hotel3 = [["3-21","4-14","4-17","4-19","4-21"],["精选大床房","特价房","豪华套间","豪华大床房","大床房"],[28,56,23,91,42],[62,52,12,36,55],[100,56,23,81,20],[63,59,74,52,12],[63,69,65,21,35]] hotel4 = [["3-21","4-14","4-17","4-19","4-21"],["豪华家庭房","大床房","商务大床房","豪华套间","精选大床房"],[55,21,25,69,54],[74,51,23,65,85],[51,20,52,36,57],[85,42,16,32,54],[63,56,85,41,21]] if len(hotel_List) == 1: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) return roomshengyv elif len(hotel_List) == 2: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) return roomshengyv elif len(hotel_List) == 3: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) roomshengyv.append(hotel_List[2]) roomshengyv.append(hotel3) return roomshengyv else: roomshengyv.append(hotel_List[0]) roomshengyv.append(hotel1) roomshengyv.append(hotel_List[1]) roomshengyv.append(hotel2) roomshengyv.append(hotel_List[2]) roomshengyv.append(hotel3) roomshengyv.append(hotel_List[3]) roomshengyv.append(hotel4) return roomshengyv ''' 从数据库获取酒店名称所对应房间信息 ''' # 从数据库获取酒店名称所对应房间信息 def getbed_info(self,hotelname): bed = [] hotel_list = hotelname.split(',') for hotel in hotel_list: bedinfo = self.get_bedcommpent(hotel) bed.append(hotel) bed.append(bedinfo) return bed def get_bedcommpent(self,hotel): dataNum = self.getdatanum(hotel) bedInfo = [] bedtype = [] # 所有床型 bedtypeinfo = [] #去掉了重复的床型 # 添加日期对应,价格信息 bedprais = {} bedlist = self.dao._returnbed(hotel) for bedList in bedlist: if bedList[1] in bedtype: pass else:bedtypeinfo.append(bedList[1]) for bedtyp in bedtypeinfo: for bedList in bedlist: if bedList[1] == bedtyp: bedpraise = re.sub('\D','',str(bedList[7])) bedprais[bedList[8]] = bedpraise if bedList[1] not in bedInfo: if len(bedprais) == dataNum: Bedtypeinfo = sorted(bedprais.iteritems(),key=lambda d:d[0].split('-')) bedInfo.append(bedList[1]) bedInfo.append(Bedtypeinfo) return bedInfo # 获得日期总数 def getdatanum(self,hotel): datanum = [] bedlist = self.dao._returnbed(hotel) for bedList in bedlist: if bedList[8] not in datanum: datanum.append(bedList[8]) return len(datanum) ''' 获取到酒店评论类型的评论数 ''' def get_comm_type_num(self, hotel_name): return self.dao.get_comm_type_num(hotel_name) ''' 酒店评论观点统计 ''' def get_comm_viewpoints(self,hotel_name): hotel_list = hotel_name.split(',') viewpoints = [] for hotel in hotel_list: viewpoint = self.get_comm_viewpoint(hotel) viewpoints.append({"hotel_name":hotel,"viewpoint":viewpoint}) return viewpoints def get_comm_viewpoint(self, hotel_name): viewpoint_statics = {} comments = self.dao.get_hotel_comments_by_name(hotel_name) for comment in comments: # 反序列化字符串 viewpoint = json.loads(comment[8]) # 遍历key值 for key in viewpoint: if key in viewpoint_statics: viewpoint_statics[key] = (viewpoint_statics[key] + viewpoint[key])/2 else: viewpoint_statics[key] = viewpoint[key] return viewpoint_statics ''' 酒店评论形容词统计 ''' def get_comm_adjective_statics(self, hotel_name): adjective_statics = {} comments = self.dao.get_hotel_comments_by_name(hotel_name) for comment in comments: # 反序列化字符串 viewpoint = json.loads(comment[9]) # 遍历key值 for key in viewpoint: if key in adjective_statics: adjective_statics[key] = adjective_statics[key] + viewpoint[key] else: adjective_statics[key] = viewpoint[key] adjective_statics = sorted(adjective_statics.iteritems(),key=lambda asd:asd[1],reverse=True) return adjective_statics ''' 根据文本获取相关评论 ''' def get_comm_by_text(self, hotel_name, text, page, count=20): comments = self.dao.get_hotel_comments_by_text(hotel_name, text) comments_changed = [] # 遍历评论,加粗其中关于酒店实体的文字 for comm in comments[count*(page-1):count*page]: comm = list(comm) viewpoint = json.loads(comm[8]) for feature in viewpoint: if feature != text: comm[2] = re.sub(feature,'<a title="'+ '%.2f'%viewpoint[feature] + '" data-toggle="tooltip" href="#"><b>' + feature + '</b></a>',comm[2]) else: comm[2] = re.sub(feature,'<a title="'+ '%.2f'%viewpoint[feature] + '" data-toggle="tooltip" href="#"><b> <span style="color:red">' + feature + '</span></b></a>',comm[2]) for adjective in json.loads(comm[9]): comm[2] = re.sub(adjective,'<b>' + adjective + '</b>',comm[2]) comments_changed.append(comm) return {"pageNum":len(comments)/20 + 1,"comments_info":comments_changed}