def touch_house(self, param):
    """Set ``touchTime`` to the database's current time for one ``houseA`` row.

    Args:
        param: Bind values for the statement; the ``id`` key selects the row.

    Any exception raised while executing or committing is logged and
    swallowed, so this method never propagates a failure to its caller.
    """
    statement = "update houseA set touchTime=now() where id=%(id)s"
    try:
        self._conn.execute(statement, param)
        self._conn.end("commit")
    except Exception as err:
        # Best-effort update: report the failure and carry on.
        log.error('-------------------------update house touch_time error-----------------------')
        log.error(err)
def write_room(self, param):
    """Insert one row into the ``room`` table (``insert ignore``) and commit.

    Args:
        param: Bind values; must contain ``id``, ``title``, ``area``,
            ``window`` and ``direction``.

    Any exception raised while executing or committing is logged and
    swallowed, so this method never propagates a failure to its caller.
    """
    columns = ('id', 'title', 'area', 'window', 'direction')
    placeholders = ','.join('%({0})s'.format(name) for name in columns)
    statement = (
        'insert ignore into room (' + ','.join(columns) + ') '
        'VALUES(' + placeholders + ')'
    )
    try:
        self._conn.execute(statement, param)
        self._conn.end("commit")
    except Exception as err:
        # Best-effort insert: report the failure and carry on.
        log.error('-------------------------insert room table error-----------------------')
        log.error(err)
def analyze(self):
    """Generate today's aggregate statistics tables and commit them.

    Runs four ``insert IGNORE ... select`` statements in order:

    1. ``avgDistrict``  - averages per (district, bedroom_num)
    2. ``avgPosition``  - averages per (position, bedroom_num)
    3. ``avgEstate``    - averages per (housing_estate, bedroom_num)
    4. ``maininfo``     - overall counts/averages joined with today's
       ``pricelog`` up/down counts

    The first three only consider ``houseA`` rows whose ``touchTime`` is
    today and only keep groups with more than 10 rows.

    Raises:
        Exception: whatever the connection raised; it is logged first and
            then re-raised.
    """
    # The three per-dimension statements are identical apart from the target
    # table and the grouping column, so they are rendered from one template.
    avg_template = (
        "insert IGNORE into {table}(ymd,{col},bedroom_num,unit_price,total_price,"
        "build_area,inside_area,num_house,follow,take_look) "
        "select DATE_FORMAT(NOW(),'%Y%m%d') as dd,{col},bedroom_num, "
        "cast(sum(unit_price)/count(0) as DECIMAL(10,2)) as avgUnitPrice, "
        "cast(sum(total_price)/count(0) as DECIMAL(10,2)) as avgTotalPrice, "
        "cast(sum(build_area)/count(0) as DECIMAL(10,2)) as avgBuildArea, "
        "cast(sum(inside_area)/count(0) as DECIMAL(10,2)) as avgInsideArea, "
        "count(0) as numhouse, "
        "cast(sum(follow)/count(0) as DECIMAL(10,2)) as avgFollow, "
        "cast(sum(take_look)/count(0) as DECIMAL(10,2)) as avgTakeLook "
        "from houseA where DATEDIFF(touchTime, NOW())=0 "
        "group by {col},bedroom_num having count(0)>10 "
        "order by {col},bedroom_num"
    )
    main_sql = (
        "insert IGNORE into maininfo(ymd, num_house,avg_unit_price,avg_total_price,"
        "num_priceup,num_pricedown) "
        "select DATE_FORMAT(NOW(),'%Y%m%d') as ymd, "
        "numhouse,unit_price,total_price,numup,numdown from "
        "(select count(0) as numhouse,sum(unit_price)/count(0) as unit_price,"
        "sum(total_price)/count(0) as total_price, 1 as col "
        "from houseA where DATEDIFF(touchTime,NOW())=0) as h "
        "inner join (select count(0) as numup, 1 as col from pricelog "
        "where pricetrend='up' and DATEDIFF(createDate,NOW())=0) as up "
        "on h.col=up.col "
        "inner join (select count(0) as numdown,1 as col from pricelog "
        "where pricetrend='down' and DATEDIFF(createDate,NOW())=0) as down "
        "on h.col=down.col"
    )
    # (debug-log prefix, target table, grouping column), in execution order.
    aggregates = (
        ('行政区生成统计数据条数:', 'avgDistrict', 'district'),
        ('区域生成统计数据条数:', 'avgPosition', 'position'),
        ('小区生成统计数据条数:', 'avgEstate', 'housing_estate'),
    )
    try:
        for label, table, col in aggregates:
            affected = self._conn.execute(
                avg_template.format(table=table, col=col))
            log.debug(label + repr(affected))
        self._conn.execute(main_sql)
        log.debug('mainInfo data generated successful')
        self._conn.end("commit")
    except Exception as err:
        log.error('===============analyze data error========')
        log.error(err)
        raise err
def write_house(self, param):
    """Insert one listing into ``houseA`` (``insert ignore``) and commit.

    Args:
        param: Bind values for every column named in the statement
            (``id``, ``title``, ``housing_estate`` ... ``build_type``).

    Any exception raised while executing or committing is logged and
    swallowed, so this method never propagates a failure to its caller.
    """
    # Adjacent literals are concatenated by the compiler; the resulting text
    # keeps the statement's original spacing exactly.
    statement = (
        "insert ignore into houseA (id,title,housing_estate,position,district,city,"
        "bedroom_num,livingroom_num,bathroom_num,build_area,inside_area,"
        "unit_price,total_price,follow, "
        "take_look,pub_date,build_year, "
        "lastdeal_date,yearlimit,use_type,use_year,ownership,fitment,"
        "elevator_num,house_num,structure,tag,build_structure,floor,"
        "direction,build_type) "
        "VALUES(%(id)s,%(title)s,%(housing_estate)s, "
        "%(position)s,%(district)s,%(city)s,%(bedroom_num)s,"
        "%(livingroom_num)s,%(bathroom_num)s,%(build_area)s,%(inside_area)s, "
        "%(unit_price)s,%(total_price)s,%(follow)s,%(take_look)s,"
        "%(pub_date)s,%(build_year)s,%(lastdeal_date)s,%(yearlimit)s,"
        "%(use_type)s,%(use_year)s,%(ownership)s, "
        "%(fitment)s,%(elevator_num)s,%(house_num)s,%(structure)s,%(tag)s,"
        "%(build_structure)s,%(floor)s,%(direction)s,%(build_type)s)"
    )
    try:
        self._conn.execute(statement, param)
        self._conn.end("commit")
    except Exception as err:
        # Best-effort insert: report the failure and carry on.
        log.error('=========================insert house table error===================')
        log.error(err)
def update_houseprice(self, houseparam, pricelogparam):
    """Record a price change and update the listing, then commit once.

    The ``pricelog`` insert runs first, followed by the ``houseA`` update;
    both are committed together by a single ``end("commit")``.

    Args:
        houseparam: Bind values for the ``houseA`` update (``id``,
            ``unit_price``, ``total_price``, ``build_area``).
        pricelogparam: Bind values for the ``pricelog`` insert (``id``,
            ``unit_price``, ``total_price``, ``unit_change``,
            ``total_change``, ``pricetrend``, ``pricetype``, ``thedate``).

    Any exception raised while executing or committing is logged and
    swallowed, so this method never propagates a failure to its caller.
    """
    log_insert = (
        "insert ignore into pricelog (id,unit_price,total_price,unit_change,"
        "total_change,pricetrend,pricetype,thedate,createDate) "
        "VALUES(%(id)s,%(unit_price)s,%(total_price)s,%(unit_change)s,"
        "%(total_change)s,%(pricetrend)s,%(pricetype)s,%(thedate)s,now())"
    )
    house_update = (
        "update houseA set unit_price=%(unit_price)s,"
        "total_price=%(total_price)s,build_area=%(build_area)s,"
        "updateTime=now() where id=%(id)s"
    )
    steps = ((log_insert, pricelogparam), (house_update, houseparam))
    try:
        for statement, binds in steps:
            self._conn.execute(statement, binds)
        self._conn.end("commit")
    except Exception as err:
        # Best-effort update: report the failure and carry on.
        log.error('-------------------------update house updatetime error-----------------------')
        log.error(err)
def collect(self, param):
    """Crawl cd.lianjia.com second-hand listings for one district/filter.

    Walks up to 100 result pages. For every listing on a page:

    * If ``self.get_house(id)`` already knows the listing, its
      ``touchTime`` is refreshed and, when the total price differs from
      the stored one, a price-log row is written via
      ``update_houseprice``. No detail page is fetched.
    * Otherwise the detail page is downloaded and parsed, and the listing
      is stored via ``write_house`` followed by one ``write_room`` per
      room row.

    A failure while processing one listing is logged and the loop moves on
    to the next listing. Paging stops at the first page that has no
    ``ul.sellListContent`` element. Each page takes at least roughly
    ``shortestWaitTime`` seconds (the crawler sleeps for the remainder).

    Args:
        param: Mapping with string values under ``'district'`` (URL path
            segment) and ``'l'`` (value appended to the ``/l`` URL filter
            segment).

    Returns:
        None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    }
    # Room summary on the detail page: <n>室<n>厅<n>厨<n>卫
    # (bedrooms / living rooms / kitchens / bathrooms).
    patternRoom = re.compile(r'\D*(\d+)室(\d+)厅(\d*)厨*(?P<bath>\d*)卫*\s*')
    page_max = 100
    shortestWaitTime = 12  # minimum seconds spent per result page
    cnt = 0
    district = param['district']
    list_filter = param['l']
    base_url = ('https://cd.lianjia.com/ershoufang/' + district +
                '/l' + list_filter)

    for i in range(1, page_max + 1):
        # The first page has no "/pgN" suffix.
        pageUrl = base_url + '/' if i == 1 else base_url + '/pg' + str(i)
        startTime = time.time()
        log.info('开始采集: ' + pageUrl)
        res = requests.get(pageUrl, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        ul = soup.find('ul', class_='sellListContent')
        if ul is None:
            break

        for li in ul.find_all('li'):
            # Bind these before the try block so the except handler below can
            # always format its message, even when the very first lookup
            # fails, and never reports the previous listing's URL.
            house_param = {}
            houseUrl = ''
            try:
                cnt += 1
                title_div = li.find('div', class_='title')
                house_param['title'] = title_div.text
                houseUrl = title_div.find('a').attrs['href']
                houseId = re.sub(r"\D", "", houseUrl)
                house_param['id'] = houseId
                log.info(district + '-' + list_filter + '-' + str(cnt) +
                         ': ' + houseId + ':' + house_param['title'] +
                         ' ' + houseUrl)

                # ---- prices -------------------------------------------
                totalprice = decimal.Decimal(re.sub(
                    r"[^\d\.]", "",
                    li.find('div', class_='totalPrice').text))
                house_param['total_price'] = totalprice
                unitprice = decimal.Decimal(re.sub(
                    r"[^\d\.]", "",
                    li.find('div', class_='unitPrice').text))
                house_param['unit_price'] = unitprice

                # ---- "houseInfo" summary line -------------------------
                # Sample texts seen on the list page:
                #   东洪广厦 | 3室1厅 | 87.82平米 | 东南 | 精装 | 有电梯
                #   加国枫韵 | 车位 | 109平米 | 西南 | 无电梯            (parking space)
                #   浣花里100号 | 叠拼别墅 | 6室2厅 | 235.66平米 | 东 | 其他 | 有电梯  (villa)
                # With 7 or more fields the area sits one field later.
                content = li.find('div', class_='houseInfo').text.split("|")
                house_param['housing_estate'] = content[0]
                buildArea = re.sub(
                    r"[^\d\.]", "",
                    content[2] if len(content) < 7 else content[3])
                house_param['build_area'] = buildArea if re.match(
                    r"\d+\.*\d*", buildArea) else 0

                # ---- already stored? ----------------------------------
                # Sample get_house result:
                #   {'id': 106101546082, 'title': '...',
                #    'unit_price': Decimal('23521.00'),
                #    'total_price': Decimal('207.00'),
                #    'updateTime': datetime.datetime(2018, 12, 11, 16, 34, 9)}
                house = self.get_house(houseId)
                if house:
                    self.touch_house({'id': houseId})
                    if house['total_price'] != totalprice:
                        log.debug('price changed, ID: ' + houseId + ':' +
                                  house_param['title'])
                        param1 = {
                            'id': houseId,
                            'unit_price': unitprice,
                            'total_price': totalprice,
                            'build_area': house_param['build_area']
                        }
                        priceTrend = ('up' if unitprice > house['unit_price']
                                      else 'down')
                        # The log row keeps the OLD prices plus the deltas.
                        param2 = {
                            'id': houseId,
                            'unit_price': house['unit_price'],
                            'total_price': house['total_price'],
                            'unit_change': unitprice - house['unit_price'],
                            'total_change': totalprice - house['total_price'],
                            'pricetrend': priceTrend,
                            'thedate': house['updateTime'],
                            'pricetype': 'quote'
                        }
                        self.update_houseprice(param1, param2)
                    # Known listing: skip the detail-page download.
                    continue

                house_param['city'] = '成都'

                # ---- position, e.g. 水清沟 ----------------------------
                house_param['position'] = li.find(
                    'div', class_='positionInfo').find('a').text

                # ---- follow info --------------------------------------
                # e.g. "57人关注 / 共13次带看 / 6个月以前发布"
                # (followers / viewings / time since publication)
                follow = li.find('div', class_='followInfo').text.split("/")
                house_param['follow'] = re.sub(r"\D", "", follow[0])
                house_param['take_look'] = re.sub(r"\D", "", follow[1])

                # ---- detail page --------------------------------------
                detail_res = requests.get(houseUrl, headers=headers)
                detail = BeautifulSoup(detail_res.text, 'html.parser')
                baselis = detail.find('div', class_='base').find_all('li')

                if len(baselis) > 3:
                    matchRoom = patternRoom.match(baselis[0].text)
                    bedroom_num = decimal.Decimal(matchRoom.group(1))
                    # Bedroom counts of 6 and above are stored as 5.
                    house_param['bedroom_num'] = (
                        bedroom_num if bedroom_num < 6 else 5)
                    house_param['livingroom_num'] = matchRoom.group(2)
                    numBathroom = matchRoom.group('bath')
                    house_param['bathroom_num'] = (
                        1 if numBathroom == '' else numBathroom)
                else:
                    house_param['bedroom_num'] = 0
                    house_param['livingroom_num'] = 0
                    house_param['bathroom_num'] = 0

                if len(baselis) == 12:
                    insideArea = re.sub(r"[^\d\.]", "", baselis[4].text)
                    house_param['inside_area'] = insideArea if re.match(
                        r"\d+\.*\d*", insideArea) else 0
                    house_param['fitment'] = baselis[8].text.replace(
                        '装修情况', '')
                    useyear = re.sub(r"\D", "", baselis[11].text)
                    house_param['use_year'] = useyear if re.match(
                        r'\d+', useyear) else 0
                    house_param['structure'] = baselis[3].text.replace(
                        '户型结构', '')
                    house_param['build_structure'] = baselis[7].text.replace(
                        '建筑结构', '')
                    house_param['floor'] = baselis[1].text.replace(
                        '所在楼层', '')
                    # baselis[9] is the "elevators per households" ratio
                    # text; the part before 梯 gives the elevator count and
                    # the part after it the household count.
                    if baselis[10].text.replace('配备电梯', '') == '无':
                        house_param['elevator_num'] = 0
                    else:
                        house_param['elevator_num'] = self.getNumber(
                            re.sub(r'梯.+', '',
                                   baselis[9].text.replace('梯户比例', '')))
                    house_param['house_num'] = self.getNumber(
                        re.sub(r'.+梯', '', baselis[9].text.replace('户', '')))
                    house_param['direction'] = baselis[6].text.replace(
                        '房屋朝向', '')
                    house_param['build_type'] = baselis[5].text.replace(
                        '建筑类型', '')
                elif len(baselis) == 3:
                    # Parking space (车位)
                    insideArea = re.sub(r"[^\d\.]", "", baselis[1].text)
                    house_param['inside_area'] = insideArea if re.match(
                        r"\d+\.*\d*", insideArea) else 0
                    house_param['fitment'] = '车位'
                    house_param['use_year'] = 0
                    house_param['structure'] = ''
                    house_param['build_structure'] = ''
                    house_param['floor'] = baselis[0].text.replace(
                        '所在楼层', '')
                    house_param['elevator_num'] = 0
                    house_param['house_num'] = 1
                    house_param['direction'] = baselis[2].text.replace(
                        '房屋朝向', '')
                    house_param['build_type'] = '车位'
                else:
                    # Villa (别墅)
                    insideArea = re.sub(r"[^\d\.]", "", baselis[3].text)
                    house_param['inside_area'] = insideArea if re.match(
                        r"\d+\.*\d*", insideArea) else 0
                    house_param['fitment'] = baselis[6].text.replace(
                        '装修情况', '')
                    useyear = re.sub(r"\D", "", baselis[8].text)
                    house_param['use_year'] = useyear if re.match(
                        r'\d+', useyear) else 0
                    house_param['structure'] = baselis[7].text.replace(
                        '别墅类型', '')
                    house_param['build_structure'] = baselis[5].text.replace(
                        '建筑结构', '')
                    house_param['floor'] = baselis[1].text.replace(
                        '所在楼层', '')
                    house_param['elevator_num'] = 0
                    house_param['house_num'] = 1
                    house_param['direction'] = baselis[4].text.replace(
                        '房屋朝向', '')
                    house_param['build_type'] = '别墅'

                # ---- transaction attributes ---------------------------
                translis = detail.find('div', class_='transaction') \
                    .find_all('div')[1].ul.find_all('li')
                house_param['pub_date'] = translis[0].find_all('span')[1].text
                house_param['ownership'] = translis[1].find_all('span')[1].text
                lastdealtime = translis[2].find_all('span')[1].text
                house_param['lastdeal_date'] = lastdealtime if re.match(
                    r'\d{4}-\d{2}-\d{2}', lastdealtime) else '0000-00-00'
                house_param['use_type'] = translis[3].find_all('span')[1].text
                house_param['yearlimit'] = translis[4].find_all('span')[1].text

                tagsinfo = detail.find('div', class_='tags')
                house_param['tag'] = (
                    '' if tagsinfo is None
                    else tagsinfo.find_all('div')[1].text)

                buildyear = detail.find('div', class_='houseInfo').find(
                    'div', class_='area').find('div', class_='subInfo').text
                buildyear = re.sub(r"\D", "", buildyear)
                house_param['build_year'] = buildyear if re.match(
                    r'\d{4}', buildyear) else 0

                house_param['district'] = detail.find(
                    'div', class_='areaName').find('a').text

                # ---- per-room rows ------------------------------------
                roominfo = detail.find('div', attrs={'id': 'infoList'})
                roomlis = [] if roominfo is None else roominfo.find_all(
                    'div', class_='row')

                if house_param['inside_area'] == 0:
                    # No inside area on the page: fall back to the sum of
                    # the individual room areas.
                    insideArea = 0
                    for roomli in roomlis:
                        rlist = roomli.find_all('div')
                        # find_all never returns None, so test the length
                        # that the indexing below actually needs.
                        if len(rlist) < 2:
                            log.info('rlist is none,::: ' + houseUrl)
                            break
                        insideArea += decimal.Decimal(
                            re.sub(r"[^\d\.]", "", rlist[1].text))
                    house_param['inside_area'] = insideArea

                self.write_house(house_param)

                for roomli in roomlis:
                    rlist = roomli.find_all('div')
                    if len(rlist) < 4:
                        log.info('rlist is none,' + houseUrl)
                        break
                    self.write_room({
                        'id': house_param['id'],
                        'title': rlist[0].text,
                        'area': re.sub(r"[^\d\.]", "", rlist[1].text),
                        'direction': rlist[2].text,
                        'window': rlist[3].text,
                    })
            except Exception as e:
                # One bad listing must not stop the crawl: log and move on.
                log.error('###########{0} {1}\n{2}'.format(
                    house_param.get('title', ''), houseUrl, e))

        # Throttle: make every page take at least shortestWaitTime seconds.
        elapsed = time.time() - startTime
        if elapsed < shortestWaitTime:
            time.sleep(int(shortestWaitTime - elapsed))
        log.info(pageUrl + "总条数:" + str(cnt))