def get_comm_info(self, comm_list, all_page_url):
    """Parse 中安房 listing fragments and store one ``Comm`` per fragment.

    Each HTML fragment in ``comm_list`` is scanned with regular expressions;
    the extracted values are copied onto a new ``Comm('中安房')`` whose city
    is fixed to '合肥', and the record is saved with ``insert_db()``.

    Args:
        comm_list: iterable of HTML strings, one per listing.
        all_page_url: URL the fragments came from; only used in the error log.

    Returns:
        None. Any exception raised while handling a fragment is logged and
        that fragment is skipped.
    """
    flags = re.S | re.M

    def first_int(pattern, text, default=None):
        # A missing match (AttributeError) or missing text (TypeError) falls
        # back to the default instead of aborting the whole fragment.
        try:
            return int(re.search(pattern, text, flags).group(1))
        except (AttributeError, TypeError, ValueError):
            return default

    for fragment in comm_list:
        try:
            comm = Comm('中安房')
            comm.city = '合肥'
            comm.district_name = re.search(
                'zaf-nowrap.*?>(.*?)<', fragment, flags).group(1).strip()

            trade_date = re.search(
                'zaf-fblue">(.*?)<', fragment, flags).group(1).strip()
            if trade_date:
                comm.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")

            total_price = re.search(
                'list-right-data.*?<span.*?>(.*?)<', fragment, flags).group(1).strip()
            comm.total_price = int(
                re.search(r'(\d+)', total_price, flags).group(1)) * 10000

            # First <span>: space separated "area room-layout unit-price".
            info = re.search(
                'list-details-area.*?<span.*?>(.*?)<', fragment, flags).group(1).strip()
            parts = info.split(' ')
            area = parts[0].replace('㎡', '')
            if area:
                comm.area = round(float(area), 2)

            room_type = parts[1] if len(parts) > 1 else None
            comm.room = first_int(r'(\d)室', room_type, 0)
            comm.hall = first_int(r'(\d)厅', room_type)
            comm.toilet = first_int(r'(\d)卫', room_type)
            comm.avg_price = first_int(
                r'(\d+)', parts[2] if len(parts) > 2 else None)

            # Second <span>: space separated "direction fitment".
            info_2 = re.search(
                'list-details-area.*?<span.*?<span>(.*?)<', fragment, flags).group(1).strip()
            layout = info_2.split(' ')
            comm.direction = layout[0]
            comm.fitment = layout[1] if len(layout) > 1 else None

            info_3 = re.search(
                'list-details-address1.*?<span>(.*?)<', fragment, flags).group(1).strip()
            comm.region = info_3.split(' ')[0].strip()
            comm.insert_db()
        except Exception as e:
            log.error('解析错误,source={},url="{}",e="{}"'.format('中安房', all_page_url, e))
def new_deal_price(self):
    """Copy flagged documents from ``collection_new`` into ``Comm`` records.

    Every document whose ``fj_flag`` equals 1 is turned into a
    ``Comm(self.new_source)``: city/region/name come from the ``fj_*`` keys,
    unit price from ``CJDJ``, trade date from ``CJRQ``, area from ``JZMJ``,
    room number from ``SH``; the total price is area times unit price. Room
    and hall counts are extracted from ``FX`` through ``check_room``.

    The cursor is opened with ``no_cursor_timeout=True`` and is therefore
    closed explicitly when iteration ends or fails.
    """
    cursor = collection_new.find(no_cursor_timeout=True)
    try:
        for doc in cursor:
            print(collection_new.database.client.address[0])
            if doc.get('fj_flag') != 1:
                continue

            deal_price = Comm(self.new_source)
            deal_price.city = doc['fj_city']
            deal_price.region = doc['fj_region']
            deal_price.district_name = doc['fj_name']
            deal_price.avg_price = float(doc["CJDJ"])
            deal_price.trade_date = doc['CJRQ']
            deal_price.area = float(doc['JZMJ'])
            deal_price.room_num = doc['SH']
            deal_price.total_price = float(doc['JZMJ']) * float(doc["CJDJ"])

            # Read FX once with .get(): a document without the key must end
            # up in the handlers below, not raise KeyError from inside them.
            layout = doc.get('FX')
            try:
                room = re.search('(.)室', layout, re.S | re.M).group(1)
                deal_price.room = check_room(room)
            except Exception as e:
                print('找不到室,FX={}, e={}'.format(layout, e))
            try:
                hall = re.search('(.)厅', layout, re.S | re.M).group(1)
                deal_price.hall = check_room(hall)
            except Exception as e:
                print('找不到厅,FX={}, e={}'.format(layout, e))

            deal_price.insert_db()
    finally:
        cursor.close()
def city_info(self, index_url, city):
    """Crawl up to 100 result pages of 房天下 and store every listed deal.

    Page 1 is ``index_url`` itself; page ``n`` is ``index_url + 'i3<n>/'``.
    Crawling stops as soon as the city embedded in a page differs from
    ``city`` or the page reports '0' results.

    Args:
        index_url: URL of the first result page.
        city: expected city name; also stored on every record.

    Returns:
        None. A request failure is logged per page. A parse failure is
        logged together with the page HTML and ends processing of that page.
    """
    flags = re.S | re.M
    for page in range(1, 101):
        index_url_ = index_url if page == 1 else index_url + 'i3' + str(page) + '/'
        try:
            response = requests.get(index_url_, headers=self.headers)
            html = response.text
            try:
                city_real = re.search('city = "(.*?)"', html, flags).group(1)
                if city != city_real:
                    break
                house_num = re.search('class="org">(.*?)</b>', html, flags).group(1)
                if house_num == '0':
                    break

                for listing in re.findall('class="info rel floatr".*?</dd>', html, flags):
                    comm = Comm('房天下')
                    comm.city = city
                    comm.district_name = re.search(
                        '<a.*?>(.*?)<', listing, flags).group(1).strip()
                    # A replacement character in the title means the page was
                    # not decoded cleanly; such listings are skipped.
                    if '�' in comm.district_name:
                        log.error('网页出现繁体字, url={}'.format(index_url_))
                        continue
                    comm.direction = re.search(
                        'class="mt18">(.*?)<', listing, flags).group(1)
                    try:
                        comm.height = int(
                            re.search('共(.*?)层', listing, flags).group(1))
                    except Exception:
                        comm.height = None
                    comm.region = re.search(
                        'class="mt15">.*?<a.*?chengjiao.*?>(.*?)<', listing, flags).group(1)

                    total_price = re.search(
                        'class="price">(.*?)<', listing, flags).group(1)
                    if '*' in total_price:
                        continue
                    comm.total_price = int(total_price) * 10000

                    # Room, hall and area are embedded in the listing title.
                    comm.room = int(
                        re.search(r'(\d+)室', comm.district_name, flags).group(1))
                    comm.hall = int(
                        re.search(r'(\d+)厅', comm.district_name, flags).group(1))
                    try:
                        comm.area = float(
                            re.search(r'(\d+\.\d+)平米', comm.district_name, flags).group(1))
                    except Exception:
                        comm.area = None

                    trade_date = re.search(
                        'class="time".*?>(.*?)<', listing, flags).group(1)
                    comm.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")

                    try:
                        comm.avg_price = int(comm.total_price / comm.area)
                    except Exception:
                        # area is None (or zero) when it could not be parsed.
                        comm.avg_price = None
                    comm.insert_db()
            except Exception as e:
                log.error('解析错误,source="{}",html="{}",e="{}"'.format('房天下', html, e))
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format('房天下', index_url_, e))
def info_parse(self, ro_html, co_name, region, city_name):
    """Store one ``Comm`` per ``<li>`` under ``ul.pList.zu`` in ``ro_html``.

    Args:
        ro_html: parsed document exposing ``xpath``.
        co_name: value stored as ``district_name`` on every record.
        region: value stored as ``region`` on every record.
        city_name: value stored as ``city`` on every record.

    Returns:
        None. An item that cannot be parsed is logged and skipped.
    """
    for item in ro_html.xpath("//ul[@class='pList zu']/li"):
        try:
            ro = Comm(source)
            ro.city = city_name
            ro.district_name = co_name
            ro.region = region

            room_type = item.xpath(".//p[@class='sTit']/strong/text()")[0]
            try:
                ro.room = int(re.search(r'(\d)室', room_type, re.S | re.M).group(1))
            except Exception:
                ro.room = None
            try:
                ro.hall = int(re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
            except Exception:
                ro.hall = None

            total_price = item.xpath(".//div[@class='jiage']/strong/text()")[0]
            ro.total_price = int(
                re.search(r'(\d+)', total_price, re.S | re.M).group(1)) * 10000
            avg_price = item.xpath(".//div[@class='jiage']/p/text()")[0]
            ro.avg_price = int(re.search(r'(\d+)', avg_price, re.S | re.M).group(1))

            # The second <p> holds "...·<area>平米 · <direction>".
            info = item.xpath(".//div/p[2]/text()")[0]
            ro.area = round(float(re.search('·(.*?)平米', info).group(1)), 2)
            ro.direction = re.search('平米 · (.*)', info).group(1).strip()

            trade_date = item.xpath(".//div/p[3]/text()")[0].strip()
            if trade_date:
                ro.trade_date = datetime.datetime.strptime(
                    trade_date, "成交日期:%Y-%m-%d")
            ro.insert_db()
        except Exception as e:
            log.error("房间信息提取错误{}".format(e))
def room(self, co_list, city_name):
    """Follow each community node to its deal pages and store every deal.

    For every node in ``co_list`` the community page is fetched, the last
    toolbar link is taken as the deal-list URL, and the page count is read
    from the '共N页' marker. Each deal page is then parsed into ``Comm``
    records.

    Args:
        co_list: nodes exposing ``xpath``, one per community.
        city_name: value stored as ``city`` on every record.

    Returns:
        None. A community whose header cannot be read or fetched is skipped;
        a community without a page marker is logged and skipped; a deal that
        cannot be parsed is logged and skipped.
    """
    for co in co_list:
        try:
            co_name = co.xpath("./div[1]/a/text()")[0]
            co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
            region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
            # Not stored, but kept: a node without this attribute raises here
            # and the community is skipped, as before.
            addr = co.xpath("./div[3]/span[3]/@title")[0]
            detail = requests.get(co_url, headers=self.headers)
            html = etree.HTML(detail.text)
            room_url = "http:" + html.xpath("//div[@class='tab-toolbar pr']//li/a/@href")[-1]
            page_index = requests.get(room_url, headers=self.headers)
        except Exception:
            # Exception, not a bare except, so Ctrl-C still stops the crawl.
            continue

        page_match = re.search(r'共(\d+)页', page_index.text)
        if not page_match:
            log.info('小区无相关数据')
            continue

        page_prefix = re.sub('#.*', 'n', room_url)
        for page in range(1, int(page_match.group(1)) + 1):
            url = page_prefix + str(page)
            # Retried until the request succeeds; there is no attempt limit.
            while True:
                try:
                    res = requests.get(url, headers=self.headers)
                    break
                except Exception:
                    continue

            room_html = etree.HTML(res.text)
            for node in room_html.xpath("//div[@class='right-information']"):
                try:
                    room = Comm(source)
                    room.district_name = co_name
                    room.city = city_name
                    room.region = region

                    room_type = node.xpath("./h3/span[2]/text()")[0]
                    try:
                        room.room = int(
                            re.search(r'(\d)室', room_type, re.S | re.M).group(1))
                    except Exception:
                        room.room = None
                    try:
                        room.hall = int(
                            re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
                    except Exception:
                        room.hall = None

                    area = node.xpath("./h3/span[3]/text()")[0].replace('平米', '')
                    if area:
                        room.area = round(float(area), 2)

                    total_price = node.xpath(".//div[@class='price fs14 ']/em/text()")[0]
                    room.total_price = int(
                        re.search(r'(\d+)', total_price, re.S | re.M).group(1)) * 10000
                    avg_price = node.xpath(".//div[@class='size fs14']/text()")[0]
                    room.avg_price = int(
                        re.search(r'(\d+)', avg_price, re.S | re.M).group(1))

                    try:
                        room.fitment = node.xpath(".//div[@class='t1 fs14']/text()[3]")[0]
                        room.direction = node.xpath(".//div[@class='t1 fs14']/text()[2]")[0]
                    except Exception:
                        room.fitment = None
                        room.direction = None

                    # floor_info is split on '/': floor before, height after.
                    floor_info = node.xpath(".//div[@class='fs14']/text()[1]")[0]
                    try:
                        floor = re.search('(.*?)/', floor_info).group(1)
                        room.floor = int(re.search(r'\d+', floor).group(0))
                    except Exception:
                        room.floor = None
                    try:
                        room.height = int(
                            re.search(r'.*?/(\d+)层', floor_info).group(1))
                    except Exception:
                        room.height = None

                    trade_date = node.xpath(".//div[@class='date']/text()")[0]
                    if trade_date:
                        room.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")
                    room.insert_db()
                except Exception as e:
                    log.error('房屋信息提取失败{}'.format(e))
def start_crawler(self):
    """Crawl the 太屋网 building list and store every exchange record.

    For pages ``1 .. self.get_all_page() - 1`` the building list is fetched,
    each ``<li>`` is reduced to a region name and a building id, and the
    exchange records of that building are requested with a multipart POST
    (pageIndex 1, pageSize 5000). Every returned record becomes a ``Comm``.

    The list page is retried until it answers with status 200. The detail
    POST is retried a bounded number of times; if no 200 response with a
    usable JSON body is obtained, the building is skipped.
    """
    max_post_attempts = 5
    page = self.get_all_page()
    for i in range(1, page):
        url = 'http://www.taiwu.com/building/cp' + str(i) + '/'
        while True:
            try:
                res = requests.get(url)
                if res.status_code == 200:
                    break
            except Exception as e:
                print('请求出错', e)

        all_info = re.search('<ul class="fang-list">.*?</ul>',
                             res.content.decode(), re.S | re.M).group(0)
        for k in re.findall('<li>.*?</li>', all_info, re.S | re.M):
            source = '太屋网'
            city = '上海'
            area = re.search(
                '<div class="adds">.*?<a href="/building/.*?/">(.*?)</a>',
                k, re.S | re.M).group(1)  # region name
            building_id = re.search(
                '<a href="/building/(.*?)/', k, re.S | re.M).group(1)
            detail_url = "http://www.taiwu.com/Building/GetHouseExchange/"
            payload = (
                "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\n"
                "Content-Disposition: form-data; name=\"buildingId\"\r\n\r\n"
                + building_id +
                "\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\n"
                "Content-Disposition: form-data; name=\"pageIndex\"\r\n\r\n1\r\n"
                "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\n"
                "Content-Disposition: form-data; name=\"pageSize\"\r\n\r\n5000\r\n"
                "------WebKitFormBoundary7MA4YWxkTrZu0gW--"
            )
            headers = {
                'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
                'Cache-Control': "no-cache",
            }

            # Check the POST response itself. The previous code tested the
            # list-page response, which is always 200 at this point, so the
            # detail status was never looked at. The attempts are bounded so
            # a persistent non-200 answer cannot stall the crawl.
            response = None
            for _ in range(max_post_attempts):
                try:
                    response = requests.request(
                        "POST", detail_url, data=payload, headers=headers)
                    if response.status_code == 200:
                        break
                except Exception as e:
                    print('请求出错', e)

            try:
                result_json = response.json()
                data_list = result_json['data']
            except Exception as e:
                # Also reached when every attempt failed (response is None).
                print(e)
                continue

            for j in data_list:
                c = Comm(source)
                c.city = city
                c.region = area
                c.room = j['RoomCount']
                c.hall = j['HollCount']
                c.district_name = j['BuildingName']
                c.area = j['BldArea']
                # The first digit run of ExDate is divided by 1000 before
                # being handed to time.localtime.
                trade_date_ = int(re.search(r'(\d+)', j['ExDate']).group(1))
                if trade_date_:
                    t = time.localtime(trade_date_ // 1000)
                    c.trade_date = datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday)
                c.total_price = j['ExPrice']
                c.insert_db()
def crawler(self, city_url, city):
    """Crawl every community of a city and store each row of its deal table.

    The page count is the first number in the second ``a.down_page`` link of
    ``city_url``. Each list page ``city_url + '/PG<n>'`` yields community
    links; each community page is fetched (retried until it succeeds, then
    followed by a 2 second pause) and every ``<tr>`` of its table becomes a
    ``Comm`` record.

    Args:
        city_url: URL of the city's first list page.
        city: prefix prepended to each community href.

    Returns:
        None. A community that cannot be parsed is logged and skipped.
        Failures while reading the list pages propagate to the caller.
    """
    res = requests.get(city_url, headers=self.headers)
    con = etree.HTML(res.text)
    last_page = con.xpath("//a[@class='down_page']/@href")[1]
    page_num = re.search(r'\d+', last_page).group(0)

    for i in range(1, int(page_num) + 1):
        page_url = city_url + "/PG" + str(i)
        page_res = requests.get(page_url, headers=self.headers)
        page_con = etree.HTML(page_res.text)
        for temp_url in page_con.xpath("//h1/a/@href"):
            # Built before the try so the error handler can always name it.
            comm_url = city + temp_url
            try:
                # Retried without limit; Exception (not a bare except) keeps
                # the loop interruptible with Ctrl-C.
                while True:
                    try:
                        co_res = requests.get(
                            comm_url, headers=self.headers, timeout=10)
                        break
                    except Exception:
                        continue
                time.sleep(2)

                co_con = etree.HTML(co_res.text)
                city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                region = co_con.xpath("//section/p/a/text()")[-1]
                district_name = co_con.xpath("//cite/span/text()")[0]

                for tag in co_con.xpath("//table/tbody/tr"):
                    # One record per row, so a row with an empty date cannot
                    # inherit trade_date from the previous row.
                    com = Comm(source)
                    com.city = city_name
                    com.region = region
                    com.district_name = district_name

                    area = tag.xpath("./td[2]/text()")[0].replace('㎡', '')
                    com.area = round(float(area), 2)
                    avg_price = tag.xpath("./td[3]/text()")[0]
                    com.avg_price = int(
                        re.search(r'(\d+)', avg_price, re.S | re.M).group(1))
                    total_price = tag.xpath("./td/span/text()")[0]
                    com.total_price = int(
                        re.search(r'(\d+)', total_price, re.S | re.M).group(1)) * 10000

                    trade_date = tag.xpath("./td/text()")[-2]
                    if trade_date:
                        com.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")

                    room_type = tag.xpath("./td//p/a/text()")[0]
                    try:
                        com.room = int(
                            re.search(r'(\d)室', room_type, re.S | re.M).group(1))
                    except Exception:
                        com.room = None
                    try:
                        com.hall = int(
                            re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
                    except Exception:
                        com.hall = None

                    floor = tag.xpath("./td//p/span/text()")[0]
                    com.floor = int(re.search(r'(\d+)层', floor).group(1))
                    # Greedy group: the former lazy '(.*?)' at the end of the
                    # pattern always matched the empty string.
                    com.direction = re.search(r'层 (.*)', floor).group(1).strip()
                    com.insert_db()
            except Exception:
                log.error("{}小区信息提取错误".format(comm_url))
def get_page_url(self, page_url, city, area_):
    """Parse one Q房网 result page and store a ``Comm`` per listing.

    Args:
        page_url: URL of the result page; fetched through ``self.proxy``.
        city: city name, stored stripped on every record.
        area_: region name, stored stripped on every record.

    Returns:
        None. A listing that cannot be parsed is logged with its HTML and
        skipped. A failing HTTP request propagates to the caller.
    """
    flags = re.S | re.M
    response = requests.get(page_url, headers=self.headers, proxies=self.proxy)
    html = response.text
    for listing in re.findall('<li class=" clearfix">.*?</li>', html, flags):
        try:
            comm = Comm('Q房网')
            comm.city = city.strip()
            comm.region = area_.strip()
            comm.district_name = re.search(
                'house-title">.*?<a.*?>(.*?)<', listing, flags).group(1).strip()
            comm.direction = re.search(
                'class="house-about clearfix".*?showKeyword">(.*?)<',
                listing, flags).group(1).strip()
            try:
                comm.height = int(
                    re.search(
                        'class="house-about clearfix".*?showKeyword">.*?<span.*?<span>.*?/(.*?)<',
                        listing, flags).group(1).strip())
            except Exception:
                comm.height = None

            total_price = re.search(
                'class="show-price".*?span.*?>(.*?)<', listing, flags).group(1).strip()
            comm.total_price = int(total_price) * 10000
            avg_price = re.search(
                'class="show-price".*?<p.*?>(.*?)<', listing, flags).group(1).strip()
            comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

            trade_date = re.search(
                'class="show-price concluded".*?span.*?>(.*?)<',
                listing, flags).group(1).strip()
            if trade_date:
                comm.trade_date = datetime.datetime.strptime(trade_date, "%Y.%m.%d")

            # The title link text is space separated: name, layout, area.
            room_type = re.search(
                'house-title">.*?<a.*?>.*? (.*?) ', listing, flags).group(1).strip()
            try:
                comm.room = int(re.search(r'(\d)室', room_type, flags).group(1))
            except Exception:
                comm.room = None
            try:
                comm.hall = int(re.search(r'(\d)厅', room_type, flags).group(1))
            except Exception:
                comm.hall = None

            area = re.search(
                'house-title">.*?<a.*?>.*? .*? (.*?平米)', listing, flags).group(1).strip()
            area = area.replace('㎡', '').replace('平米', '')
            if area:
                comm.area = round(float(area), 2)
            comm.insert_db()
        except Exception as e:
            log.error('解析错误,source="{}",html="{}",e="{}"'.format(
                'Q房网', listing, e))
def get_comm_detail(self, comm_url, region, city):
    """Fetch a 购房网 community page and store every row of its deal list.

    Each ``<li>`` inside ``ul.lscjlist`` is a sequence of ``<span>`` cells
    read positionally: date, layout, area, floor/height, fitment, direction,
    unit price, total price.

    Args:
        comm_url: URL of the community page; also stored on each record.
        region: region name, stored stripped on each record.
        city: city name stored on each record.

    Returns:
        None. A failing request is logged and ends the call. Parse failures
        are not caught here and propagate to the caller.
    """
    flags = re.S | re.M
    region = region.strip()

    def nth_span(skip, capture='<span>(.*?)</span>'):
        # Pattern that steps over `skip` opening <span> tags and then
        # applies `capture` to the following cell.
        return '<span>.*?' * skip + capture

    try:
        response = requests.get(url=comm_url, headers=self.headers, proxies=next(p))
    except Exception as e:
        log.error('请求错误,source="{}",url="{}",e="{}"'.format(
            '购房网', comm_url, e))
        return

    html = response.text
    district_name = re.search(
        'title fl.*?<h1>(.*?)</h1>', html, flags).group(1).strip()
    comm_info_html = re.search(
        '<ul class="lscjlist">.*?</ul>', html, flags).group()
    comm_info_list = re.findall('<li>(.*?)</li>', comm_info_html, flags)
    if not comm_info_list:
        log.info('source={}, 此小区没有数据,url="{}"'.format('购房网', comm_url))

    for row in comm_info_list:
        # A fresh record per row: reusing one object let trade_date/area of
        # an earlier row leak into a later row that lacks them.
        comm = Comm('购房网')
        comm.url = comm_url
        comm.region = region
        comm.city = city
        comm.district_name = district_name

        trade_date = re.search(nth_span(0), row, flags).group(1).strip()
        if trade_date:
            comm.trade_date = datetime.datetime.strptime(trade_date, "%Y-%m-%d")

        room_type = re.search(nth_span(1), row, flags).group(1).strip()
        try:
            comm.room = int(re.search(r'(\d)室', room_type, flags).group(1))
            comm.hall = int(re.search(r'(\d)厅', room_type, flags).group(1))
        except Exception:
            # Room and hall are cleared together when either is missing.
            comm.room = None
            comm.hall = None

        area = re.search(nth_span(2), row, flags).group(1).strip()
        area = area.replace('㎡', '').replace('平', '')
        if area:
            comm.area = round(float(area), 2)

        try:
            height = re.search(
                nth_span(4, '/(.*?)</span>'), row, flags).group(1).strip()
            comm.height = int(re.search(r'(\d+)', height).group(1))
        except Exception:
            comm.height = None

        comm.fitment = re.search(nth_span(4), row, flags).group(1).strip()
        comm.direction = re.search(nth_span(5), row, flags).group(1).strip()

        avg_price = re.search(nth_span(6), row, flags).group(1)
        comm.avg_price = int(re.search(r'(\d+)', avg_price, flags).group(1))
        total_price = re.search(
            nth_span(7, '<span.*?>(.*?)</span>'), row, flags).group(1)
        comm.total_price = int(
            re.search(r'(\d+)', total_price, flags).group(1)) * 10000
        comm.insert_db()
def comm_detail(self, comm_url_list, city):
    """Fetch deal records for each community and store them as ``Comm``.

    The first entry of ``comm_url_list`` is skipped. For every other entry
    the community page (``city`` with '/xiaoqu/' replaced by the entry) and
    the GetDealRecord endpoint (keyed by the upper-cased text after 'xq-')
    are requested; each item of the JSON ``result`` list becomes one record.

    Args:
        comm_url_list: community URL fragments containing 'xq-<code>'.
        city: URL containing '/xiaoqu/', used as the template for the
            community page URL.

    Returns:
        None. Failures are logged per community and per record.
    """
    for comm_url in comm_url_list[1:]:
        try:
            com_url = city.replace('/xiaoqu/', comm_url)
            code = re.search('xq-(.*)', comm_url).group(1).upper()
            comm_detail_url = (
                'http://sh.centanet.com/apipost/GetDealRecord?estateCode='
                + code + '&posttype=S&pageindex=1&pagesize=10000')
            com_res = requests.get(com_url, headers=self.headers)
            res = requests.get(comm_detail_url, headers=self.headers)
            time.sleep(2)

            html = etree.HTML(com_res.text)
            data_dict = json.loads(res.text)
            district_name = html.xpath("//div/h3/text()")[0]
            city_name = html.xpath("//div[@class='idx-city']/text()")[0]
            region = html.xpath("//a[@class='f000']/text()")[0]

            for data in data_dict["result"]:
                try:
                    co = Comm(source)
                    co.district_name = district_name.strip()
                    co.region = region
                    co.city = city_name
                    try:
                        room_type = data["houseType"]
                        co.room = int(
                            re.search(r'(\d)室', room_type, re.S | re.M).group(1))
                        co.hall = int(
                            re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
                    except Exception as e:
                        # The placeholder was missing, so the exception text
                        # never reached the log.
                        log.error('roomtype为空{}'.format(e))

                    area = data['areaSize'].replace('平', '')
                    if area:
                        co.area = round(float(area), 2)
                    co.direction = data['direction']

                    # dealTime has a two-digit year, so '20' is prepended.
                    # Test the raw value: the prefixed string is never empty.
                    deal_time = data['dealTime']
                    if deal_time:
                        co.trade_date = datetime.datetime.strptime(
                            '20' + deal_time, "%Y-%m-%d")

                    total_price = data['dealPrice']
                    co.total_price = int(
                        re.search(r'(\d+)', total_price, re.S | re.M).group(1)) * 10000
                    avg_price = data['unitPrice']
                    try:
                        co.avg_price = int(
                            re.search(r'(\d+)', avg_price, re.S | re.M).group(1))
                    except Exception:
                        co.avg_price = None
                    co.insert_db()
                except Exception as e:
                    log.error('解析失败{}'.format(e))
        except Exception as e:
            log.error("小区成交信息错误{}".format(e))
def get_city_info(self, city_dict):
    """Crawl 链家在线 deal listings for every city and region.

    For each ``city -> base URL`` pair the '<base>chengjiao/' page is
    fetched and the region links are read from its 'ershoufang' block. For
    every region, pages 'pg1' .. 'pg100' are requested and each listing is
    stored as a ``Comm('链家在线')``.

    Args:
        city_dict: mapping of city name to the city's base URL.

    Returns:
        None. A failure on a region page is logged with that page's content
        and the next page is tried; a failure on the city page is logged and
        the next city is processed.
    """
    flags = re.S | re.M
    for city, base_url in city_dict.items():
        city_url = base_url + 'chengjiao/'
        try:
            response = requests.get(city_url, headers=self.headers)
            html = response.text
            area_html = re.search(
                'data-role="ershoufang".*?地铁', html, flags).group()
            for area_i in re.findall('<a.*?</a>', area_html, flags):
                if 'ershoufang' in area_i:
                    continue
                area_url = re.search('href="(.*?)"', area_i, flags).group(1)
                area = re.search('<a.*?>(.*?)<', area_i, flags).group(1)

                for page in range(1, 101):
                    city_url_ = city_url.replace(
                        '/chengjiao/', '') + area_url + 'pg' + str(page)
                    # Bound before the try so the handler can log it even
                    # when the request itself fails.
                    content = ''
                    try:
                        result = requests.get(city_url_, headers=self.headers)
                        content = result.text
                        listings = re.findall(
                            'class="info".*?</div></div></li>', content, flags)
                        for listing in listings:
                            comm = Comm('链家在线')
                            comm.region = area.strip()
                            comm.city = city.strip()
                            comm.district_name = re.search(
                                'target="_blank">(.*?)<', listing, flags).group(1).strip()
                            comm.direction = re.search(
                                r'class="houseIcon"></span>(.*?) \|',
                                listing, flags).group(1).strip()
                            try:
                                comm.fitment = re.search(
                                    r'class="houseIcon"></span>.*? \|(.*?)\| ',
                                    listing, flags).group(1).strip()
                            except Exception:
                                comm.fitment = None
                            try:
                                height = re.search(
                                    r'class="positionIcon"></span>.*?\((.*?)\)',
                                    listing, flags).group(1).strip()
                                comm.height = int(
                                    re.search(r'(\d+)', height, flags).group(1))
                            except Exception:
                                comm.height = None

                            total_price = re.search(
                                "class='number'>(.*?)<", listing, flags).group(1).strip()
                            if "*" in total_price:
                                continue
                            comm.total_price = int(
                                re.search(r'(\d+)', total_price, flags).group(1)) * 10000

                            room_type = re.search(
                                'arget="_blank">.*? (.*?) ', listing, flags).group(1).strip()
                            try:
                                comm.room = int(
                                    re.search(r'(\d)室', room_type, flags).group(1))
                            except Exception:
                                comm.room = 0
                            try:
                                comm.hall = int(
                                    re.search(r'(\d)厅', room_type, flags).group(1))
                            except Exception:
                                comm.hall = None

                            # The captured text always ends in '平米', so it
                            # is never empty here.
                            area_ = re.search(
                                'target="_blank">.*? .*? (.*?平米)',
                                listing, flags).group(1).strip()
                            area_ = area_.replace('㎡', '').replace('平米', '')
                            try:
                                comm.area = round(float(area_), 2)
                            except Exception:
                                comm.area = None

                            trade_date = re.search(
                                'dealDate">(.*?)<', listing, flags).group(1).strip()
                            if trade_date:
                                comm.trade_date = datetime.datetime.strptime(
                                    trade_date, "%Y.%m.%d")

                            # Use the parsed record values. The old code
                            # indexed the HTML string, which always raised
                            # and left avg_price as None.
                            try:
                                comm.avg_price = int(comm.total_price / comm.area)
                            except Exception:
                                comm.avg_price = None
                            comm.insert_db()
                    except Exception as e:
                        log.error(
                            '解析错误,source="{}",html="{}",e="{}"'.format(
                                '链家在线', content, e))
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                '链家在线', city_url, e))