def get_comm_info(self, comm_url_list):
    """Scrape tmsf.com community pages via ProducerListUrl regex rules.

    Each entry of comm_url_list is a "siteid,propertyid" pair; both parts
    are spliced into the property detail URL.  The Comm attributes assigned
    below hold regex PATTERNS (not values) — ProducerListUrl evaluates
    comm.to_dict() against the fetched page and persists the results.
    """
    for i in comm_url_list:
        try:
            code = i.split(',')
            comm_url = 'http://www.tmsf.com/newhouse/property_' + code[0] + '_' + code[1] + '_info.htm'
            comm = Comm(co_index)
            comm.co_name = 'buidname.*?>(.*?)<'
            comm.co_address = '--位置行--.*?<span.*?title="(.*?)"'
            comm.co_build_type = '建筑形式:<.*?>(.*?)<'
            comm.co_develops = '项目公司:<.*?>(.*?)<'
            comm.co_volumetric = '容 积 率:</span>(.*?)<'
            comm.co_green = '绿 化 率:</span>(.*?)<'
            comm.co_size = '占地面积:</span>(.*?)<'
            comm.co_build_size = '总建筑面积:</span>(.*?)<'
            comm.co_all_house = '总户数:</span>(.*?)<'
            comm.co_id = 'info" href="/newhouse/property_(.*?)_info'
            p = ProducerListUrl(page_url=comm_url,
                                request_type='get',
                                encode='utf-8',
                                analyzer_rules_dict=comm.to_dict(),
                                current_url_rule='一房一价<.*?href="(.*?)"',
                                analyzer_type='regex',
                                headers=self.headers)
            # get_details() persists the community and returns the per-building
            # price-list ("一房一价") URLs for the next crawl stage.
            build_all_url = p.get_details()
            global count
            count += 1
            print('comm:', count)
            self.get_build_info(build_all_url)
        except Exception as e:
            print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_list):
    """Scrape one ytfcjy.com project page per id, persist it, then buildings.

    comm_list holds project path fragments.  Each field uses
    ``re.findall(...)[0]``, so any missing field raises IndexError and the
    whole community is skipped by the broad except.
    """
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.ytfcjy.com/public/project/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
            comm.co_id = re.findall('ProjectInfo.aspx\?code=(.*?)&', html, re.S | re.M)[0]
            comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
            comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
            comm.co_all_house = re.findall('YSZTS">(.*?)<', html, re.S | re.M)[0]
            # The ",," skips two empty comma-separated fields inside the
            # hidden-input value attribute.
            comm.co_plan_pro = re.findall('id="ghxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_work_pro = re.findall('id="sgxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_land_use = re.findall('id="tdzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.insert_db()
            global count
            count += 1
            print(count)
            build_url_list = re.findall('id="buildInfo" value="(.*?)"', html, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print(e)
def start_crawler(self):
    """Entry point for fxfdcw.com: POST the search form, scrape each community.

    Pages are GBK-encoded.  Building links are harvested from the
    ``window.open(...)`` handlers and forwarded to self.bu_info; on any
    scrape failure the community (and its buildings) is skipped.
    """
    data = {
        # Opaque form value captured from the live site's search form.
        "Submit": "(unable to decode value)"
    }
    res = requests.post(self.start_url, data=data, headers=self.headers)
    html = etree.HTML(res.content.decode('gbk'))
    comm_url_list = html.xpath("//tr//span[@style='width:270px; color:#006']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.fxfdcw.com/' + comm_url
            com_res = requests.get(url, headers=self.headers)
            con = com_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('xmid=(\d+)', comm_url).group(1)
            co.co_name = re.search('项目名称.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('开发企业:(.*?)  ', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_build_size = re.search('建筑面积.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('总套数.*?">(.*?)</', con, re.S | re.M).group(1)
            co.insert_db()
            bu_list = re.findall("window.open\('(.*?)'\)", con, re.S | re.M)
        except Exception as e:
            # log.error("小区信息错误{}".format(e))
            print("小区信息错误{}".format(e))
            continue
        self.bu_info(bu_list, co.co_id)
def comm_info(self, comm_url_list):  # community (小区) info
    """Scrape Shenzhen (ris.szpl.gov.cn) community pages, then batch buildings.

    NOTE(review): a single Comm instance is reused across iterations, so a
    failed iteration can leave stale field values behind; the bare except
    silently drops failures.  Building links from every community are
    accumulated and processed in one final batch.
    """
    co = Comm(co_index)
    build_url_list = []
    for comm_url in comm_url_list:
        co.co_id = re.search('id=(\d+)', comm_url).group(1)
        # comm_url is relative ("./..."); strip the leading dot.
        detail_url = "http://ris.szpl.gov.cn/bol/" + comm_url.lstrip(".")
        url = "http://ris.szpl.gov.cn/bolprojectdetail.aspx?id=" + str(
            co.co_id)
        try:
            res = requests.get(detail_url, headers=self.headers)
            con = res.text
            co.co_pre_sale = re.search('许可证号.*?">(.*?)&', con).group(1)
            co.co_name = re.search('项目名称.*?">(.*?)&', con).group(1)
            co.co_address = re.search('所在位置.*?">(.*?)&', con).group(1)
            co.co_develops = re.search('发展商.*?">(.*?)&', con).group(1)
            # One combined pattern: group 1 = residential size, group 2 = unit count.
            co_type = re.search('住宅.*?面积.*?">(.*?)平方米.*?套数.*?">(.*?)&', con)
            co.co_build_size = co_type.group(1)
            co.co_all_house = co_type.group(2)
            co.insert_db()
            response = requests.get(url, headers=self.headers)
            content = etree.HTML(response.text)
            build_url = content.xpath("//td/a/@href")
            build_url_list.extend(build_url)
        except:
            continue
    self.build_info(build_url_list)
def comm_info(self, url):
    """Scrape one community detail page (GBK) and persist it.

    Fetches ``start_url/url``, extracts the community fields with regexes,
    inserts the record, then forwards the presell-table rows to
    self.build_info.  The permit/size fields are optional on some pages
    and fall back to None as a group.
    """
    comm_url = self.start_url + "/" + url
    res = requests.get(comm_url, headers=self.headers)
    res.encoding = 'gbk'
    con = res.text
    co = Comm(co_index)
    co.co_id = re.search('kfsid=(\d+)', url).group(1)
    co.co_name = re.search('itemname.*?">(.*?)</font', con).group(1)
    co.co_develops = re.search('开发商名称:.*?px;">(.*?)</a', con, re.S | re.M).group(1)
    co.co_all_house = re.search('总套数:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_all_size = re.search('总面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_residential_size = re.search('>住宅面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_address = re.search('项目座落.*?;">(.*?)</', con, re.S | re.M).group(1)
    co.area = re.search('所在地区.*?">(.*?)</td', con, re.S | re.M).group(1)
    try:
        co.co_build_size = re.search('建筑面积.*?">(.*?) ', con, re.S | re.M).group(1)
        co.co_plan_project = re.search('建设工程规划许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_land_use = re.search('土地证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_work_pro = re.search('建筑工程施工许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_use = re.search('用途.*?">(.*?)<br', con, re.S | re.M).group(1)
    except:
        # Optional fields missing from this page: null out the whole group.
        co.co_build_size = None
        co.co_plan_project = None
        co.co_land_use = None
        co.co_work_pro = None
        # BUG FIX: was "co.co_us = None" (typo), which left co_use unset
        # (or stale) whenever the optional block failed.
        co.co_use = None
    co.insert_db()
    co_html = etree.HTML(con)
    bu_list = co_html.xpath("//table[@id='preselltable1']/tr[@bgcolor='white']")
    self.build_info(bu_list, co.co_id)
def comm_info(self, comm_url_list):
    """Scrape community pages from 222.77.178.63:7002, then their presales.

    Pages are GBK-encoded.  On any scrape failure the community is skipped
    entirely (BUG FIX: the original fell through after the except and then
    referenced ``co.co_id`` / ``project_name``, which may not exist yet,
    raising NameError/AttributeError instead of moving on).
    """
    for comm_url in comm_url_list:
        try:
            co_url = 'http://222.77.178.63:7002/' + comm_url
            co_res = requests.get(co_url, headers=self.headers)
            con = co_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('projectID=(.*)', comm_url).group(1)
            co.co_name = re.search('项目名称:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.area = re.search('所在区县:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('>总套数.*?">(\d+)<', con, re.S | re.M).group(1)
            co.co_all_size = re.search('>总面积.*?">(.*?)<', con, re.S | re.M).group(1)
            # The project name goes into the presale query string, so quote it.
            project_name = parse.quote(co.co_name)
            co.insert_db()
        except Exception as e:
            # log.error('小区信息错误{}'.format(e))
            print('小区信息错误{}'.format(e))
            continue  # BUG FIX: skip the presale fetch when scraping failed
        sale_url = "http://222.77.178.63:7002/Presell.asp?projectID=" + co.co_id + "&projectname=" + project_name
        res = requests.get(sale_url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        temp_url_list = html.xpath("//a/@href")
        self.build_info(co.co_id, temp_url_list)
def get_comm_info(self, all_url_list):
    """Declare regex extraction rules and let ProducerListUrl run the scrape.

    The Comm attributes below are regex PATTERNS; ProducerListUrl applies
    c.to_dict() to each page in all_url_list and persists the results.
    """
    try:
        c = Comm(co_index)
        c.co_name = "class='newtopleft font-k'>(.*?)</li>"
        c.co_id = 'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
        c.co_address = "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
        c.area = "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_develops = "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
        c.co_volumetric = "容积率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_green = "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_all_house = "总户数:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_open_time = "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_land_use = "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_plan_pro = "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_build_size = "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
        data_list = c.to_dict()
        p = ProducerListUrl(page_url=all_url_list,
                            request_type='get',
                            encode='utf-8',
                            analyzer_rules_dict=data_list,
                            analyzer_type='regex',
                            headers=self.headers)
        p.get_details()
        global count
        count += 1
        print(count)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
def start_crawler(self):
    """Page through the xmtfj.gov.cn Getzslp JSON endpoint and persist projects.

    POSTs page by page (20 records each).  BUG FIXES: the error format
    string had only two placeholders for three arguments, silently
    dropping the form data; the inner loop shadowed the page counter
    ``i``; and paging now stops at the first empty 'bodylist' instead of
    blindly probing all 10000 pages.
    """
    post_url = "http://fdc.xmtfj.gov.cn:8001/home/Getzslp"
    for page in range(1, 10000):
        formdata = {
            "currentpage": page,
            "pagesize": 20,
        }
        try:
            res = requests.post(post_url, data=formdata, headers=self.headers)
            con = json.loads(res.text)
            # 'Body' is itself a JSON string; decode it a second time.
            body = con['Body']
            info_dict = json.loads(body)['bodylist']
            if not info_dict:
                break  # past the last page of results
            for record in info_dict:
                comm = Comm(co_index)
                comm.co_name = record['XMMC']
                comm.co_id = record['TRANSACTION_ID']
                comm.co_address = record['XMDZ']
                comm.co_pre_sale = record['YSXKZH']
                comm.co_all_house = record['PZTS']
                comm.co_build_size = record['PZMJ']
                comm.co_area = record['XMDQ']
                comm.co_pre_date = record['GETDATE']
                comm.insert_db()
        except Exception as e:
            print(
                '小区错误,co_index={},url={},data={}'.format(
                    co_index, post_url, formdata), e)
def get_comm_info(self, comm_id_list):
    """Scrape xxfdc.gov.cn project pages by id, then each building-table row.

    NOTE(review): the inner loop reuses variable ``i`` (shadowing the
    community id), and ``house_url`` is searched in ``bu_html`` (the whole
    table) rather than the current row ``i`` — every building appears to
    get the table's first link; confirm against the live page layout.
    """
    for i in comm_id_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_id = i
            comm.insert_db()
            bu_html = re.search(
                '<table class="table table-bordered itemInfoDetail.*?</table>',
                html, re.S | re.M).group()
            # Skip the first <tr>: it is the header row.
            build_info_list = re.findall('<tr>.*?</tr>', bu_html, re.S | re.M)[1:]
            for i in build_info_list:
                try:
                    build = Building(co_index)
                    build.bu_num = re.search('<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_all_house = re.search(
                        '<td>.*?<td>.*?<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_id = re.search('buildId=(.*?)&', i, re.S | re.M).group(1)
                    build.co_id = comm.co_id
                    build.insert_db()
                    house_url = re.search('<a href="(.*?)"', bu_html, re.S | re.M).group(1)
                    response = requests.get(house_url, headers=self.headers)
                    html = response.text
                    house_url_list = re.findall(
                        '<td width="110">.*?<a.*?href="(.*?)"', html, re.S | re.M)
                    self.get_house_info(house_url_list, build.bu_id, comm.co_id)
                except Exception as e:
                    print(
                        '楼栋错误,co_index={},url={}'.format(
                            co_index, house_url), e)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def comm_info(self, comm_url_list):
    """Scrape Nanjing (njhouse.com.cn) community pages via Proxy_contact.

    Pages are GBK.  After the community is inserted, the sales page is
    retried until it fetches and parses successfully, then its building
    links are handed to self.build_info.
    """
    for temp in comm_url_list:
        comm_url = "http://www.njhouse.com.cn/2016/spf/" + temp
        try:
            co = Proxy_contact(app_name="nanjing", method='get', url=comm_url, headers=self.headers)
            co_res = co.contact()
        except Exception as e:
            log.error("小区页面访问失败{}".format(e))
            continue
        con = co_res.decode('gbk')
        comm = Comm(co_index)
        comm.co_id = re.search('prjid=(\d+)" ta', con).group(1)
        comm.co_name = re.search('<h2>(.*?)<em>', con).group(1)
        comm.area = re.search("\[.*?'>(.*?)</a>\]", con).group(1)
        comm.co_develops = re.search('开发企业</td>.*?">(.*?)</a', con, re.S | re.M).group(1)
        comm.co_address = re.search('项目地址.*?<td>(.*?)</td', con, re.S | re.M).group(1)
        comm.co_open_time = re.search('开盘时间.*?<td>(.*?)</td', con, re.S | re.M).group(1)
        comm.co_use = re.search('用途.*?<td>(.*?)</td', con, re.S | re.M).group(1)
        # findall: a community can carry several presale certificate numbers.
        comm.co_pre_sale = re.findall("'_blank'>(\d+)</a>", con)
        # comm.co_land_use = re.search('土地使用.*?span>(.*?)</span',con,re.S|re.M).group(1)
        comm.co_plan_project = re.search('工程规划.*?span>(.*?)</span', con, re.S | re.M).group(1)
        comm.co_plan_useland = re.search('用地规划.*?span>(.*?)</span', con, re.S | re.M).group(1)
        comm.co_work_pro = re.search('施工.*?span>(.*?)</span', con, re.S | re.M).group(1)
        comm.co_all_house = re.search('入网总套数.*?">(.*?)</td', con, re.S | re.M).group(1)
        comm.co_all_size = re.search('入网总面积.*?td>(.*?)m', con, re.S | re.M).group(1)
        comm.insert_db()
        build_temp = "http://www.njhouse.com.cn/2016/spf/sales.php?prjid=" + str(
            comm.co_id)
        # Retry until the sales page is fetched and parsed.
        # NOTE(review): this loops forever if the URL is permanently dead.
        while True:
            try:
                build_proxy = Proxy_contact(app_name="nanjing", method='get', url=build_temp, headers=self.headers)
                build_temp_con = build_proxy.contact()
                build_temp_con = build_temp_con.decode('gbk')
                html = etree.HTML(build_temp_con)
                break
            except:
                continue
        build_url_list = html.xpath("//div[@class='fdxs_left']/a/@href")
        self.build_info(build_url_list, comm.co_id)
def get_comm_info(self, comm_info_list):
    """Extract name / total units / total area from each HTML table row.

    Each entry in comm_info_list is an HTML fragment; three positional
    <td> cells are scraped and the record is saved.  A failed row is
    reported and skipped.
    """
    # (attribute, pattern) pairs — each extra '<td.*?' hops one cell to the right.
    field_rules = (
        ('co_name', '<td>(.*?)</td>'),
        ('co_all_house', '<td.*?<td>(.*?)</td>'),
        ('co_all_size', '<td.*?<td.*?<td>(.*?)</td>'),
    )
    for row_html in comm_info_list:
        try:
            record = Comm(co_index)
            for attr, pattern in field_rules:
                setattr(record, attr,
                        re.search(pattern, row_html, re.S | re.M).group(1))
            record.insert_db()
        except Exception as e:
            print('小区错误,co_index={},html_str={}'.format(co_index, row_html), e)
def get_comm_detail(self, detail_url, area):
    """Scrape one yfci.gov.cn presale page, its building rows, and houses.

    The community id comes from the ``FD=`` query parameter of detail_url;
    the caller supplies the district name in ``area``.
    """
    try:
        comm = Comm(co_index)
        comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
        response = requests.get(comm_detail_url, headers=self.headers)
        html = response.text
        comm.co_develops = re.search('id="kfsmc".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_name = re.search('id="PresellName".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_address = re.search('id="HouseRepose".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_build_size = re.search('id="PresellArea".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_all_house = re.search('id="djrqtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_land_use = re.search('id="landinfo".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_type = re.search('id="zczjtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale = re.search('id="bookid".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale_date = re.search('id="FZDatebegin".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_open_time = re.search('id="kpdate".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_id = re.search('FD=(.*?)&', detail_url, re.S | re.M).group(1)
        comm.area = area
        comm.insert_db()
        # The "donglist" table holds one <tr> per building.
        build_html = re.search('id="donglist".*?</table>', html, re.S | re.M).group()
        build_info_list = re.findall('<tr.*?</tr>', build_html, re.S | re.M)
        for i in build_info_list:
            build = Building(co_index)
            build.co_id = comm.co_id
            build.bu_address = re.search('<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_num = re.search('<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_floor = re.search('<td.*?<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            house_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            build.bu_id = re.search("LID=(.*?)$", house_url, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(house_url, comm.co_id, build.bu_id)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
def comm_parse(self, url_list):  # community info parsing
    """Worker loop: pull (url, area, type) tuples from url_list forever.

    NOTE(review): ``url_list`` is presumably a Queue (see the commented
    line) and ``get()`` blocks, so the loop never terminates on its own —
    confirm against the caller.  A single Comm instance is shared across
    iterations, and the local ``type`` shadows the builtin.
    """
    co = Comm(co_index)
    # url_list = Queue()
    while True:
        url, area, type = url_list.get()
        try:
            res = requests.get(url, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        con = res.text
        co.area = area
        co.co_type = type
        co.co_id = re.search('id=(\d+)', url).group(1)
        co.co_develops = re.search('企业名称.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_name = re.search('项目名称.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_address = re.search('项目座落.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_use = re.search('房屋用途.*?> (.*?)<', con, re.S | re.M).group(1)
        try:
            co.co_pre_sale = re.search('许可证号.*?> (.*?)<', con, re.S | re.M).group(1)
        except:
            co.co_pre_sale = None
        # Second request: the ProjInfo endpoint carries size/green/date fields.
        new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
        a_res = requests.get(new_url, headers=self.headers)
        a_con = a_res.text
        co.co_build_size = re.search('建筑面积.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_all_house = re.search('销售套数.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_green = re.search('绿化率.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_build_start_time = re.search('开工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_build_end_time = re.search('竣工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_volumetric = re.search('容积率.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.insert_db()
        global count
        count += 1
        print(count)
        try:
            self.build_parse(co.co_id, )
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def get_comm_detail(self, comm_detail_url):
    """Fetch one pre-sale certificate page and map it onto a Comm record.

    Always returns the Comm instance — partially populated (or empty)
    when the fetch or any field extraction fails; the failure is printed.
    """
    comm = Comm(co_index)
    try:
        page = requests.get(comm_detail_url, headers=self.headers).text
        flags = re.S | re.M
        comm.co_pre_sale = re.search('预售许可证号:.*?<td.*?>(.*?)<', page, flags).group(1)
        comm.co_land_use = re.search('土地使用权证号及用途:.*?<td.*?>(.*?)</td', page, flags).group(1)
        comm.co_build_size = re.search('本期预售总建筑面积:.*?<td.*?>(.*?)</td', page, flags).group(1)
        comm.co_all_house = re.search('本期总单元套数:.*?<td.*?>(.*?)</td', page, flags).group(1)
        comm.co_pre_sale_date = re.search('发证日期:.*?<td.*?>(.*?)</td', page, flags).group(1)
    except Exception as e:
        print('小区详情错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
    return comm
def get_comm_info(self, comm_url_list):
    """Scrape xx.yyfdcw.com community pages, then hand off building links."""
    for comm_url in comm_url_list:
        comm_detail = "http://xx.yyfdcw.com" + comm_url
        try:
            comm_res = requests.get(comm_detail, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        con = comm_res.text
        comm = Comm(co_index)
        # NOTE(review): the id is taken from the page body, not from comm_url.
        comm.co_id = re.search('ID=(\d+)', con).group(1)
        comm.co_name = re.search('lpname">.*?<h2>(.*?)</h2', con, re.S | re.M).group(1)
        comm.co_develops = re.search('开发商:.*?Kfs">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_green = re.search('绿化率:.*?Lhl">(.*?)</span', con, re.S | re.M).group(1)
        comm.area = re.search('区域:.*?Name">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_address = re.search('位置:</b>(.*?)</li', con, re.S | re.M).group(1)
        comm.co_build_size = re.search('建筑面积:.*?l5">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_all_house = re.search('总户数:.*?hs">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_plan_useland = re.search('用地.*?l4">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_plan_project = re.search('工程.*?l3">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_build_type = re.search('楼盘类型.*?Type">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_all_size = re.search('占地面积.*?mianji">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_land_use = re.search('使用权证.*?l1">(.*?)</span', con, re.S | re.M).group(1)
        comm.insert_db()
        try:
            build_list = re.findall(
                '<td align="center">.*?<a href="(.*?)"', con, re.S | re.M)
            if len(build_list) > 0:
                self.get_build_info(build_list, comm.co_id)
            else:
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id))
                continue
        except:
            print("co_index={},小区co_id={}没有楼栋".format(
                co_index, comm.co_id))
            continue
def get_comm_info(self, comm_list):
    """Scrape lhfdc.gov.cn project pages by code, then their buildings.

    Totals are assembled from four counters each: sold/unsold residential
    plus sold/unsold commercial units (and the matching four areas).
    """
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/ProjectInfo.aspx?code=' + str(
                i)
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('id="PROJECT_XMMC1">(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('id="PROJECT_XMDZ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('id="PROJECT_KFQY_NAME1">(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('id="PROJECT_SZQY">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('id="PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('id="PROJECT_RJL">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_start_time = re.search(
                'id="PROJECT_JHKGRQ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_end_time = re.search(
                'id="PROJECT_JHJGRQ">(.*?)<', html, re.S | re.M).group(1)
            # Four unit counters: sold/unsold residential + sold/unsold commercial.
            house_all = re.search('id="lbYsZZTs">(.*?)<', html, re.S | re.M).group(1)
            house_all_a = re.search('id="lbWsZZTs">(.*?)<', html, re.S | re.M).group(1)
            bus_all = re.search('id="lbWsSYTs">(.*?)<', html, re.S | re.M).group(1)
            bus_all_a = re.search('id="lbYsSYTs">(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = int(house_all_a) + int(house_all) + int(
                bus_all) + int(bus_all_a)
            # Matching four areas, summed the same way.
            area_size_a = re.search('id="lbYsZZMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_b = re.search('id="lbWsZZMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_c = re.search('id="lbWsSYMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_d = re.search('id="lbYsSYMj">(.*?)<', html, re.S | re.M).group(1)
            comm.co_size = float(area_size_a) + float(area_size_b) + float(
                area_size_c) + float(area_size_d)
            comm.co_id = str(i)
            comm.insert_db()
            self.get_build_info(comm.co_id)
        except Exception as e:
            print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
def comm_info(self, url_list):
    """Scrape Wuhan (scxx.fgj.wuhan.gov.cn) community pages via a proxy.

    The registration number ("DengJh") extracted from each URL is the
    community id; it is GBK percent-encoded into the query string.  Pages
    are decoded as gb18030.  After a successful insert, the building page
    (4.asp) is derived from the community page (3.asp).
    """
    for temp_url in url_list:
        try:
            comm = Comm(co_index)
            comm.co_id = re.search('Jh=(.*?\d+)', temp_url).group(1)
            # The registration number must be percent-encoded as GBK.
            parse_url = parse.quote(comm.co_id, encoding='gbk')
            comm_url = 'http://scxx.fgj.wuhan.gov.cn/3.asp?DengJh=' + parse_url
            proxy = Proxy_contact(app_name='wuhan', method='get', url=comm_url, headers=self.headers)
            res = proxy.contact()
            # res = requests.get(comm_url,headers=self.headers)
            con = res.decode('gb18030')
            # comm.co_id = re.search('Jh=(.*?)',temp_url).group(1)
            comm.co_name = re.search('项目名称.*?">(.*?)<', con, re.S | re.M).group(1)
            comm.co_all_house = re.search('套数.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_address = re.search('坐落.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_build_start_time = re.search('开工时间.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_build_end_time = re.search('竣工时间.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_size = re.search('用地面积.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_build_size = re.search('建筑面积.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_volumetric = re.search('容积率.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_develops = re.search('开发企业</TD>.*?">(.*?)</TD', con, re.S | re.M).group(1)
            comm.co_land_use = re.search('土地使用证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_plan_useland = re.search('用地规划许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_plan_project = re.search('工程规划许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_work_pro = re.search('施工许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.insert_db()
            log.debug('{}插入成功'.format(comm.co_name))
        except Exception as e:
            log.error('小区错误{}'.format(e))
            continue
        # BUG FIX: re.sub('3', '4', comm_url) replaced EVERY '3' in the URL,
        # corrupting percent-encoded bytes in the DengJh parameter; only the
        # page name should change (3.asp -> 4.asp).
        build_detail = comm_url.replace('3.asp', '4.asp')
        self.build_info(build_detail, comm.co_id)
def get_comm_info(self, comm_url_list):
    """Scrape old.newhouse.cnnbfdc.com community pages, then their buildings.

    Building data is assembled from four parallel findall lists (URL,
    name, unit count, qrykey) joined by index.  NOTE(review): this assumes
    all four regexes match the same rows in the same order — verify
    against the page markup.
    """
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_pre_sale = re.findall('预\(现\)售证名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            # Strip the unit suffixes ("m²" / "套") off size and count.
            comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html, re.S | re.M)[0].replace(
                'm²', '').strip()
            comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html, re.S | re.M)[0].replace(
                '套', '').strip()
            comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_id = re.findall('mobanshow.aspx\?projectid=(.*?)"', html, re.S | re.M)[0].strip()
            comm.insert_db()
            global count
            count += 1
            print(count)
            # Four parallel lists joined by index below.
            build_url_list = re.findall("window.open\('(.*?)'", html, re.S | re.M)
            bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<", html, re.S | re.M)
            bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<", html, re.S | re.M)
            qrykey = re.findall("qrykey=(.*?)&", html, re.S | re.M)
            for index in range(len(build_url_list)):
                try:
                    build = Building(co_index)
                    build.bu_name = bu_name_list[index].strip()
                    build.bu_all_house = bu_all_house_list[index].strip()
                    build.co_id = comm.co_id
                    build.bu_id = qrykey[index].strip()
                    build.insert_db()
                except Exception as e:
                    print(e)
            self.get_house_info(build_url_list)
        except Exception as e:
            print(e)
def get_comm_info(self, co_url_list):
    """Scrape 183.63.60.194:8808 presale project pages via the shared session.

    A 1-second sleep throttles requests.  Building fields come from four
    parallel findall lists (address, number, floor, URL) passed together
    to get_build_info.
    """
    for i in co_url_list:
        comm = Comm(co_index)
        comm_url = 'http://183.63.60.194:8808/public/web/ysxm?ysxmid=' + i
        try:
            time.sleep(1)  # throttle: be gentle with the government server
            response = self.s.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_id = re.search('ysxmid=(.*?)$', comm_url).group(1)
            comm.co_develops = re.findall('kfsmc.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_name = re.findall('PresellName.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('ItemRepose.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PresellArea.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_all_house = re.findall('djrqtd.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_land_use = re.findall('landinfo.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_type = re.findall('zczjtd.*?>(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('FQ.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale_date = re.findall('FZDatebegin.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('bookid.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            # Parallel building lists: each pattern hops one more "center" cell.
            bu_address_list = re.findall(
                'onmouseout.*?center.*?center">(.*?)<', html, re.S | re.M)
            bu_num_list = re.findall(
                'onmouseout.*?center.*?center.*?center">(.*?)<', html, re.S | re.M)
            bu_floor_list = re.findall(
                'onmouseout.*?center.*?center.*?center.*?center">(.*?)<', html, re.S | re.M)
            bu_url_list = re.findall('onmouseout.*?href="(.*?)"', html, re.S | re.M)
            self.get_build_info(bu_address_list, bu_num_list, bu_floor_list,
                                bu_url_list, comm.co_id)
            global count
            count += 1
            print(count)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url_list):
    """Scrape old.newhouse.cnnbfdc.com community pages and their buildings.

    NOTE(review): the inner loop reuses variable ``i`` (shadowing the
    community URL fragment); building names are matched by index against
    an xpath list of <a href='#'> titles — assumed to be in row order.
    """
    for i in comm_url_list:
        comm = Comm(co_index)
        comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
        try:
            response = requests.get(comm_url, headers=self.headers)
        except Exception as e:
            print("{}城市无法访问小区{}".format(city, comm_url), e)
            continue
        html = response.text
        con = etree.HTML(html)
        comm.co_id = re.search('id=(\d+)', i).group(1)
        comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_pre_sale = re.findall('售证名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html, re.S | re.M)[0]
        comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.insert_db()
        bu_all_house_list = re.findall(
            'window.open.*?center.*?center.*?>(.*?)<', html, re.S | re.M)
        try:
            bu_url_list = re.findall("window\.open\('(.*?)'", html, re.S | re.M)
        except Exception as e:
            print("{}城市{}小区无楼栋".format(city, comm.co_name), e)
            continue
        for i in range(len(bu_url_list)):
            build = Building(co_index)
            bu_url = bu_url_list[i]
            build.bu_all_house = bu_all_house_list[i]
            build.co_name = comm.co_name
            build.bu_num = con.xpath("//a[@href='#']/@title")[i]
            build.bu_id = re.search('key=(\d+)&', bu_url).group(1)
            build.co_id = comm.co_id
            build.insert_db()
            self.get_house_info(bu_url, build.bu_id)
def start_crawler(self):
    """Entry point: page through the searchSpf listing, scrape each project.

    Page count comes from the "/N页" marker via AllListUrl; detail pages
    are GBK.  NOTE(review): ``self.build_parse(co.co_id)`` runs even when
    the except branch was taken, so a failed first iteration can raise on
    an unset co_id — confirm intended behavior.
    """
    start_url = self.start_url + "searchSpf.jsp?nowPage=1"
    b = AllListUrl(
        first_page_url=start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='/(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = self.start_url + "searchSpf.jsp?nowPage=" + str(i)
        res = requests.get(url, headers=self.headers)
        html = etree.HTML(res.content.decode())
        url_list = html.xpath("//b/a/@href")
        for comm_temp in url_list:
            try:
                # Rewrite the listing link into the basic-info detail link.
                comm_url = self.start_url + comm_temp.replace(
                    "./xmxxmainNew", 'xmxx/xmjbxx')
                com_res = requests.get(comm_url, headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</td', con,
                                          re.S | re.M).group(1)
                co.co_develops = re.search('开 发 商.*?">(.*?)</td', con,
                                           re.S | re.M).group(1)
                co.co_all_house = re.search('总 套 数.*?<td>(.*?)</td', con,
                                            re.S | re.M).group(1)
                co.co_green = re.search('绿 化 率.*?<td>(.*?)</td', con,
                                        re.S | re.M).group(1)
                co.co_volumetric = re.search('容 积 率.*?<td>(.*?)</td', con,
                                             re.S | re.M).group(1)
                try:
                    co.co_build_size = re.search('建设规模.*?" >(.*?)平', con,
                                                 re.S | re.M).group(1)
                except:
                    co.co_build_size = None
                co.insert_db()
            except Exception as e:
                log.error('{}小区错误{}'.format(comm_temp, e))
            self.build_parse(co.co_id)
def comm_info(self, co_id):
    """Scrape one lsjs.gov.cn development page, then page through its units.

    The YSXM.ashx endpoint is paged 5 records at a time; the first element
    of "data" carries the total count used as the stop condition, and the
    remaining elements are the building records.
    """
    comm_url = "http://www.lsjs.gov.cn/WebLSZFGB/LPDetail.aspx?RowGuid=" + co_id
    co_res = requests.get(comm_url, headers=self.headers)
    con = co_res.text
    co = Comm(co_index)
    co.co_name = re.search('楼 盘 名 称:(.*?)<br', con).group(1)
    co.co_id = co_id
    co.area = re.search('所 属 城 区:.*?">(.*?)</span', con).group(1)
    co.co_address = re.search('楼 盘 坐 落:.*?">(.*?)</span', con).group(1)
    co.co_develops = re.search('项 目 公 司:.*?mc">(.*?)</span', con,
                               re.S | re.M).group(1)
    co.co_pre_sale = re.search('预销售证号.*?">(.*?)</span', con,
                               re.S | re.M).group(1)
    co.co_all_house = re.search('预售总套数.*?td>(.*?)</td', con,
                                re.S | re.M).group(1)
    co.co_all_size = re.search('预售总面积.*?td>(.*?)</td', con,
                               re.S | re.M).group(1)
    co.co_pre_sale_date = re.search('时间.*?">(.*?)</span', con,
                                    re.S | re.M).group(1)
    co.insert_db()
    url = 'http://www.lsjs.gov.cn/WebLSZFGB/Ashx/YSXM.ashx'
    count = 1  # local page counter (intentionally shadows the module counter)
    while True:
        data = {
            "method": "getzxl",
            "PageSize": 5,
            "CurrentPageIndex": str(count),
            "YSXMID": co_id,
            # 'Searchkey':''
        }
        res = requests.post(url, data=data, headers=self.headers)
        con_dict = json.loads(res.text)
        # data[0] carries the total record count; data[1:] are the rows.
        num = con_dict["data"][0]['TotalNum']
        info_list = con_dict["data"][1:]
        for info in info_list:
            bu_id = info["YSZID"]
            self.build_info(co_id, bu_id)
        if int(num) < count * 5:
            break  # fetched past the last page
        else:
            count += 1
            continue
def get_comm_info(self, comm_url):
    """Scrape one GBK "buildinfo" community page fetched through the proxy.

    The incoming URL points at the "buildingdetail" view and is rewritten
    to the "buildinfo" view first; the buildingid from the original URL
    is forwarded to get_build_info alongside the scraped co_id.
    """
    comm = Comm(co_index)
    comm_url = comm_url.replace('buildingdetail', 'buildinfo')
    response = self.request_proxy(comm_url, headers=self.headers)
    html = response.content.decode('gbk')
    comm.co_name = re.search('class="sf_xq_xmmc">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.area = re.search('id="Label_CityArea">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_pre_sale_date = re.search('class="sf_xq_jfsj">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_build_type = re.search('id="lbl_JZJG".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_address = re.search('id="Label_ProjectAdress">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_pre_sale = re.search('id="Label_SallPreDocuments">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_all_house = re.search('id="lbl_ZTS".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_build_size = re.search('id="lbl_JZMJ".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_all_size = re.search('id="lbl_ZDMJ".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_develops = re.search('id="Label_DevName">.*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_id = re.search('action=.*?buildingid=(.*?)"', html, re.S | re.M).group(1).strip()
    comm.insert_db()
    buildingid = re.search('buildingid=(.*?)$', comm_url, re.S | re.M).group(1)
    self.get_build_info(buildingid, comm.co_id)
def get_comm_detail(self, comm_list):
    """Scrape lpsfdc.cn pages that may list several communities per page.

    Eight parallel findall lists are joined by index to emit one Comm per
    project.  NOTE(review): the inner ``for i in range(...)`` loops reuse
    the outer loop variable ``i``, and every Building is attached to
    co_id_list[0] — verify that a page's buildings all belong to the
    first listed project.
    """
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.lpsfdc.cn/Templets/LPS/aspx/' + i
            content = requests.get(comm_url)
            html = content.text
            # Parallel lists, one entry per project on the page.
            co_name_list = re.findall('项目名称:.*?>(.*?)<', html, re.S | re.M)
            co_id_list = re.findall('hdProjectCode" value="(.*?)"', html, re.S | re.M)
            co_develops_list = re.findall('开发企业:.*?>(.*?)<', html, re.S | re.M)
            co_build_size_list = re.findall('TJ_ZMJ">(.*?)<', html, re.S | re.M)
            co_address_list = re.findall('Pro_XMDZ">(.*?)<', html, re.S | re.M)
            co_owner_list = re.findall('Pro_ZZZSBH">(.*?)<', html, re.S | re.M)
            co_pre_sale_list = re.findall('Pro_XKZH">(.*?)<', html, re.S | re.M)
            co_all_house_list = re.findall('TJ_HZYSTS">(.*?)<', html, re.S | re.M)
            for i in range(0, len(co_name_list)):
                try:
                    comm.co_name = co_name_list[i]
                    comm.co_id = co_id_list[i]
                    comm.co_develops = co_develops_list[i]
                    comm.co_build_size = co_build_size_list[i]
                    comm.co_address = co_address_list[i]
                    comm.co_owner = co_owner_list[i]
                    comm.co_pre_sale = co_pre_sale_list[i]
                    comm.co_all_house = co_all_house_list[i]
                    comm.insert_db()
                    # global count
                    # count += 1
                    # print(count)
                except Exception as e:
                    print('co_index={}, commiunty error'.format(co_index,), e)
            build_url_list = re.findall("radiobuild' id='build(.*?)'", html, re.S | re.M)
            build_name_list = re.findall("radiobuild.*?<span.*?>(.*?)<", html, re.S | re.M)
            for i in range(0, len(build_url_list)):
                build = Building(co_index)
                build.bu_id = build_url_list[i]
                build.bu_num = build_name_list[i]
                build.co_id = co_id_list[0]
                build.insert_db()
            self.get_build_info(build_url_list)
        except Exception as e:
            print(e)
def get_comm_info(self, comm_url_list):
    """Scrape qdfd.com.cn community pages (POST by projectID) and store them.

    Fix: ``data`` was assigned inside the ``try`` but referenced in the
    ``except`` message — an early failure (e.g. in ``Comm(co_index)``) would
    raise ``NameError`` inside the handler. It is now built before the ``try``.

    :param comm_url_list: iterable of project ids used as the POST payload.
    """
    for project_id in comm_url_list:
        # Built outside the try so the error report can always show it.
        data = {'projectID': project_id}
        try:
            comm = Comm(co_index)
            comm_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhProjectInfo.jsp'
            response = requests.post(url=comm_url, data=data, headers=self.headers)
            html = response.text
            comm.co_id = project_id
            comm.co_name = re.findall('bszn_title">(.*?)<', html, re.S | re.M)[0].strip()
            comm.area = re.findall('所在区县:.*?<span>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_address = re.findall('项目地址:.*?<span>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_develops = re.findall('企业名称:.*?<a.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_all_house = re.findall('<td>总套数.*?<td class="xxxx_list3">(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_build_size = re.findall('<td>总面积.*?<td class="xxxx_list3">(.*?)<', html, re.S | re.M)[0].strip()
            comm.insert_db()
            # Building ids are embedded in javascript onclick handlers.
            build_logo_list = re.findall('javascript:getBuilingList\("(.*?)"', html, re.S | re.M)
            self.get_build_info(build_logo_list, project_id)
        except Exception as e:
            print('青岛小区问题,url post data is:={}'.format(data), e)
def start_crawler(self):
    """Crawl lzfc.com.cn: community list -> community pages -> building pages.

    Fixes:
    - The inner building loop re-ran ``ProducerListUrl`` against the
      *community* url with ``comm.to_dict()`` (copy-paste of the outer call),
      so ``build_detail_url`` and the ``Building`` rules were never used.
      It now targets ``build_detail_url`` with ``build.to_dict()``.
    - Inner loop no longer shadows the outer loop variable ``i`` nor rebinds
      ``build_url`` while iterating it.
    - Removed a duplicate ``comm.co_name`` assignment.
    """
    response = requests.get(url)
    html = response.text
    tree = etree.HTML(html)
    all_url = tree.xpath('//a[@class="a_name"]/@href')
    for href in all_url:
        comm = Comm(co_index)
        if href == '#':
            # Placeholder anchors carry no detail page.
            continue
        comm_url = 'http://www.lzfc.com.cn:8080' + href
        # Regex rules: values are injected client-side via ccN.innerHTML.
        comm.co_name = "cc0.innerHTML='(.*?)'"
        comm.co_address = "cc1.innerHTML='(.*?)'"
        comm.area = "cc2.innerHTML='(.*?)'"
        comm.co_use = "cc4.innerHTML='(.*?)'"
        comm.co_develops = "cc5.innerHTML='(.*?)'"
        comm.co_open_time = "cc6.innerHTML='(.*?)'"
        comm.co_all_house = "cc9.innerHTML='(.*?)'"
        comm.co_build_size = "cc11.innerHTML='(.*?)'"
        comm.co_id = "BaseCode=(.*?)'"
        p = ProducerListUrl(page_url=comm_url,
                            request_type='get', encode='gbk',
                            analyzer_rules_dict=comm.to_dict(),
                            current_url_rule="queryBuildHerf1.href='(.*?)'",
                            analyzer_type='regex')
        build_url = p.get_details()
        for build_href in build_url:
            build = Building(co_index)
            build_detail_url = 'http://www.lzfc.com.cn:8080' + build_href
            # Positional rules: each extra 'center' skips one table cell.
            build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
            build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
            build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
            build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
            build.co_name = 'fontbg_red">(.*?)<'
            build.bu_id = "onclick=comInfoView\('(.*?)'\)"
            build_p = ProducerListUrl(page_url=build_detail_url,
                                      request_type='get', encode='gbk',
                                      analyzer_rules_dict=build.to_dict(),
                                      current_url_rule="queryBuildHerf1.href='(.*?)'",
                                      analyzer_type='regex')
            build_p.get_details()
def start_crawler(self):
    """Walk every district of fangdi.com.cn, page by page, and hand each
    community row (plus its pre-filled Comm record) to get_comm_info."""
    flags = re.S | re.M
    for district in self.area_list:
        data = {'districtID': district}
        res = requests.post(url='http://www.fangdi.com.cn/complexPro.asp', data=data)
        html_str = res.content.decode('gbk')
        # 根据返回结果 获取每个地区的返回分页
        # (the district response lists one relative url per result page)
        url_list = re.findall('value="(/complexpro.*?)"', html_str, flags)
        for page_path in url_list:
            response = requests.get('http://www.fangdi.com.cn' + page_path,
                                    headers=self.headers)
            html = response.content.decode('gbk')
            # Narrow to the listing table before splitting into <tr> rows.
            comm_html = re.search('位置<.*?页/共', html, flags).group()
            for info in re.findall('<tr valign=.*?</tr>', comm_html, flags):
                try:
                    comm = Comm(co_index)
                    comm_url = re.search('<a href=(.*?)>', info, flags).group(1)
                    # Positional parsing: every extra '<td' hops one column.
                    comm.co_name = re.search('<a.*?>(.*?)<', info, flags).group(1)
                    comm.co_address = re.search('<a.*?<td.*?>(.*?)<', info, flags).group(1)
                    comm.co_all_house = re.search('<a.*?<td.*?<td.*?>(.*?)<', info, flags).group(1)
                    comm.co_all_size = re.search('<a.*?<td.*?<td.*?<td.*?>(.*?)<', info, flags).group(1)
                    comm.area = re.search('<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, flags).group(1)
                    comm.co_id = re.search('projectID=(.*?)==', info, flags).group(1)
                    self.get_comm_info(comm_url, comm)
                except Exception as e:
                    print(
                        '小区错误,co_index={},url={}'.format(
                            co_index, 'http://www.fangdi.com.cn' + page_path), e)
def get_comm_info(self, comm_url_list):
    """Scrape community pages under 222.139.215.89:81/yscx/ and store them.

    Fix: the loop body had no error guard, so a single malformed page
    (regex miss -> AttributeError, or a network error) aborted the entire
    crawl. Each iteration is now wrapped in try/except, matching the
    sibling crawlers' per-item reporting style.

    :param comm_url_list: iterable of relative page paths.
    """
    for i in comm_url_list:
        comm_url = 'http://222.139.215.89:81/yscx/' + i
        try:
            comm = Comm(co_index)
            response = requests.get(comm_url, headers=self.headers)
            html = response.content.decode('gbk')  # site serves GBK
            comm.co_name = re.search('项目名称.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_pre_sale = re.search('售许可证号.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开发建设单位.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项目地址.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('建筑面积.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = re.search('住宅套数.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.co_pre_sale_date = re.search('批准日期.*?<td>(.*?)<', html, re.S | re.M).group(1)
            comm.insert_db()
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url_list):
    """Scrape each community's 'xinxi.html' info page and store a Comm record.

    Fixes:
    - The original created ONE ``Comm`` outside the loop and reused it for
      every url, risking field carry-over between records; a fresh instance
      is now built per iteration (consistent with the sibling crawlers).
    - Bare ``except:`` also swallowed ``KeyboardInterrupt``/``SystemExit``;
      narrowed to ``except Exception``.

    :param comm_url_list: iterable of community base urls.
    """
    for url in comm_url_list:
        comm_url = url + "xinxi.html"
        try:
            co = Comm(co_index)  # fresh record per community
            res = requests.get(comm_url, headers=self.headers)
            con = res.text
            html = etree.HTML(con)
            co.co_id = re.search('/(\d+)', con).group(1)
            co.co_name = html.xpath("//h1[@class='fl']/a/@title")[0]
            co.co_address = re.search("楼盘地址.*?>(.*?)</li>", con).group(1)
            co.co_all_house = re.search("规划户数.*?>(.*?)</li>", con).group(1)
            co.co_develops = re.search("开 发 商.*?>(.*?)</li>", con).group(1)
            co.area = re.search("片区.*?>(.*?)</li>", con).group(1)
            co.co_type = re.search("项目类型.*?>(.*?)</li>", con).group(1)
            co.co_build_type = re.search("建筑类型.*?>(.*?)</li>", con).group(1)
            co.co_size = re.search("规划面积.*?>(.*?)</li>", con).group(1)
            co.co_build_size = re.search("建筑面积.*?>(.*?)</li>", con).group(1)
            try:
                co.co_open_time = re.search("开盘时间.*?>(.*?)</li>", con).group(1)
            except Exception:
                co.co_open_time = None  # optional field on some pages
            co.co_green = re.search("绿 化 率.*?>(.*?)</li>", con).group(1)
            co.co_volumetric = re.search("容 积 率.*?>(.*?)</li>", con).group(1)
            try:
                co.co_build_start_time = re.search("开工时间:(.*?)</span>", con).group(1)
                co.co_build_end_time = re.search("竣工时间:(.*?)</span>", con).group(1)
            except Exception:
                co.co_build_start_time = None  # optional pair of fields
                co.co_build_end_time = None
            co.insert_db()
        except Exception:
            # Best-effort crawl: skip pages that fail to fetch or parse.
            continue
def get_comm_info(self, comm_url_list):
    """Scrape fjlyfdc.com.cn community pages, persist each record, and
    recurse into the building list of every community."""
    for path in comm_url_list:
        comm_url = 'http://www.fjlyfdc.com.cn/' + path
        try:
            comm = Comm(co_index)
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            # Comm attribute -> page regex. Iterated in order, so a failed
            # match raises AttributeError exactly where the original did.
            field_patterns = (
                ('co_develops', '公司名称:.*?<td.*?>(.*?)<'),
                ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
                ('co_pre_sale', '预售许可证:.*?<td.*?>(.*?)<'),
                ('co_address', '项目坐落:.*?<td.*?>(.*?)<'),
                ('co_type', '规划用途:.*?<td.*?>(.*?)<'),
                ('co_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
                ('co_volumetric', '容积率:.*?<td.*?>(.*?)<'),
                ('co_green', '绿地率:.*?<td.*?>(.*?)<'),
                ('co_open_time', '开工日期:.*?<td.*?>(.*?)<'),
                ('co_build_end_time', '竣工日期:.*?<td.*?>(.*?)<'),
                ('co_all_house', '批准销售:.*?<td.*?>(.*?)<'),
                ('co_all_size', '批准销售:.*?<td.*?<td.*?>(.*?)<'),
            )
            for attr, pattern in field_patterns:
                setattr(comm, attr, re.search(pattern, html, re.S | re.M).group(1))
            # The community id is the trailing CaseId query parameter.
            comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
            comm.insert_db()
            build_url_list = re.findall(
                'href="(/House/BuildingInfo\?buildingInfoID=.*?&caseID=.*?)"',
                html, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)