def get_comm_info(self, comm_url_list):
    """Fetch each community detail page, parse it into a Comm record, insert
    it, then crawl the presale building links found on the page.

    Fix: every ``re.search(...).group(1)`` raised AttributeError on a page
    missing the expected markup and aborted the whole loop (only the HTTP
    call was guarded); parsing is now wrapped so a bad page is skipped.

    :param comm_url_list: relative detail-page URLs, each carrying an ``Id=``
        query argument
    """
    for comm_url in comm_url_list:
        url = self.url + comm_url
        try:
            res = requests.get(url, headers=self.headers)
        except Exception as e:
            print("co_index={},小区信息错误".format(co_index), e)
            continue
        con = res.text
        try:
            co = Comm(co_index)
            co.co_id = re.search(r'Id=(\d+)', comm_url).group(1)
            co.co_name = re.search('项目名称.*?Name">(.*?)</span', con, re.S | re.M).group(1)
            co.co_develops = re.search('开发商.*?Name">(.*?)</span', con, re.S | re.M).group(1)
            co.co_address = re.search('地址.*?Address">(.*?)</span', con, re.S | re.M).group(1)
            co.co_build_size = re.search('建筑面积.*?jzmj">(.*?)</span', con, re.S | re.M).group(1)
            co.co_type = re.search('项目类型.*?Type">(.*?)</span', con, re.S | re.M).group(1)
            co.co_size = re.search('占地面积.*?mzgm">(.*?)</span', con, re.S | re.M).group(1)
            co.co_green = re.search('绿化率.*?Jdl">(.*?)</span', con, re.S | re.M).group(1)
            co.co_volumetric = re.search('容积率.*?Rjl">(.*?)</span', con, re.S | re.M).group(1)
            co.co_build_start_time = re.search('开工日期.*?kgrq">(.*?)</span', con, re.S | re.M).group(1)
            co.co_build_end_time = re.search('竣工日期.*?syrq">(.*?)</span', con, re.S | re.M).group(1)
            co.insert_db()
            # presale links are the 【...】-wrapped anchors on the detail page
            presell_url_list = re.findall('【<a href="(.*?)" target="_self"', con, re.S | re.M)
            self.get_build_info(presell_url_list, co.co_id)
        except Exception as e:
            # unmatched pattern / downstream failure: skip this page, keep crawling
            print("co_index={},小区解析错误".format(co_index), e)
            continue
def comm_info(self, con):  # 小区及楼栋
    """Parse a community detail document: insert the Comm record, insert one
    Building row per highlighted table row, and return the room-page URLs.

    :param con: parsed lxml document of the detail page
    :return: list of per-building room-page hrefs
    """
    comm = Comm(co_index)
    # Comm attribute -> xpath of the span holding its text.
    field_map = (
        ('co_name', "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()"),    # 小区名称
        ('co_address', "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()"),                  # 小区地址
        ('co_develops', "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()"),                # 开发商
        ('co_size', "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()"),                          # 总面积
        ('co_build_size', "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()"),               # 建筑面积
        ('co_build_end_time', "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()"),        # 竣工时间
        ('co_plan_pro', "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()"),             # 用地规划许可
        ('co_work_pro', "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()"),                        # 施工许可
        ('co_green', "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"),              # 绿地百分比
        ('co_land_use', "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()"),                        # 土地使用证
    )
    for attr, xp in field_map:
        setattr(comm, attr, con.xpath(xp)[0])
    # The form action embeds the community id as its first run of digits.
    action = con.xpath("//form[@id='aspnetForm']/@action")[0]  # 小区id
    comm.co_id = re.search(r"\d+", action).group(0)
    comm.insert_db()

    # One shared Building record object, refilled row by row.
    build = Building(co_index)
    row_attrs = ('bu_id', 'bu_num', 'bu_all_house', 'size', 'bu_floor', 'bu_pre_sale')
    room_list = []
    for row in con.xpath("//tr[@style='color:#000066;']"):
        build.co_id = comm.co_id
        build.co_name = comm.co_name
        cells = row.xpath("./td/text()")
        for idx, attr in enumerate(row_attrs):
            setattr(build, attr, cells[idx])
        build.insert_db()
        room_list.append(row.xpath("./td/a/@href")[0])
    return room_list
def get_comm_info(self, comm_url_list):
    """Fetch each community's ``xinxi.html`` page, parse its fields into a
    Comm record and insert it.

    Fixes: a fresh Comm is now created per iteration (the original reused a
    single instance, letting fields parsed for one community leak into the
    next); bare ``except:`` clauses narrowed — optional-field fallbacks catch
    only the AttributeError raised when ``re.search`` finds no match; skipped
    pages are reported instead of silently dropped.
    """
    for url in comm_url_list:
        comm_url = url + "xinxi.html"
        try:
            res = requests.get(comm_url, headers=self.headers)
            con = res.text
            html = etree.HTML(con)
            co = Comm(co_index)
            co.co_id = re.search(r'/(\d+)', con).group(1)
            co.co_name = html.xpath("//h1[@class='fl']/a/@title")[0]
            co.co_address = re.search("楼盘地址.*?>(.*?)</li>", con).group(1)
            co.co_all_house = re.search("规划户数.*?>(.*?)</li>", con).group(1)
            co.co_develops = re.search("开 发 商.*?>(.*?)</li>", con).group(1)
            co.area = re.search("片区.*?>(.*?)</li>", con).group(1)
            co.co_type = re.search("项目类型.*?>(.*?)</li>", con).group(1)
            co.co_build_type = re.search("建筑类型.*?>(.*?)</li>", con).group(1)
            co.co_size = re.search("规划面积.*?>(.*?)</li>", con).group(1)
            co.co_build_size = re.search("建筑面积.*?>(.*?)</li>", con).group(1)
            try:
                co.co_open_time = re.search("开盘时间.*?>(.*?)</li>", con).group(1)
            except AttributeError:
                co.co_open_time = None  # field absent on some pages
            co.co_green = re.search("绿 化 率.*?>(.*?)</li>", con).group(1)
            co.co_volumetric = re.search("容 积 率.*?>(.*?)</li>", con).group(1)
            try:
                co.co_build_start_time = re.search("开工时间:(.*?)</span>", con).group(1)
                co.co_build_end_time = re.search("竣工时间:(.*?)</span>", con).group(1)
            except AttributeError:
                co.co_build_start_time = None
                co.co_build_end_time = None
            co.insert_db()
        except Exception as e:
            # request failure or a required pattern missing: skip this page
            print("co_index={},小区信息错误,url={}".format(co_index, comm_url), e)
            continue
def comm_parse(self, url_list):  # 小区信息解析
    """Worker loop: pull ``(url, area, type)`` tuples from the queue, parse
    each community detail page (plus its ProjInfo companion page), insert the
    record, then crawl its buildings.

    Fixes: a fresh Comm is built per item (the shared instance let fields
    leak between communities); ``type`` no longer shadows the builtin; a
    failed pattern match now skips the item instead of raising AttributeError
    out of the ``while`` loop and killing the worker.
    """
    global count
    # url_list is a Queue; get() blocks until an item is available.
    while True:
        url, area, co_type = url_list.get()
        try:
            res = requests.get(url, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        con = res.text
        try:
            co = Comm(co_index)
            co.area = area
            co.co_type = co_type
            co.co_id = re.search(r'id=(\d+)', url).group(1)
            co.co_develops = re.search('企业名称.*?> (.*?)<', con, re.S | re.M).group(1)
            co.co_name = re.search('项目名称.*?> (.*?)<', con, re.S | re.M).group(1)
            co.co_address = re.search('项目座落.*?> (.*?)<', con, re.S | re.M).group(1)
            co.co_use = re.search('房屋用途.*?> (.*?)<', con, re.S | re.M).group(1)
            try:
                co.co_pre_sale = re.search('许可证号.*?> (.*?)<', con, re.S | re.M).group(1)
            except AttributeError:
                co.co_pre_sale = None  # optional field
            # A second page carries the size / schedule details.
            new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
            a_res = requests.get(new_url, headers=self.headers)
            a_con = a_res.text
            co.co_build_size = re.search('建筑面积.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.co_all_house = re.search('销售套数.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.co_green = re.search('绿化率.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.co_build_start_time = re.search('开工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.co_build_end_time = re.search('竣工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.co_volumetric = re.search('容积率.*?> (.*?)<', a_con, re.S | re.M).group(1)
            co.insert_db()
        except Exception as e:
            print("co_index={},小区解析错误".format(co_index), e)
            continue
        count += 1
        print(count)
        try:
            self.build_parse(co.co_id)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def get_comm_info(self, comm_url, co_id):
    """Parse one community detail page into a Comm record and insert it.

    Bug fix: ``(㎡)`` in the 项目总规划面积 and 占地面积 patterns was an
    unescaped capture group, so ``group(1)`` returned the literal "㎡"
    instead of the page value; the parentheses are now escaped so group 1 is
    the actual field text.

    :param comm_url: absolute URL of the community detail page
    :param co_id: community id assigned by the caller
    """
    comm = Comm(co_index)
    response = requests.get(comm_url, headers=self.headers)
    html = response.text
    comm.co_name = re.search('项目名称:.*?class="left">(.*?)</td>', html, re.S | re.M).group(1)
    comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_build_size = re.search(r'项目总规划面积\(㎡\):.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.area = re.search('所属片区:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_size = re.search(r'占地面积\(㎡\):.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.co_id = co_id
    comm.insert_db()
def comm_info(self, url_list):
    """For each list entry, fetch the community info page (``3.asp``), insert
    the Comm record, then crawl the matching building page (``4.asp``).

    Bug fix: the building URL was derived with ``re.sub('3', '4', comm_url)``,
    which rewrites *every* '3' in the URL — including digits inside the
    DengJh id — not just the page name; it now swaps only ``3.asp`` for
    ``4.asp``.
    """
    for temp_url in url_list:
        try:
            comm = Comm(co_index)
            comm.co_id = re.search(r'Jh=(.*?\d+)', temp_url).group(1)
            # the site expects the id GBK-percent-encoded
            parse_url = parse.quote(comm.co_id, encoding='gbk')
            comm_url = 'http://scxx.fgj.wuhan.gov.cn/3.asp?DengJh=' + parse_url
            proxy = Proxy_contact(app_name='wuhan', method='get', url=comm_url, headers=self.headers)
            res = proxy.contact()
            con = res.decode('gb18030')
            comm.co_name = re.search('项目名称.*?">(.*?)<', con, re.S | re.M).group(1)
            comm.co_all_house = re.search('套数.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_address = re.search('坐落.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_build_start_time = re.search('开工时间.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_build_end_time = re.search('竣工时间.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_size = re.search('用地面积.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_build_size = re.search('建筑面积.*?">(.*?) ', con, re.S | re.M).group(1)
            comm.co_volumetric = re.search('容积率.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_develops = re.search('开发企业</TD>.*?">(.*?)</TD', con, re.S | re.M).group(1)
            comm.co_land_use = re.search('土地使用证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_plan_useland = re.search('用地规划许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_plan_project = re.search('工程规划许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.co_work_pro = re.search('施工许可证号.*?">(.*?)</', con, re.S | re.M).group(1)
            comm.insert_db()
            log.debug('{}插入成功'.format(comm.co_name))
        except Exception as e:
            log.error('小区错误{}'.format(e))
            continue
        # Building details live on 4.asp with the same query string.
        build_detail = comm_url.replace('3.asp', '4.asp')
        self.build_info(build_detail, comm.co_id)
def get_comm_info(self, comm_list):
    """Fetch each community's ProjectInfo page by code, build the Comm record
    (totals summed from the sold/unsold residence and business counters),
    insert it, then crawl its buildings.

    :param comm_list: iterable of community codes
    """
    for code in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = ('http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/'
                        'ProjectInfo.aspx?code=' + str(code))
            response = requests.get(comm_url, headers=self.headers)
            html = response.text

            def pick(pattern):
                # group(1) of the first match; a missing pattern raises
                # AttributeError, handled by the except below
                return re.search(pattern, html, re.S | re.M).group(1)

            comm.co_name = pick('id="PROJECT_XMMC1">(.*?)<')
            comm.co_address = pick('id="PROJECT_XMDZ">(.*?)<')
            comm.co_develops = pick('id="PROJECT_KFQY_NAME1">(.*?)<')
            comm.area = pick('id="PROJECT_SZQY">(.*?)<')
            comm.co_build_size = pick('id="PROJECT_GHZJZMJ">(.*?)<')
            comm.co_volumetric = pick('id="PROJECT_RJL">(.*?)<')
            comm.co_build_start_time = pick('id="PROJECT_JHKGRQ">(.*?)<')
            comm.co_build_end_time = pick('id="PROJECT_JHJGRQ">(.*?)<')
            # unit totals: sold + unsold residences, then both business counters
            house_sold = pick('id="lbYsZZTs">(.*?)<')
            house_unsold = pick('id="lbWsZZTs">(.*?)<')
            bus_unsold = pick('id="lbWsSYTs">(.*?)<')
            bus_sold = pick('id="lbYsSYTs">(.*?)<')
            comm.co_all_house = (int(house_unsold) + int(house_sold)
                                 + int(bus_unsold) + int(bus_sold))
            # area totals, summed in the same order as the counters above
            comm.co_size = (float(pick('id="lbYsZZMj">(.*?)<'))
                            + float(pick('id="lbWsZZMj">(.*?)<'))
                            + float(pick('id="lbWsSYMj">(.*?)<'))
                            + float(pick('id="lbYsSYMj">(.*?)<')))
            comm.co_id = str(code)
            comm.insert_db()
            self.get_build_info(comm.co_id)
        except Exception as e:
            print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_detail(self, comm_detail_url, co_id):
    """Parse one community detail page, insert the Comm record, then hand the
    building table rows to ``get_build_info``.

    Bug fix: ``(㎡)`` in the 项目总规划面积 pattern was an unescaped capture
    group, so ``group(1)`` yielded the literal "㎡" rather than the area
    value; the parentheses are now escaped.

    :param comm_detail_url: absolute URL of the detail page
    :param co_id: community id assigned by the caller
    """
    comm = Comm(co_index)
    try:
        response = requests.get(comm_detail_url, headers=self.headers)
        html = response.text
        comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_type = re.search('项目主体性质:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_all_size = re.search(r'项目总规划面积\(㎡\):.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_id = co_id
        comm.insert_db()
        # each building row starts at an id="lpan" anchor and runs to its </tr>
        build_info_list = re.findall('id="lpan".*?</tr>', html, re.S | re.M)
        self.get_build_info(build_info_list, co_id)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
def get_comm_info(self, comm_url_list):
    """For each ``(path, area)`` pair, fetch the project page, fill a Comm
    record and pass it with the presale building links to ``get_build_info``.

    Bug fix: the error handler formatted ``comm_url``, which was unbound when
    the failure occurred before the URL was built (e.g. the DevProjectId
    pattern not matching), so the handler itself raised NameError and killed
    the loop; the URL is now assembled before the ``try``.
    """
    for i in comm_url_list:
        # built before the try so the handler can always report the URL
        comm_url = 'http://58.51.240.121:8503/' + i[0]
        try:
            comm = Comm(co_index)
            comm.co_id = re.search('DevProjectId=(.*?)$', i[0]).group(1)
            comm.area = i[1]
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('id="ProjectInfo1_lblProjectName">(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('id="ProjectInfo1_lblProjectAddress">(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('id="ProjectInfo1_lblCorpName">(.*?)<', html, re.S | re.M).group(1)
            comm.co_type = re.search('id="ProjectInfo1_lblProjectType">(.*?)<', html, re.S | re.M).group(1)
            comm.co_size = re.search('id="ProjectInfo1_lblXmzgm">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_start_time = re.search('id="ProjectInfo1_lblJhkgrq">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('id="ProjectInfo1_lblZjzmj">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_end_time = re.search('id="ProjectInfo1_lblJhjfsyrq">(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('id="ProjectInfo1_lblRjl">(.*?)<', html, re.S | re.M).group(1)
            comm.co_green = re.search('id="ProjectInfo1_lblJdl">(.*?)<', html, re.S | re.M).group(1)
            # presale building pages; the Comm object is handed along so the
            # callee can insert it with the building data
            build_url_list = re.findall(r'href="(Pub_ysxx\.aspx\?PresellId=.*?)"', html, re.S | re.M)
            self.get_build_info(build_url_list, comm)
        except Exception as e:
            print('请求错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url_list):
    """Fetch each community page, fill and insert a Comm record, then crawl
    its building links.

    NOTE(review): 开工日期 is stored in ``co_open_time`` here while sibling
    spiders map it to ``co_build_start_time`` — confirm which field is
    intended before relying on either.
    """
    for i in comm_url_list:
        comm_url = 'http://www.fjlyfdc.com.cn/' + i
        try:
            comm = Comm(co_index)
            response = requests.get(comm_url, headers=self.headers)
            html = response.text

            def grab(pattern):
                # first group of the first match; a missing pattern raises
                # AttributeError, handled by the except below
                return re.search(pattern, html, re.S | re.M).group(1)

            comm.co_develops = grab('公司名称:.*?<td.*?>(.*?)<')
            comm.co_name = grab('项目名称:.*?<td.*?>(.*?)<')
            comm.co_pre_sale = grab('预售许可证:.*?<td.*?>(.*?)<')
            comm.co_address = grab('项目坐落:.*?<td.*?>(.*?)<')
            comm.co_type = grab('规划用途:.*?<td.*?>(.*?)<')
            comm.co_build_size = grab('建筑面积:.*?<td.*?>(.*?)<')
            comm.co_volumetric = grab('容积率:.*?<td.*?>(.*?)<')
            comm.co_green = grab('绿地率:.*?<td.*?>(.*?)<')
            comm.co_open_time = grab('开工日期:.*?<td.*?>(.*?)<')
            comm.co_build_end_time = grab('竣工日期:.*?<td.*?>(.*?)<')
            comm.co_all_house = grab('批准销售:.*?<td.*?>(.*?)<')
            # second <td> after the 批准销售 label holds the approved size
            comm.co_all_size = grab('批准销售:.*?<td.*?<td.*?>(.*?)<')
            comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
            comm.insert_db()
            build_url_list = re.findall(
                'href="(/House/BuildingInfo\?buildingInfoID=.*?&caseID=.*?)"',
                html, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
def start_crawler(self):
    """Walk the paginated ASP.NET listing (posting back ``__VIEWSTATE`` /
    ``__EVENTVALIDATION`` captured from the previous page), follow each
    listing to its community detail pages, insert Comm records, and trigger
    the building crawler.

    Fix: the 楼盘明细 and 开发企业 lookups called ``.group(1)`` unguarded, so
    a page without the expected markup raised AttributeError and aborted the
    entire crawl (the ``bo_develops is None`` check was unreachable); both
    lookups are now checked and such pages are skipped.
    """
    global count
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='ount">(\d+)</span></b>页',
    )
    page = b.get_page_count()
    list_formdata = {}
    for i in range(1, int(page) + 1):
        response = requests.post(self.url, data=list_formdata, headers=self.headers)
        con = etree.HTML(response.text)
        href_list = con.xpath("//strong/a/@href")
        view_state = con.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        valid = con.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        # carry the current page's postback state into the next request
        list_formdata["__VIEWSTATE"] = view_state
        list_formdata["__EVENTVALIDATION"] = valid
        list_formdata["ctl00$ContentPlaceHolder1$PageNavigator_NewHouse1$txtNewPageIndex"] = i
        list_formdata["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$PageNavigator_NewHouse1$LnkBtnGoto"
        for href in href_list:
            new_url = self.url + href
            res = requests.get(new_url, headers=self.headers)
            comm_con = res.text
            detail_match = re.search('楼盘明细:.*?"(.*?)"', comm_con)
            if detail_match is None:
                continue  # page carries no detail link — skip it
            detail_url = self.url + detail_match.group(1)
            response = requests.get(detail_url)
            html = etree.HTML(response.text)
            comm_url_list = html.xpath("//div[@class='Search_results_box']//td/a/@href")
            for comm_url in comm_url_list:
                commurl = self.url + comm_url
                comm_res = requests.get(commurl, headers=self.headers)
                comm_con = comm_res.text
                develops_match = re.search('开发企业.*?">(.*?)</td>', comm_con, re.S | re.M)
                if develops_match is None:
                    continue  # no developer block — not a community page
                try:
                    co = Comm(co_index)
                    co.co_name = re.search('<h1>(.*?)<span>', comm_con, re.S | re.M).group(1)
                    co.co_develops = develops_match.group(1)
                    co.co_id = re.search('MID=(\d+)', comm_url).group(1)
                    co.co_use = re.search('规划用途.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_address = re.search('项目坐落.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_build_start_time = re.search('开工时间.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_build_end_time = re.search('竣工时间.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_size = re.search('土地面积.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_build_size = re.search('建筑面积.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.co_all_house = re.findall('#eff6ff">(.*?)</td>', comm_con, re.S | re.M)[0]
                    co.area = re.search('所属区域.*?">(.*?)</td>', comm_con, re.S | re.M).group(1)
                    co.insert_db()
                    count += 1
                    print(count)
                    print(co.co_name)
                except Exception:
                    continue  # any parse failure: skip this community
                self.build_crawler(co.co_id, co.co_name, comm_con)