def start_crawler(self):
    """Crawl the start page's community links; persist each community, then its buildings."""
    res = requests.get(self.start_url, headers=self.headers)
    html = etree.HTML(res.text)
    comm_url_list = html.xpath("//div[@class='post']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.ggsfcw.com/' + comm_url
            comm_res = requests.get(url, headers=self.headers)
            com_html = etree.HTML(comm_res.text)
            comm = Comm(co_index)
            comm.co_name = re.search('<h3.*?">(.*?)</', comm_res.text).group(1)
            comm.co_id = re.search('n=(\d+)', comm_res.text).group(1)
            comm.co_address = re.search('地址.*?">(.*?)</', comm_res.text).group(1)
            comm.area = re.search('区县.*?">(.*?)</', comm_res.text).group(1)
            comm.co_develops = re.search('开发商.*?">(.*?)</', comm_res.text).group(1)
            comm.co_use = re.search('规划用途.*?">(.*?)</', comm_res.text).group(1)
            comm.insert_db()
        except Exception as e:
            # NOTE(review): stdlib logging would need a %s placeholder for the
            # extra arg — presumably `log` is a project wrapper; confirm.
            log.error("小区信息错误", e)
            continue
        # Only reached when the try body completed, so com_html/comm are bound.
        bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
        self.build_info(bu_list, comm.co_id)
def co_parse(self, url_list):
    """Parse each community link element, persist the community, then walk its buildings.

    Fix: the bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
    narrowed to ``except Exception`` (same skip-on-failure behavior otherwise).
    """
    for url in url_list:
        try:
            co_url = url.xpath("./@href")[0]
            new_url = "http://tmsf.qzfdcgl.com" + co_url
            co_res = requests.get(new_url, headers=self.headers)
            con = co_res.text
            co = Comm(co_index)
            co.co_id = re.search('property_(.*?)_info', co_url).group(1)
            co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
            co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
            co.co_address = re.search('物业地址:</span>(.*?)</p', con, re.S | re.M).group(1)
            co.area = re.search('所属城区:</span>(.*)', con).group(1)
            co.insert_db()
            sid = re.search('property_(\d+)_', co_url).group(1)
            propertyid = re.search('(\d+)_info', co_url).group(1)
            # The price page of the same property lists the buildings.
            bu_url = new_url.replace('info', 'price')
            res = requests.get(bu_url, headers=self.headers)
            bu_html = etree.HTML(res.text)
            bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
        except Exception:
            # Any fetch/parse failure skips this community.
            continue
        for bu_ in bu_idlist[1:]:  # skip the first <a> (original behavior)
            id = bu_.xpath("./@id")[0]
            bu_id = re.search('.*?(\d+)', id).group(1)
            bu = Building(co_index)
            bu.bu_id = bu_id
            bu.co_id = co.co_id
            bu.bu_num = bu_.xpath("./text()")[0]
            bu.insert_db()
            self.house_parse(bu_id, co.co_id, sid, propertyid)
def get_comm_info(self, comm_url_list):
    """Fetch each community detail page (GBK), persist it, then its buildings."""
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://221.2.144.162:8090/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.content.decode('gbk')
            comm.co_id = re.search('id=(\d+)', i).group(1)
            comm.co_name = re.findall('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('开 发 商:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall(
                '城 区:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_type = re.findall('物业类型:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('物业位置:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('建筑面积:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            # Building rows: link list and pre-sale-certificate list are
            # extracted separately and assumed parallel — confirm downstream.
            build_url_list = re.findall("height=20.*?<a href=(.*?) ", html, re.S | re.M)
            bu_pre_sale_list = re.findall("height=20.*?<Td>(.*?)<", html, re.S | re.M)
            self.get_build_info(build_url_list, bu_pre_sale_list, comm.co_name, comm.co_id)
        except Exception as e:
            print("co_index={},小区信息错误".format(co_index), e)
def get_comm_info(self, all_url_list):
    """Declare the regex extraction rules for a listing page and run the producer."""
    try:
        record = Comm(co_index)
        # Each attribute holds the regex ProducerListUrl uses to pull that field.
        record.co_name = "class='newtopleft font-k'>(.*?)</li>"
        record.co_id = 'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
        record.co_address = "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
        record.area = "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_develops = "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
        record.co_volumetric = "容积率:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_green = "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_all_house = "总户数:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_open_time = "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_land_use = "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_plan_pro = "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
        record.co_build_size = "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
        producer = ProducerListUrl(page_url=all_url_list,
                                   request_type='get',
                                   encode='utf-8',
                                   analyzer_rules_dict=record.to_dict(),
                                   analyzer_type='regex',
                                   headers=self.headers)
        producer.get_details()
        global count
        count += 1
        print(count)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
def get_comm_detail(self, comm_list):
    """Fetch each project page, store the community, then parse its buildings."""
    field_patterns = (
        ('co_name', 'PROJECT_XMMC">(.*?)<'),
        ('co_develops', 'PROJECT_KFQY_NAME">(.*?)<'),
        ('co_address', 'PROJECT_XMDZ">(.*?)<'),
        ('area', 'PROJECT_SZQY">(.*?)<'),
        ('co_pre_sale', 'YSXKZH">(.*?)<'),
    )
    for suffix in comm_list:
        try:
            record = Comm(co_index)
            page_url = 'http://house.bffdc.gov.cn/public/project/' + suffix
            html = requests.get(page_url).text
            for attr, pattern in field_patterns:
                setattr(record, attr, re.search(pattern, html, re.S | re.M).group(1))
            record.insert_db()
            # The hidden "buildInfo" input packs building URLs separated by ';;'.
            packed = re.search('id="buildInfo".*?value="(.*?)"', html,
                               re.S | re.M).group(1)
            self.get_build_info(packed.split(';;'), record.co_name)
            global count
            count += 1
            print(count)
        except Exception as e:
            print(e)
def get_comm_info(self, comm_list):
    """Fetch each project page, persist the community record, then its buildings."""
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.ytfcjy.com/public/project/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
            comm.co_id = re.findall('ProjectInfo.aspx\?code=(.*?)&', html, re.S | re.M)[0]
            comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
            comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
            comm.co_all_house = re.findall('YSZTS">(.*?)<', html, re.S | re.M)[0]
            # Certificate numbers sit in hidden inputs; the value appears to be
            # ',,'-delimited with the number in the trailing field — confirm.
            comm.co_plan_pro = re.findall('id="ghxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_work_pro = re.findall('id="sgxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.co_land_use = re.findall('id="tdzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
            comm.insert_db()
            global count
            count += 1
            print(count)
            build_url_list = re.findall('id="buildInfo" value="(.*?)"', html, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print(e)
def get_comm_info(self, comm_url):
    """Scrape one community info page (GBK, fetched via proxy) and persist it.

    NOTE(review): no try/except here — a regex miss propagates to the caller.
    """
    comm = Comm(co_index)
    # The building-detail URL maps onto the community-info page.
    comm_url = comm_url.replace('buildingdetail', 'buildinfo')
    response = self.request_proxy(comm_url, headers=self.headers)
    html = response.content.decode('gbk')
    comm.co_name = re.search('class="sf_xq_xmmc">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.area = re.search('id="Label_CityArea">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_pre_sale_date = re.search('class="sf_xq_jfsj">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_build_type = re.search('id="lbl_JZJG".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_address = re.search('id="Label_ProjectAdress">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_pre_sale = re.search('id="Label_SallPreDocuments">(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_all_house = re.search('id="lbl_ZTS".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_build_size = re.search('id="lbl_JZMJ".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_all_size = re.search('id="lbl_ZDMJ".*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_develops = re.search('id="Label_DevName">.*?>(.*?)<', html, re.S | re.M).group(1).strip()
    comm.co_id = re.search('action=.*?buildingid=(.*?)"', html, re.S | re.M).group(1).strip()
    comm.insert_db()
    buildingid = re.search('buildingid=(.*?)$', comm_url, re.S | re.M).group(1)
    self.get_build_info(buildingid, comm.co_id)
def baiyin_start(self):
    """Iterate all paginated listing pages and dispatch each community row.

    Fix: the inner row loop reused ``i``, shadowing the outer page index;
    the loop variables are now distinct (``page_no`` / ``row``).
    """
    page = self.get_all_page()
    print(page)
    for page_no in range(1, int(page) + 1):
        res = requests.get(self.url + '?page=' + str(page_no), headers=self.headers)
        html = res.content.decode('gbk')
        tree = etree.HTML(html)
        community_list = tree.xpath('//tr[@align="center"]')
        for row in community_list[1:]:  # first row is the table header
            try:
                comm = Comm(self.CO_INDEX)
                href = row.xpath('td/a/@href')
                area = row.xpath('td[1]/text()')
                area = area[0] if area else None
                href = href[0]  # IndexError on a link-less row is handled below
                comm.area = area
                self.get_comm_detail(href, comm)
            except Exception as e:
                href = row.xpath('td/a/@href')
                if not href:
                    continue
                comm_url = self.URL_FRONT + href[0]
                print('小区错误:', comm_url)
                print(e)
def get_comm_info(self, comm_url_list):
    """POST each project id to the detail endpoint and persist the community.

    Fix: ``data`` is now bound before anything that can raise — the except
    handler formats ``data`` into its message, and previously a failure in
    ``Comm(co_index)`` would have raised NameError inside the handler.
    """
    for i in comm_url_list:
        data = {'projectID': i}
        try:
            comm = Comm(co_index)
            comm_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhProjectInfo.jsp'
            response = requests.post(url=comm_url, data=data, headers=self.headers)
            html = response.text
            comm.co_id = i
            comm.co_name = re.findall('bszn_title">(.*?)<', html, re.S | re.M)[0].strip()
            comm.area = re.findall('所在区县:.*?<span>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_address = re.findall('项目地址:.*?<span>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_develops = re.findall('企业名称:.*?<a.*?>(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_all_house = re.findall(
                '<td>总套数.*?<td class="xxxx_list3">(.*?)<', html, re.S | re.M)[0].strip()
            comm.co_build_size = re.findall(
                '<td>总面积.*?<td class="xxxx_list3">(.*?)<', html, re.S | re.M)[0].strip()
            comm.insert_db()
            build_logo_list = re.findall(
                'javascript:getBuilingList\("(.*?)"', html, re.S | re.M)
            self.get_build_info(build_logo_list, i)
        except Exception as e:
            print('青岛小区问题,url post data is:={}'.format(data), e)
def comm_parse(self, url_list, region):
    """Persist each community detail page, then parse its building links.

    NOTE(review): no try/except — one malformed page aborts the remaining
    URLs; confirm the caller expects that.
    """
    for co_url in url_list:
        comm_url = "http://110.89.45.7:8082" + co_url
        comm_res = requests.get(comm_url, headers=self.headers)
        con = comm_res.text
        co = Comm(co_index)
        co.co_id = re.search('ProjectId=(.*)', co_url).group(1)
        co.co_name = re.search('项目名称.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_develops = re.search('公司名称.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_address = re.search('项目坐落.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_use = re.search('规划用途.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_build_size = re.search('建筑面积.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.area = region
        co.co_residential_size = re.search(
            '批准销售.*?">.*?</td.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.co_pre_sale = re.search('预售许可证.*?">(.*?)</td', con, re.S | re.M).group(1)
        co.insert_db()
        co_html = etree.HTML(comm_res.text)
        bu_urllist = co_html.xpath("//span/a/@href")
        self.bu_parse(co.co_id, bu_urllist)
def get_comm_info(self, all_comm_url):
    """Scrape each community page, persist it, then hand its building links on.

    Fix: ``re.findall`` already returns a list, so the manual loop that copied
    each match into ``build_url_list`` was removed.
    """
    for i in all_comm_url:
        try:
            comm = Comm(co_index)
            comm_url = 'http://gold.ncfdc.com.cn/' + i
            res = requests.get(comm_url, headers=self.headers)
            comm.co_name = re.search('ctl15_proname">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_address = re.search('ctl20_ADDRESS">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_develops = re.search('ctl20_developer_name">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_build_size = re.search('ctl20_build_area">(.*?)<', res.text, re.S | re.M).group(1)
            comm.area = re.search('ctl20_region_name">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_type = re.search('ctl20_PropertyType">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_green = re.search('ctl20_VIRESCENCE">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_volumetric = re.search('ctl20_PLAT_RATIO">(.*?)<', res.text, re.S | re.M).group(1)
            comm.co_id = re.search('name="form1.*?hrefID=(.*?)"', res.text, re.S | re.M).group(1)
            comm.insert_db()
            build_url_list = re.findall('doc_nav_LD" href="(.*?)"', res.text, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print('小区错误,co_index={}, url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url_list):
    """Register regex extraction rules per community page and run the producer."""
    for list_url in comm_url_list:
        try:
            record = Comm(co_index)
            # Each attribute holds the regex the producer applies to the page.
            record.co_id = '楼盘首页.*?aid-(.*?)/'
            record.co_name = 'class="ls">(.*?)<'
            record.co_type = '物业类型</em>(.*?)<'
            record.area = '区域所属:</em>(.*?)<'
            record.co_green = '绿 化 率:</em>(.*?)<'
            record.co_volumetric = '容 积 率:</em>(.*?)<'
            record.co_build_type = '楼 层:</em>(.*?)<'
            record.co_size = '占地面积:</em>(.*?)<'
            record.co_build_size = '建筑面积:</em>(.*?)<'
            record.co_develops = '开 发 商:</em><.*?target="_blank">(.*?)<'
            record.co_address = '项目地址:</em>(.*?)<'
            producer = ProducerListUrl(
                page_url=list_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=record.to_dict(),
                current_url_rule='colspan="3" align="right"><a href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            more_build_url = producer.get_details()
            self.get_build_info(more_build_url)
        except Exception as e:
            print(e)
def get_comm_detail(self, comm_url):
    """Scrape one community page, persist it, then follow its home page for buildings.

    NOTE(review): no try/except — a regex miss propagates to the caller.
    """
    comm = Comm(co_index)
    co_url = 'http://tz.tmsf.com' + comm_url
    response = requests.get(co_url, headers=self.headers)
    html = response.content.decode('utf-8')
    comm.co_name = re.search('<span class="buidname colordg">(.*?)<', html, re.S | re.M).group(1)
    comm.co_address = re.search('楼盘地址:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
    # District is embedded in square brackets inside the address, when present.
    if '[' in comm.co_address:
        comm.area = re.search('\[(.*?)\]', comm.co_address, re.S | re.M).group(1)
    comm.co_type = re.search('物业类型:.*?<span title="(.*?)"', html, re.S | re.M).group(1)
    comm.co_open_time = re.search('最新开盘:</strong>(.*?)<', html, re.S | re.M).group(1)
    comm.co_develops = re.search('项目公司:</strong>(.*?)<', html, re.S | re.M).group(1)
    comm.co_build_type = re.search('建筑形式:</strong>(.*?)<', html, re.S | re.M).group(1)
    comm.co_id = re.search('id="propertyid".*?value="(.*?)"', html, re.S | re.M).group(1)
    comm.insert_db()
    sid = re.search('id="sid" name="sid" value="(.*?)"', html, re.S | re.M).group(1)
    build_url = re.search('id="index_bar">楼盘主页.*?href="(.*?)"', html, re.S | re.M).group(1)
    self.get_build_info(build_url, comm.co_id, sid)
def comm_info(self, comm_url_list):
    """Persist each community (GBK pages), then walk its pre-sale listing.

    Fix: the except branch now ``continue``s — previously a failed iteration
    fell through to the code below and hit a NameError on ``co`` /
    ``project_name`` (or reused the previous iteration's values).
    """
    for comm_url in comm_url_list:
        try:
            co_url = 'http://222.77.178.63:7002/' + comm_url
            co_res = requests.get(co_url, headers=self.headers)
            con = co_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('projectID=(.*)', comm_url).group(1)
            co.co_name = re.search('项目名称:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.area = re.search('所在区县:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('>总套数.*?">(\d+)<', con, re.S | re.M).group(1)
            co.co_all_size = re.search('>总面积.*?">(.*?)<', con, re.S | re.M).group(1)
            # URL-encode the name for the pre-sale query string below.
            project_name = parse.quote(co.co_name)
            co.insert_db()
        except Exception as e:
            print('小区信息错误{}'.format(e))
            continue
        sale_url = "http://222.77.178.63:7002/Presell.asp?projectID=" + co.co_id + "&projectname=" + project_name
        res = requests.get(sale_url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        temp_url_list = html.xpath("//a/@href")
        self.build_info(co.co_id, temp_url_list)
def get_comm_detail(self, comm_detail_url):
    """Scrape one community detail page (GBK) and persist the record."""
    comm_url = 'http://www.kmhouse.org' + comm_detail_url
    try:
        record = Comm(co_index)
        response = requests.get(comm_url, headers=self.headers)
        page = response.content.decode('gbk')
        # The project id lives in the URL; everything else is on the page.
        record.co_id = re.search('Preid=(.*?)&', comm_detail_url).group(1)
        record.co_name = re.search('楼盘名称.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.area = re.search('所在地区.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.co_address = re.search('楼盘地址.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.co_pre_sale = re.search('预售证号.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.co_volumetric = re.search('容 积 率.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.co_green = re.search('绿 化 率.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.co_build_start_time = re.search('开工时间.*?<td.*?>(.*?)<', page, re.S | re.M).group(1)
        record.insert_db()
        global count
        count += 1
        print('count:', count)
    except Exception as e:
        print('小区详情错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url_list):
    """Scrape each community URL, persist the record, then its building info."""
    for i in comm_url_list:
        try:
            response = requests.get(i, headers=self.headers)
            html = response.text
            comm = Comm(co_index)
            comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
            comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
            comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
            comm.co_id = re.findall('PROJECT_XMBH">(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            global count
            count += 1
            print(count)
            # The hidden "buildInfo" input carries the building payload.
            bu_info = re.search('id="buildInfo".*?value="(.*?)"', html, re.S | re.M).group(1)
            self.get_build_info(bu_info, comm.co_id, i)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, i), e)
def get_comm_info(self, comm_url_list):
    """Declare regex rules for every community detail page and run the producer."""
    for list_url in comm_url_list:
        try:
            record = Comm(co_index)
            # The detail page is the same path with 'view' swapped for 'detail'.
            detail_url = list_url.replace('view', 'detail')
            record.co_type = '物业类型:.*?<dd>(.*?)<'
            record.area = '区域所属:.*?<dd>(.*?)<'
            record.co_build_size = '建筑面积:.*?<dd>(.*?)<'
            record.co_size = '占地面积:.*?<dd>(.*?)<'
            record.co_green = '绿化率:.*?<dd><.*?>(.*?)<'
            record.co_build_type = '楼 层:.*?<dd>(.*?)<'
            record.co_volumetric = '容积率:.*?<dd><.*?>(.*?)<'
            record.co_id = '楼盘首页.*?newhouse/.*?/(.*?)/'
            record.co_name = '<h1 class="title">(.*?)<'
            record.co_address = '楼盘地址:.*?<dd>(.*?)<'
            record.co_develops = '开发商:.*?<dd(.*?)<'
            producer = ProducerListUrl(page_url=detail_url,
                                       request_type='get',
                                       encode='gbk',
                                       analyzer_rules_dict=record.to_dict(),
                                       analyzer_type='regex',
                                       headers=self.headers)
            producer.get_details()
        except Exception as e:
            print(e)
def start_crawler(self):
    """Page through the pre-sale query API (pages 1..477) and store each project."""
    for page in range(1, 478):
        payload = {
            "method": "GetYszData",
            "page": str(page),
            "ysxkz": '',
            "kfs": '',
            "lpmc": '',
        }
        res = requests.post(self.start_url, headers=self.headers, data=payload)
        # The endpoint returns a JSON-encoded string, hence the second decode.
        comm = json.loads(res.json())
        for detail in comm['Rows']:
            co = Comm(co_index)
            co.co_name = detail['PRJNAME']
            co.co_pre_sale = detail['PRENUM']
            co.area = detail['CZAREA']
            co.co_pre_sale_date = detail['PresaleCertificateDate']
            co.co_address = detail['BSIT']
            co.co_develops = detail['NAME']
            co.co_build_size = detail['YSROOMBAREA']
            co.co_all_house = detail['YSROOMNUMS']
            co.insert_db()
def start_crawler(self):
    """Walk every region's paginated listing and persist each community row.

    NOTE(review): no per-row try/except — one malformed row stops the crawl.
    """
    for region in self.region.items():
        region_code = region[0]
        region_name = region[1]
        url = self.start_url + region_code + '.html'
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='共(\d+)页>',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            new_url = url + "?page=" + str(i)
            res = requests.get(new_url, headers=self.headers)
            html = etree.HTML(res.text)
            co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
            for co in co_list:
                comm = Comm(co_index)
                co_url = co.xpath("./dt/h4/a/@href")[0]
                comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                comm.co_address = co.xpath(".//address/text()")[0]
                # First digit run in the detail URL serves as the id.
                comm.co_id = re.search('\d+', co_url).group(0)
                comm.co_develops = co.xpath(
                    "./dd[@class='dev']/a/text()")[0]
                comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                comm.area = region_name
                comm.insert_db()
                detail_url = "http://www.zstmsf.com" + co_url
                self.bu_parse(detail_url, comm.co_id)
def get_comm_detail(self, comm_list):
    """Scrape each community page (via the shared session), persist it, then buildings."""
    for i in comm_list:
        comm_url = 'http://www.yzfdc.cn/' + i
        try:
            comm = Comm(co_index)
            content = self.s.get(comm_url, headers=self.headers)
            html = content.text
            comm.co_name = re.search('class="zxlp_08".*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_id = re.search(
                'class="zxlp_08" href=.*?ProjectId=(.*?)"', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开 发 商:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_type = re.search('项目类型:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('所属区位:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('建筑面积:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_open_time = re.search('开盘日期:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_handed_time = re.search('交付日期:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项目具体地址:.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.insert_db()
            build_url = re.search(
                '(/BuildingDish_Publicity.aspx\?Projectid=.*?)"', html,
                re.S | re.M).group(1)
            self.get_build_info(build_url, comm.co_id)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_id_list):
    """Scrape each project (by id), persist it, then its buildings and houses."""
    for i in comm_id_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            comm.co_id = i
            comm.insert_db()
            bu_html = re.search(
                '<table class="table table-bordered itemInfoDetail.*?</table>',
                html, re.S | re.M).group()
            # First <tr> is the header row, hence the [1:] slice.
            build_info_list = re.findall('<tr>.*?</tr>', bu_html, re.S | re.M)[1:]
            # NOTE(review): this inner loop reuses `i`, shadowing the project id.
            for i in build_info_list:
                try:
                    build = Building(co_index)
                    build.bu_num = re.search('<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_all_house = re.search(
                        '<td>.*?<td>.*?<td>(.*?)<', i, re.S | re.M).group(1)
                    build.bu_id = re.search('buildId=(.*?)&', i, re.S | re.M).group(1)
                    build.co_id = comm.co_id
                    build.insert_db()
                    # NOTE(review): searches bu_html (the whole table), not the
                    # current row `i` — every building gets the table's first
                    # link; confirm this is intended.
                    house_url = re.search('<a href="(.*?)"', bu_html, re.S | re.M).group(1)
                    response = requests.get(house_url, headers=self.headers)
                    html = response.text
                    house_url_list = re.findall(
                        '<td width="110">.*?<a.*?href="(.*?)"', html, re.S | re.M)
                    self.get_house_info(house_url_list, build.bu_id, comm.co_id)
                except Exception as e:
                    print(
                        '楼栋错误,co_index={},url={}'.format(
                            co_index, house_url), e)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def analyzer_comm_url(self, comm_url_list):
    """Scrape each community page (GBK), persist it and its buildings.

    Returns the accumulated list of house-detail URLs found across all pages.

    NOTE(review): the record is built with ``self.co_index`` but the error
    messages format the module-level ``co_index`` — confirm both exist and
    agree.
    """
    all_url = []
    for i in comm_url_list:
        try:
            res = requests.get(i)
            html = res.content.decode('gbk')
            c = Comm(self.co_index)
            c.co_name = re.search('项目名称:.*?">.*?<span.*?>(.*?)</span>', html,
                                  re.S | re.M).group(1)  # project name
            c.co_address = re.search('项目地址:.*?">.*?<span.*?>(.*?)</span>', html,
                                     re.S | re.M).group(1)  # project address
            c.co_develops = re.search('开发商:.*?">.*?<span.*?>(.*?)</span>', html,
                                      re.S | re.M).group(1)  # developer
            c.co_build_size = re.search(
                '总建筑面积:.*?">.*?<span.*?>(.*?)</span>', html,
                re.S | re.M).group(1)  # total floor area
            c.co_land_type = re.search(
                '用地依据:.*?">.*?<span.*?>(.*?)</span>', html,
                re.S | re.M).group(1)  # land-use basis
            c.co_all_house = re.search(
                '>总套数:.*?">.*?<span.*?>(.*?)</span>', html,
                re.S | re.M).group(1)  # total units
            c.area = re.search('所在区域:.*?">.*?<span.*?>(.*?)</span>', html,
                               re.S | re.M).group(1)  # district
            c.co_work_pro = re.search(
                '施工许可证:.*?">.*?<span.*?>(.*?)</span>', html,
                re.S | re.M).group(1)  # construction permit
            c.co_plan_pro = re.search(
                '建设工程规划许可证:.*?">.*?<span.*?>(.*?)</span>', html,
                re.S | re.M).group(1)  # planning permit
            c.insert_db()
            buildlist = re.findall('onmouseover.*?</TR>', html, re.S | re.M)
            url_list = []
            for k in buildlist:
                try:
                    b = Building(self.co_index)
                    build_list = re.findall('<TD.*?>(.*?)</TD>', k, re.S | re.M)
                    b.co_name = build_list[1]
                    b.bu_num = build_list[2]
                    b.bu_type = build_list[4]
                    b.insert_db()
                    house_url = re.findall('href="(.*?)"', k, re.S | re.M)
                    for j in house_url:
                        url_list.append(
                            'http://www.stfcj.gov.cn/stsite/ProjectList/' + j)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
            all_url = all_url + url_list
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, i), e)
    return all_url
def comm_parse(self, co_name, co_addr, co_area, co_url):
    """Persist one community, then page through its pre-sale and sale tables.

    Fix: the sale ("现售") pagination now restarts at page 1 — previously it
    reused the page counter left over from the pre-sale loop, so the first
    sale pages were silently skipped.
    """
    co_res = requests.get(co_url, headers=self.headers)
    co_res.encoding = 'gbk'
    con = co_res.text
    co = Comm(co_index)
    if re.search('开发商名称.*?;">(.*?)</', con, re.S | re.M):
        co.co_develops = re.search('开发商名称.*?;">(.*?)</', con, re.S | re.M).group(1)
    else:
        co.co_develops = None
    kfsid = re.search('kfsid=(\d+)', co_url).group(1)
    # Composite key: project name + developer id.
    co.co_id = co_name + kfsid
    co.co_name = co_name
    co.co_address = co_addr
    co.area = co_area
    co.co_all_house = re.search('总套数.*?">(\d+) ', con, re.S | re.M).group(1)
    co.co_all_size = re.search('总面积.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_residential_size = re.search('住宅面积.*?">(.*?) ', con, re.S | re.M).group(1)
    co.insert_db()
    num = 1
    while True:  # pre-sale ("预售") pagination
        pre_url = co_url + "&ypage=" + str(num)
        pre_res = requests.get(pre_url, headers=self.headers)
        pre_con = pre_res.content.decode('gbk')
        pre_html = etree.HTML(pre_con)
        if pre_html.xpath("//table[@id='preselltable1']//tr[@bgcolor='white']"):
            pre_list = pre_html.xpath("//table[@id='preselltable1']//tr[@bgcolor='white']")
            num += 1
            for pre in pre_list:
                bu_url = pre.xpath("./td[4]/a/@href")[0]
                if 'user_Presell' in bu_url:
                    self.bu_parse(bu_url, co.co_id, co_url)
                else:
                    continue
        else:
            break
    num = 1  # FIX: restart paging for the sale table
    while True:  # current-sale ("现售") pagination
        sell_url = co_url + "&page=" + str(num)
        sell_res = requests.get(sell_url, headers=self.headers)
        sell_con = sell_res.content.decode('gbk')
        sell_html = etree.HTML(sell_con)
        if sell_html.xpath("//table[@id='selltable1']//tr[@bgcolor='white']"):
            sell_list = sell_html.xpath("//table[@id='selltable1']//tr[@bgcolor='white']")
            num += 1
            for sell in sell_list:
                ho_url = sell.xpath("./td/a/@href")[0]
                if 'user_sell' in ho_url:
                    bu_id = re.search('ID=(.*?)&', ho_url).group(1)
                    self.house_parse(ho_url, co.co_id, bu_id)
                else:
                    continue
        else:
            break
def comm_parse(self, comm_res, co_id):
    """Map the project-detail JSON payload ("jbxx" section) onto a Comm record."""
    payload = json.loads(comm_res.content.decode())
    info = payload["jbxx"]
    record = Comm(co_index)
    record.co_id = co_id
    record.co_name = info["XMMC"]
    record.co_address = info["XMDZ"]
    record.co_develops = info["FDCKFQYMC"]
    record.area = info["QXH_MC"]
    record.co_build_size = info["ZJZMJ"]
    record.co_size = info["ZDMJ"]
    record.insert_db()
def get_comm_detail(self, detail_url, area):
    """Scrape one pre-sale detail page, persist the community and its buildings."""
    try:
        comm = Comm(co_index)
        comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
        response = requests.get(comm_detail_url, headers=self.headers)
        html = response.text
        comm.co_develops = re.search('id="kfsmc".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_name = re.search('id="PresellName".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_address = re.search('id="HouseRepose".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_build_size = re.search('id="PresellArea".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_all_house = re.search('id="djrqtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_land_use = re.search('id="landinfo".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_type = re.search('id="zczjtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale = re.search('id="bookid".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale_date = re.search('id="FZDatebegin".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_open_time = re.search('id="kpdate".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_id = re.search('FD=(.*?)&', detail_url, re.S | re.M).group(1)
        comm.area = area
        comm.insert_db()
        # Building rows live in the "donglist" table of the same page.
        build_html = re.search('id="donglist".*?</table>', html, re.S | re.M).group()
        build_info_list = re.findall('<tr.*?</tr>', build_html, re.S | re.M)
        for i in build_info_list:
            build = Building(co_index)
            build.co_id = comm.co_id
            build.bu_address = re.search('<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_num = re.search('<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_floor = re.search('<td.*?<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            house_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            build.bu_id = re.search("LID=(.*?)$", house_url, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(house_url, comm.co_id, build.bu_id)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
def comm(self, tag):
    """Build and persist a Comm record from one listing row.

    Returns (building-list URL, community id) for the caller to follow.
    """
    record = Comm(co_index)
    record.co_name = tag.xpath("./td[@width='143']/a/text()")[0]
    record.area = tag.xpath("./td[@width='184']/text()")[0]
    record.co_develops = tag.xpath("./td[@width='192']/text()")[0]
    href = tag.xpath("./td/a/@href")[0]
    record.co_id = re.search('mmcid=(\d+)&', href).group(1)
    record.co_open_time = tag.xpath("./td[@width='95']/text()")[0]
    buid_all_url = "http://www.syfc.com.cn" + href
    record.insert_db()
    global count
    count += 1
    print(count)
    return buid_all_url, record.co_id
def get_comm_info(self, comm_url_list):
    """Scrape each community detail page, persist it, then its building links."""
    for comm_url in comm_url_list:
        comm_detail = "http://xx.yyfdcw.com" + comm_url
        try:
            comm_res = requests.get(comm_detail, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        # NOTE(review): a regex miss below propagates — only the fetch is guarded.
        con = comm_res.text
        comm = Comm(co_index)
        comm.co_id = re.search('ID=(\d+)', con).group(1)
        comm.co_name = re.search('lpname">.*?<h2>(.*?)</h2', con, re.S | re.M).group(1)
        comm.co_develops = re.search('开发商:.*?Kfs">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_green = re.search('绿化率:.*?Lhl">(.*?)</span', con, re.S | re.M).group(1)
        comm.area = re.search('区域:.*?Name">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_address = re.search('位置:</b>(.*?)</li', con, re.S | re.M).group(1)
        comm.co_build_size = re.search('建筑面积:.*?l5">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_all_house = re.search('总户数:.*?hs">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_plan_useland = re.search('用地.*?l4">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_plan_project = re.search('工程.*?l3">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_build_type = re.search('楼盘类型.*?Type">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_all_size = re.search('占地面积.*?mianji">(.*?)</span', con, re.S | re.M).group(1)
        comm.co_land_use = re.search('使用权证.*?l1">(.*?)</span', con, re.S | re.M).group(1)
        comm.insert_db()
        try:
            build_list = re.findall(
                '<td align="center">.*?<a href="(.*?)"', con, re.S | re.M)
            if len(build_list) > 0:
                self.get_build_info(build_list, comm.co_id)
            else:
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id))
                continue
        except:
            print("co_index={},小区co_id={}没有楼栋".format(
                co_index, comm.co_id))
            continue
def comm_parse(self, url_list):
    """Worker loop: pull (url, area, type) tuples from the queue and persist each.

    Fixes: a fresh Comm record is now created per queue item (one shared
    instance was previously mutated across iterations); the local that
    shadowed the ``type`` builtin was renamed; the bare ``except:`` around the
    optional pre-sale field was narrowed to ``except Exception``.
    """
    while True:
        url, area, co_type = url_list.get()
        try:
            res = requests.get(url, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        con = res.text
        co = Comm(co_index)
        co.area = area
        co.co_type = co_type
        co.co_id = re.search('id=(\d+)', url).group(1)
        co.co_develops = re.search('企业名称.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_name = re.search('项目名称.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_address = re.search('项目座落.*?> (.*?)<', con, re.S | re.M).group(1)
        co.co_use = re.search('房屋用途.*?> (.*?)<', con, re.S | re.M).group(1)
        try:
            co.co_pre_sale = re.search('许可证号.*?> (.*?)<', con, re.S | re.M).group(1)
        except Exception:
            co.co_pre_sale = None  # permit number is optional on the page
        # Second request: the project-info endpoint carries the size/date fields.
        new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
        a_res = requests.get(new_url, headers=self.headers)
        a_con = a_res.text
        co.co_build_size = re.search('建筑面积.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_all_house = re.search('销售套数.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_green = re.search('绿化率.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_build_start_time = re.search('开工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_build_end_time = re.search('竣工日期.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.co_volumetric = re.search('容积率.*?> (.*?)<', a_con, re.S | re.M).group(1)
        co.insert_db()
        global count
        count += 1
        print(count)
        try:
            self.build_parse(co.co_id)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def get_comm_info(self, comm_list):
    """Scrape each project page (by code), aggregate unit/size totals, persist."""
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/ProjectInfo.aspx?code=' + str(
                i)
            response = requests.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_name = re.search('id="PROJECT_XMMC1">(.*?)<', html, re.S | re.M).group(1)
            comm.co_address = re.search('id="PROJECT_XMDZ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_develops = re.search('id="PROJECT_KFQY_NAME1">(.*?)<', html, re.S | re.M).group(1)
            comm.area = re.search('id="PROJECT_SZQY">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_size = re.search('id="PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_volumetric = re.search('id="PROJECT_RJL">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_start_time = re.search(
                'id="PROJECT_JHKGRQ">(.*?)<', html, re.S | re.M).group(1)
            comm.co_build_end_time = re.search(
                'id="PROJECT_JHJGRQ">(.*?)<', html, re.S | re.M).group(1)
            # lbYs*/lbWs* appear to be sold/unsold residential and commercial
            # counters — total units is their sum; TODO confirm the semantics.
            house_all = re.search('id="lbYsZZTs">(.*?)<', html, re.S | re.M).group(1)
            house_all_a = re.search('id="lbWsZZTs">(.*?)<', html, re.S | re.M).group(1)
            bus_all = re.search('id="lbWsSYTs">(.*?)<', html, re.S | re.M).group(1)
            bus_all_a = re.search('id="lbYsSYTs">(.*?)<', html, re.S | re.M).group(1)
            comm.co_all_house = int(house_all_a) + int(house_all) + int(
                bus_all) + int(bus_all_a)
            # Same breakdown for floor areas, summed to the project size.
            area_size_a = re.search('id="lbYsZZMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_b = re.search('id="lbWsZZMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_c = re.search('id="lbWsSYMj">(.*?)<', html, re.S | re.M).group(1)
            area_size_d = re.search('id="lbYsSYMj">(.*?)<', html, re.S | re.M).group(1)
            comm.co_size = float(area_size_a) + float(area_size_b) + float(
                area_size_c) + float(area_size_d)
            comm.co_id = str(i)
            comm.insert_db()
            self.get_build_info(comm.co_id)
        except Exception as e:
            print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, co_url_list):
    """Scrape each pre-sale project page (throttled, shared session), persist it.

    The parallel bu_* lists are extracted positionally from the building table
    and handed to get_build_info together — assumed to be index-aligned.
    """
    for i in co_url_list:
        comm = Comm(co_index)
        comm_url = 'http://183.63.60.194:8808/public/web/ysxm?ysxmid=' + i
        try:
            time.sleep(1)  # throttle to avoid hammering the host
            response = self.s.get(comm_url, headers=self.headers)
            html = response.text
            comm.co_id = re.search('ysxmid=(.*?)$', comm_url).group(1)
            comm.co_develops = re.findall('kfsmc.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_name = re.findall('PresellName.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_address = re.findall('ItemRepose.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_build_size = re.findall('PresellArea.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_all_house = re.findall('djrqtd.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_land_use = re.findall('landinfo.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_type = re.findall('zczjtd.*?>(.*?)<', html, re.S | re.M)[0]
            comm.area = re.findall('FQ.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale_date = re.findall('FZDatebegin.*?>(.*?)<', html, re.S | re.M)[0]
            comm.co_pre_sale = re.findall('bookid.*?<a.*?>(.*?)<', html, re.S | re.M)[0]
            comm.insert_db()
            bu_address_list = re.findall(
                'onmouseout.*?center.*?center">(.*?)<', html, re.S | re.M)
            bu_num_list = re.findall(
                'onmouseout.*?center.*?center.*?center">(.*?)<', html, re.S | re.M)
            bu_floor_list = re.findall(
                'onmouseout.*?center.*?center.*?center.*?center">(.*?)<', html,
                re.S | re.M)
            bu_url_list = re.findall('onmouseout.*?href="(.*?)"', html, re.S | re.M)
            self.get_build_info(bu_address_list, bu_num_list, bu_floor_list,
                                bu_url_list, comm.co_id)
            global count
            count += 1
            print(count)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)