def start_crawler(self):
    """Crawl the community index page and persist every community found.

    Fetches ``self.start_url``, follows each link under ``div.post``,
    scrapes the community fields with regexes, stores the record, then
    hands the building anchors to ``self.build_info``.
    """
    res = requests.get(self.start_url, headers=self.headers)
    html = etree.HTML(res.text)
    comm_url_list = html.xpath("//div[@class='post']//a/@href")
    for comm_url in comm_url_list:
        try:
            url = 'http://www.ggsfcw.com/' + comm_url
            comm_res = requests.get(url, headers=self.headers)
            com_html = etree.HTML(comm_res.text)
            page = comm_res.text  # hoist: searched six times below
            comm = Comm(co_index)
            # raw strings: '\d' in a non-raw literal is an invalid escape
            comm.co_name = re.search(r'<h3.*?">(.*?)</', page).group(1)
            comm.co_id = re.search(r'n=(\d+)', page).group(1)
            comm.co_address = re.search(r'地址.*?">(.*?)</', page).group(1)
            comm.area = re.search(r'区县.*?">(.*?)</', page).group(1)
            comm.co_develops = re.search(r'开发商.*?">(.*?)</', page).group(1)
            comm.co_use = re.search(r'规划用途.*?">(.*?)</', page).group(1)
            comm.insert_db()
        except Exception as e:
            # One malformed page must not stop the whole crawl.
            log.error("小区信息错误", e)
            continue
        bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
        self.build_info(bu_list, comm.co_id)
def get_comm_detail(self, comm_list):
    """Fetch each community detail page, persist it, then crawl its builds.

    ``comm_list`` holds project-id path fragments appended to the site URL.
    Increments the module-level ``count`` as a crude progress indicator.
    """
    global count  # declared once up front instead of inside the loop body
    flags = re.S | re.M  # hoisted: identical flags on every search below
    for i in comm_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://house.bffdc.gov.cn/public/project/' + i
            response = requests.get(comm_url)
            html = response.text
            comm.co_name = re.search('PROJECT_XMMC">(.*?)<', html, flags).group(1)
            comm.co_develops = re.search('PROJECT_KFQY_NAME">(.*?)<', html, flags).group(1)
            comm.co_address = re.search('PROJECT_XMDZ">(.*?)<', html, flags).group(1)
            comm.area = re.search('PROJECT_SZQY">(.*?)<', html, flags).group(1)
            comm.co_pre_sale = re.search('YSXKZH">(.*?)<', html, flags).group(1)
            comm.insert_db()
            # Hidden input "buildInfo" packs the building URLs, ';;'-separated.
            build_info = re.search('id="buildInfo".*?value="(.*?)"', html, flags).group(1)
            build_url_list = build_info.split(';;')
            self.get_build_info(build_url_list, comm.co_name)
            count += 1
            print(count)
        except Exception as e:
            # Best-effort: log and move on to the next community.
            print(e)
def co_parse(self, url_list):
    """Parse each community anchor, persist the community and its buildings.

    ``url_list`` holds lxml anchor elements; the href encodes both the
    site id (``sid``) and the property id used later by ``house_parse``.
    """
    for url in url_list:
        try:
            co_url = url.xpath("./@href")[0]
            new_url = "http://tmsf.qzfdcgl.com" + co_url
            co_res = requests.get(new_url, headers=self.headers)
            con = co_res.text
            co = Comm(co_index)
            co.co_id = re.search(r'property_(.*?)_info', co_url).group(1)
            co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
            co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
            co.co_address = re.search('物业地址:</span>(.*?)</p', con, re.S | re.M).group(1)
            co.area = re.search('所属城区:</span>(.*)', con).group(1)
            co.insert_db()
            # raw strings: '\d' is an invalid escape in a non-raw literal
            sid = re.search(r'property_(\d+)_', co_url).group(1)
            propertyid = re.search(r'(\d+)_info', co_url).group(1)
            # The price page lists the buildings of this community.
            bu_url = new_url.replace('info', 'price')
            res = requests.get(bu_url, headers=self.headers)
            bu_html = etree.HTML(res.text)
            bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
        except Exception:
            # Skip communities whose page layout does not match the regexes.
            continue
        # First anchor is a header/"all" entry — real buildings start at [1].
        for bu_ in bu_idlist[1:]:
            anchor_id = bu_.xpath("./@id")[0]  # renamed: 'id' shadows builtin
            bu_id = re.search(r'.*?(\d+)', anchor_id).group(1)
            bu = Building(co_index)
            bu.bu_id = bu_id
            bu.co_id = co.co_id
            bu.bu_num = bu_.xpath("./text()")[0]
            bu.insert_db()
            self.house_parse(bu_id, co.co_id, sid, propertyid)
def comm_info(self, co_develops, co_pre_sale, co_name, co_pre_sale_date, sid):
    """Assemble a Comm record from pre-extracted fields and persist it."""
    record = Comm(co_index)
    record.co_id = sid
    record.co_name = co_name
    record.co_develops = co_develops
    record.co_pre_sale = co_pre_sale
    record.co_pre_sale_date = co_pre_sale_date
    record.insert_db()
def get_data_obj(self, analyzer, co_index):
    """Return the data object matching *analyzer* ('comm'/'build'/'house').

    Unknown analyzer tags yield None, mirroring the fall-through of the
    original if/elif chain.
    """
    factories = {
        'comm': Comm,
        'build': Building,
        'house': House,
    }
    factory = factories.get(analyzer)
    return factory(co_index) if factory is not None else None
def get_comm_info(self, comm_info_list):
    """Extract name, house count and total size from each table row.

    Each entry of ``comm_info_list`` is a raw ``<tr>`` HTML fragment; the
    three patterns pick out the 1st, 2nd and 3rd ``<td>`` cells in order.
    """
    # (attribute, regex) pairs — patterns identical to the original ones.
    row_fields = (
        ('co_name', '<td>(.*?)</td>'),
        ('co_all_house', '<td.*?<td>(.*?)</td>'),
        ('co_all_size', '<td.*?<td.*?<td>(.*?)</td>'),
    )
    for i in comm_info_list:
        try:
            comm = Comm(co_index)
            for attr, pattern in row_fields:
                setattr(comm, attr, re.search(pattern, i, re.S | re.M).group(1))
            comm.insert_db()
        except Exception as e:
            print('小区错误,co_index={},html_str={}'.format(co_index, i), e)
def get_comm_info(self, comm_url_list):
    # For each community id fragment, build a Comm whose attributes hold
    # REGEX PATTERNS (not extracted values): ProducerListUrl consumes
    # comm.to_dict() as its analyzer_rules_dict and applies the patterns
    # to the fetched (gbk-encoded) page itself.
    # NOTE(review): assumes Comm.to_dict() yields the attr->pattern map
    # and that ProducerListUrl fills/stores the fields — confirm against
    # those implementations.
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.fjnpfdc.com/House/' + i
            comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
            comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
            comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
            comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
            comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
            comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            comm.co_id = 'ProjectId=(.*?)&'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=comm.to_dict(),
                # current_url_rule collects the per-building detail links.
                current_url_rule="<a href='(BuildingInfo.*?)'",
                analyzer_type='regex',
                headers=self.headers)
            build_url_list = p.get_details()
            self.get_build_info(build_url_list)
        except Exception as e:
            # Best-effort: report and continue with the next community.
            print("co_index={},小区{}错误".format(co_index, i), e)
def get_comm_info(self, all_html_list):
    """Parse community rows out of each listing page and persist them.

    Each page is split into ``<tr>`` fragments; the first row is the table
    header and is skipped. Fields are located positionally by counting
    ``align="center"`` cells (patterns unchanged from the original).

    Fix: the try/except now wraps a single row instead of the whole page,
    so one malformed ``<tr>`` no longer aborts the remaining rows.
    """
    for html in all_html_list:
        comm_info_paper_list = re.findall('<tr>.*?</tr>', html, re.S | re.M)
        for i in comm_info_paper_list[1:]:
            try:
                comm = Comm(co_index)
                comm.area = re.search('align="center">(.*?)<', i, re.S | re.M).group(1)
                comm.co_name = re.search(
                    'align="center".*?align="center".*?>(.*?)<', i,
                    re.S | re.M).group(1)
                comm.co_address = re.search(
                    'align="center".*?align="center".*?align="center".*?title="(.*?)"',
                    i, re.S | re.M).group(1)
                comm.co_all_house = re.search(
                    'align="center".*?align="center".*?align="center".*?align="center".*?>(.*?)<',
                    i, re.S | re.M).group(1)
                comm.co_id = re.search('projectID=(.*?)&', i, re.S | re.M).group(1)
                comm.insert_db()
                self.get_build_info(comm.co_id)
            except Exception as e:
                print('解析错误,co_index={},方法:get_comm_info'.format(co_index), e)
def start(self):
    """Walk every listing page and dispatch each community link.

    ``count`` is a per-run progress counter printed for each community.
    """
    page = self.get_all_page()
    count = 0
    for i in range(1, int(page) + 1):
        # Built before the try so the except's format string can never
        # reference an unbound 'url'.
        url = 'http://www.czfdc.gov.cn/spf/gs.php?pageid=' + str(i)
        try:
            response = requests.get(url, headers=self.headers)
            html = response.content.decode('gbk')  # site serves GBK pages
            tree = etree.HTML(html)
            comm_url_list = tree.xpath('//td[@align="left"]/a/@href')
            for j in comm_url_list:
                count += 1
                print(count)
                # Was the magic constant Comm(6); use self.co_index, which
                # the error handler below already relies on.
                comm = Comm(self.co_index)
                comm_url = 'http://www.czfdc.gov.cn/spf/' + j
                self.get_comm_info(comm_url, comm)
        except Exception as e:
            print('co_index={},翻页有问题,url={}'.format(self.co_index, url), e)
            continue