def start_crawler(self):
    """Walk every page of an ASP.NET paginated list and hand each parsed
    page to self.comm_list().

    Page 1 is fetched with GET; later pages replay a POST postback carrying
    the __VIEWSTATE / __EVENTVALIDATION tokens harvested from the previous
    response.
    """
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='共.*?Count">(\d+)</span>页',
    )
    page = b.get_page_count()
    data = {'__EVENTTARGET': 'navigate$LnkBtnGoto'}
    for i in range(1, int(page) + 1):
        if i == 1:
            # First page is a plain GET; it seeds the postback tokens.
            res = requests.get(self.start_url, headers=self.headers)
        else:
            data['navigate$txtNewPageIndex'] = i
            res = requests.post(self.start_url, data=data,
                                headers=self.headers)
        # Shared tail — this was duplicated verbatim in both branches of
        # the original.
        html = etree.HTML(res.content.decode())
        data['__VIEWSTATE'] = html.xpath(
            "//input[@id='__VIEWSTATE']/@value")[0]
        data['__EVENTVALIDATION'] = html.xpath(
            "//input[@id='__EVENTVALIDATION']/@value")[0]
        self.comm_list(html)
def start_crawler(self):
    """Retry each list page until it returns HTTP 200, then extract the
    toPropertyInfo() arguments and pass them to get_comm_info()."""
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='green1">1/(.*?)<',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = ('http://www.tmsf.com/newhouse/property_searchall.htm'
                   '?&page=' + str(i))
        # NOTE(review): retries forever on persistent non-200 responses —
        # preserved behavior; consider a retry cap.
        while True:
            try:
                response = requests.get(all_url, headers=self.headers,
                                        timeout=10)
                # BUG FIX: original used `is 200`, an identity check that
                # only worked via CPython's small-int cache; compare values.
                if response.status_code == 200:
                    break
            except Exception as e:
                print(
                    '小区列表页加载不出来,co_index={},url={}'.format(
                        co_index, all_url), e)
        html = response.text
        comm_url_list = re.findall(
            'build_word01" onclick="toPropertyInfo\((.*?)\);',
            html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def get_comm_info(self, all_url_list):
    """For each listing page URL, collect '查看详细' detail links and parse
    every community, following its own pagination when a '尾页' (last page)
    link is present."""
    for list_url in all_url_list:
        listing_res = requests.get(list_url, headers=self.headers)
        listing = listing_res.content.decode('gbk')
        detail_urls = re.findall(
            '<a href="(.*?)" target="_blank">查看详细', listing)
        for current_url in detail_urls:
            co_id = re.search('id=(\d+)', current_url).group(1)
            detail_res = requests.get(current_url, headers=self.headers)
            detail_con = detail_res.content.decode('gbk')
            if '尾页' in detail_con:
                # Multi-page community: read its page count, then walk
                # every page.
                pager = AllListUrl(first_page_url=current_url,
                                   page_count_rule='总页数.*?<b>(\d+)</b>',
                                   analyzer_type='regex',
                                   request_method='get',
                                   headers=self.headers,
                                   encode='gbk')
                page_count = pager.get_page_count()
                for page_no in range(1, int(page_count) + 1):
                    page_url = current_url + "&page=" + str(page_no)
                    comm_page = requests.get(page_url, headers=self.headers)
                    self.comm_info_parse(
                        comm_page.content.decode('gbk'), co_id)
            else:
                self.comm_info_parse(detail_con, co_id)
def start_crawler(self):
    """Page through an ASP.NET list by replaying postback form fields,
    accumulating community detail links, then parse them all at once."""
    pager = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule='总共<b>(\d+)<',
                       )
    total = pager.get_page_count()
    formdata = {}
    comm_url_list = []
    for page_no in range(1, int(total) + 1):
        # First iteration posts an empty form; each response seeds the
        # form fields for the following page.
        res = requests.post(self.start_url, data=formdata,)
        doc = etree.HTML(res.content.decode('gbk'))
        raw_state = doc.xpath("//input[@name='__VIEWSTATE']/@value")[0]
        raw_valid = doc.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
        # Tokens are percent-encoded as GBK before being replayed.
        formdata["__VIEWSTATE"] = parse.quote_plus(raw_state, encoding='gbk')
        formdata["__EVENTVALIDATION"] = parse.quote_plus(raw_valid,
                                                         encoding='gbk')
        formdata["__EVENTTARGET"] = 'AspNetPager1'
        formdata["__VIEWSTATEGENERATOR"] = "248CD702"
        formdata["__EVENTARGUMENT"] = str(page_no + 1)
        formdata["AspNetPager1_input"] = str(page_no)
        links = doc.xpath(
            "//tr[@bgcolor='#F5F9FC']/td[@bgcolor='white']/a/@href")
        comm_url_list.extend(links)
    self.comm_info(comm_url_list)
def start(self):
    """Iterate the search-result pages and fetch each community's room
    view, skipping communities whose detail request fails."""
    pager = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='pageTotal = (.*?);',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = 'http://fsfc.fsjw.gov.cn/search/index.do?p=' + str(page_no)
        listing = requests.get(page_url, headers=self.headers)
        tree = etree.HTML(listing.text)
        comm_id_list = tree.xpath(
            '//*[@id="content"]/div[2]/div[1]/dl/dd/h3/a/@value')
        for comm_id in comm_id_list:
            comm = Comm(co_index)
            detail_url = ('http://fsfc.fsjw.gov.cn/hpms_project/'
                          'roomView.jhtml?id=' + comm_id)
            try:
                detail_res = requests.get(detail_url, headers=self.headers)
            except Exception as e:
                print(e)
                print("co_index={},小区详情页{}请求失败".format(co_index, detail_url))
                continue
            self.get_comm_info(detail_url, detail_res, comm)
def start_crawler(self):
    """For each configured region, walk its paginated listing, persist a
    Comm record per community, and descend into building details."""
    for region_code, region_name in self.region.items():
        region_url = self.start_url + region_code + '.html'
        pager = AllListUrl(
            first_page_url=region_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='共(\d+)页>',
        )
        total = pager.get_page_count()
        for page_no in range(1, int(total) + 1):
            page_url = region_url + "?page=" + str(page_no)
            res = requests.get(page_url, headers=self.headers)
            doc = etree.HTML(res.text)
            for co in doc.xpath("//dl[@class='spf_lp_searchlist bg1']"):
                comm = Comm(co_index)
                co_url = co.xpath("./dt/h4/a/@href")[0]
                comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                comm.co_address = co.xpath(".//address/text()")[0]
                comm.co_id = re.search('\d+', co_url).group(0)
                comm.co_develops = co.xpath(
                    "./dd[@class='dev']/a/text()")[0]
                comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                comm.area = region_name
                comm.insert_db()
                self.bu_parse("http://www.zstmsf.com" + co_url, comm.co_id)
def start_crawler(self):
    """Walk the paginated project list, persist each community, then parse
    its building links."""
    b = AllListUrl(first_page_url=self.start_url,
                   request_method='get',
                   analyzer_type='regex',
                   encode='utf-8',
                   page_count_rule='共(\d+)页',
                   )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        # BUG FIX: the original built '?pageIndex=2' + str(page) — a
        # constant, malformed index requested on every iteration; use the
        # loop counter so each page is actually visited.
        url = self.start_url + '?pageIndex=' + str(i)
        page_res = requests.get(url, headers=self.headers)
        html = etree.HTML(page_res.text)
        comm_info_list = html.xpath("//ul/li/div")
        for comm_info in comm_info_list:
            try:
                co = Comm(co_index)
                co.co_name = comm_info.xpath("./p/a/text()")[0]
                deve = comm_info.xpath("./p[2]/text()")[0]
                addr = comm_info.xpath("./p[3]/text()")[0]
                co.co_develops = re.search('开发商:(.*)', deve).group(1)
                co.co_address = re.search('楼盘地址.*?:(.*)', addr).group(1)
                comm_url = comm_info.xpath("./p/a/@href")[0]
                co.co_id = re.search('projectId=(\d+)', comm_url).group(1)
                co.insert_db()
                co_url = 'http://www.bdfdc.net' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                time.sleep(5)  # throttle between detail-page requests
                bu_html = etree.HTML(co_res.text)
                bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
            except Exception as e:
                print("小区信息错误{}".format(e))
                continue
            self.bu_info(bu_url_list, co.co_id)
def start_crawler(self):
    """Scan every pre-sale info (YSXXCX) page and collect community links."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='1/(.*?)页',
                       headers=self.headers
                       )
    total = pager.get_page_count()
    # NOTE(review): pages 0 through `total` inclusive are requested —
    # presumably page 0 and 1 both resolve on this site; confirm.
    for page_no in range(0, int(total) + 1):
        page_url = ('http://www.fjlyfdc.com.cn/House/Link/YSXXCX.cshtml'
                    '?pagenumber=' + str(page_no))
        response = requests.get(page_url)
        comm_url_list = re.findall('class="c".*?href="(.*?)"',
                                   response.text, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    """Walk each listing page and hand its community links to
    get_comm_info()."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule='>>>.*?page=(.*?)"',
                       headers=self.headers)
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = url + '&page=' + str(page_no)
        res = requests.get(page_url, headers=self.headers)
        doc = etree.HTML(res.text)
        self.get_comm_info(doc.xpath('//div[@class="info"]/h3/a/@href'))
def start_crawler(self):
    """Walk the paginated list (GBK-encoded) and parse every community
    link found on each page."""
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='共(\d+)页',
    )
    page = b.get_page_count()
    # BUG FIX / consistency: original wrote `int(page + 1)`, which raises
    # TypeError if get_page_count() returns a string; sibling crawlers all
    # use int(page) + 1 — convert before adding.
    for i in range(1, int(page) + 1):
        url = self.start_url + '?page2=' + str(i)
        res = requests.get(url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        comm_url_list = html.xpath("//td/a[@target]/@href")
        self.comm_info(comm_url_list)
def start_crawler(self):
    """Collect /House/ProjectInfo detail paths from every listing page."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule=' 1/(.*?)页',
                       headers=self.headers
                       )
    total = pager.get_page_count()
    # Pages are numbered from 0 here (range(total)) — preserved as-is.
    for page_no in range(int(total)):
        page_url = url + '?page=' + str(page_no)
        res = requests.get(page_url, headers=self.headers)
        detail_paths = re.findall('(/House/ProjectInfo\?ProjectId=.*?)"',
                                  res.text, re.S | re.M)
        self.get_comm_info(detail_paths)
def start_crawler(self):
    """Collect sellable-listing links from each page of ListCanSell."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule=' 1/(.*?)页',
                       headers=self.headers)
    total = pager.get_page_count()
    # Pages are numbered from 0 here (range(total)) — preserved as-is.
    for page_no in range(int(total)):
        page_url = ('http://www.fjnpfdc.com/House/ListCanSell?page='
                    + str(page_no))
        response = requests.get(page_url)
        links = re.findall('<tr align="center">.*?<a href="(.*?)"',
                           response.text, re.S | re.M)
        self.get_comm_info(links)
def start_crawler(self):
    """Step through every presell list page and hand the parsed tree to
    comm_parse()."""
    pager = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='" >(\d+)</a> ',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):  # paginate
        page_url = "http://www.hufdc.com/presell.jspx?pageno=" + str(page_no)
        response = requests.get(page_url, headers=self.headers)
        self.comm_parse(etree.HTML(response.text))
def start_crawler(self):
    """Build the full list of paginated index URLs up front, then hand
    them all to get_comm_info()."""
    pager = AllListUrl(first_page_url=url,
                       page_count_rule='总页数.*?<b>(.*?)</b>',
                       analyzer_type='regex',
                       request_method='get',
                       headers=self.headers,
                       encode='gbk')  # all paginated pages
    page_count = pager.get_page_count()
    all_url_list = [
        'http://www.gyfc.net.cn/2_proInfo/index.aspx/?page=' + str(n)
        for n in range(1, page_count + 1)
    ]
    print(all_url_list)
    self.get_comm_info(all_url_list)
def start_crawler(self):
    """Replay ASP.NET pager postbacks, then fetch every building listed on
    each page and parse its build/house info."""
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='共.*?Count">(\d+)</span>页',
    )
    page = b.get_page_count()
    formdata = {}
    for i in range(1, int(page) + 1):
        formdata["__EVENTTARGET"] = "navigate$LnkBtnGoto"
        formdata["navigate$txtNewPageIndex"] = i
        try:
            # BUG FIX: the postback fields were built but never sent —
            # `data=formdata` was missing, so every request returned page 1.
            res = requests.post(self.url, data=formdata,
                                headers=self.headers)
        except Exception as e:
            print("co_index={},第{}页翻页失败".format(co_index, i))
            print(e)
            continue
        con = etree.HTML(res.text)
        # Carry the fresh tokens into the next page's postback.
        formdata["__VIEWSTATE"] = con.xpath(
            "//input[@id='__VIEWSTATE']/@value")[0]
        formdata["__EVENTVALIDATION"] = con.xpath(
            "//input[@id='__EVENTVALIDATION']/@value")[0]
        bu_url_list = con.xpath("//td[@style='width:13%']/a/@href")
        bu_pre = con.xpath("//td[@style='width:13%']/a/text()")
        bu_dev = con.xpath("//td[@style='width:24%']/text()")
        co_name = con.xpath("//td[@style='width:15%']/text()")
        for index in range(len(bu_url_list)):
            bu_detail = ("http://www.hcsfcglj.com/Templets/BoZhou/aspx/"
                         + bu_url_list[index])
            bu_pre_sale = bu_pre[index]
            bo_develops = bu_dev[index]
            bu_co_name = co_name[index]
            try:
                bu_res = requests.get(bu_detail, headers=self.headers)
            except Exception as e:
                print("co_index={},楼栋{}无法访问".format(co_index, bu_detail))
                print(e)
                continue
            bu_con = bu_res.text
            self.get_build_info(bu_pre_sale, bo_develops, bu_co_name,
                                bu_con)
            self.get_house_info(bu_con)
def start_crawler(self):
    """Scrape community links opened via window.open() on each list page."""
    pager = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='cite>.*?/(.*?)页<',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = self.start_url + '&page=' + str(page_no)
        res = requests.get(page_url, headers=self.headers)
        links = re.findall("window.open\('(.*?)'\)", res.text, re.S | re.M)
        self.comm_info(links)
def start_crawler(self):
    """Collect permit detail links from each page of the permit index."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='> ..<a.*?>(.*?)<',
                       headers=self.headers)
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = ("http://zfbzj.baotou.gov.cn/index.php"
                    "?m=content&c=permit&a=init&page=" + str(page_no))
        response = requests.get(page_url, headers=self.headers)
        links = re.findall(
            'href="(http://zfbzj.baotou\.gov\.cn/index\.php\?m=content&c=permit&a=show&id=.*?)".*?http://zfbzj',
            response.text, re.S | re.M)
        self.get_comm_info(links)
def start_crawler(self):
    """Extract the <tbody> rows of every list page and hand each row to
    get_comm_info()."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='> ..<.*?>(.*?)<',
                       headers=self.headers)
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = ('http://www.dzfgj.com/index.php'
                    '?m=content&c=index&a=lists&catid=61&page='
                    + str(page_no))
        response = requests.get(page_url, headers=self.headers)
        body = re.search('<tbody>.*?</tbody>', response.text,
                         re.S | re.M).group()
        rows = re.findall('<tr>.*?</tr>', body, re.S | re.M)
        self.get_comm_info(rows)
def start_crawler(self):
    """For each region suffix, page through its listing and pass the links
    (with the region) to comm_parse()."""
    for region in self.region_list:
        region_url = self.start_url + region
        pager = AllListUrl(
            first_page_url=region_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='1/(\d+)页',
        )
        total = pager.get_page_count()
        for page_no in range(1, int(total) + 1):
            page_url = region_url + "&pagenumber=" + str(page_no)
            res = requests.get(page_url, headers=self.headers)
            doc = etree.HTML(res.text)
            self.comm_parse(doc.xpath("//tr/td/a/@href"), region)
def start_crawler(self):
    """Walk the new-house search pages; for every project link, fetch the
    project-info page, persist a Comm record, then parse its buildings."""
    first_url = self.start_url + "searchSpf.jsp?nowPage=1"
    pager = AllListUrl(
        first_page_url=first_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='/(\d+)页',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = self.start_url + "searchSpf.jsp?nowPage=" + str(page_no)
        res = requests.get(page_url, headers=self.headers)
        doc = etree.HTML(res.content.decode())
        for comm_temp in doc.xpath("//b/a/@href"):
            try:
                comm_url = self.start_url + comm_temp.replace(
                    "./xmxxmainNew", 'xmxx/xmjbxx')
                com_res = requests.get(comm_url, headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</td', con,
                                          re.S | re.M).group(1)
                co.co_develops = re.search('开 发 商.*?">(.*?)</td', con,
                                           re.S | re.M).group(1)
                co.co_all_house = re.search('总 套 数.*?<td>(.*?)</td', con,
                                            re.S | re.M).group(1)
                co.co_green = re.search('绿 化 率.*?<td>(.*?)</td', con,
                                        re.S | re.M).group(1)
                co.co_volumetric = re.search('容 积 率.*?<td>(.*?)</td', con,
                                             re.S | re.M).group(1)
                try:
                    co.co_build_size = re.search('建设规模.*?" >(.*?)平', con,
                                                 re.S | re.M).group(1)
                except:
                    co.co_build_size = None
                co.insert_db()
            except Exception as e:
                log.error('{}小区错误{}'.format(comm_temp, e))
            # NOTE(review): when the try-block fails, this reuses the
            # previous iteration's `co` (NameError on the very first
            # failure) — original behavior preserved; confirm intent.
            self.build_parse(co.co_id)
def start_crawler(self):
    """POST each page number to the search endpoint and parse the detail
    link of every community returned."""
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='green1">1/(.*?)<',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = 'http://tz.tmsf.com/newhouse/property_searchall.htm'
        # BUG FIX: the original posted 'page': page (the total page count)
        # on every iteration, so only the last page was ever fetched;
        # post the loop counter instead.
        data = {'keytype': 1, 'page': i}
        response = requests.post(all_url, data=data, headers=self.headers)
        html = response.text
        comm_url_list = re.findall(
            '<div class="build_txt">.*?<a href="(.*?)"', html, re.S | re.M)
        # Renamed from the original's second `i`, which shadowed the page
        # loop counter.
        for detail_url in comm_url_list:
            self.get_comm_detail(detail_url)
def start_crawler(self):
    """Walk the electronic-manual list pages (throttled), extracting the
    table rows of each page for get_comm_info()."""
    pager = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='页次:<b><font color=red>1</font></b>/<b>(.*?)<',
        headers=self.headers)
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = 'http://123.131.127.13/xy/dzlist.asp?page=' + str(page_no)
        time.sleep(5)  # throttle before each request
        response = requests.get(page_url, headers=self.headers)
        html = response.content.decode('gbk')
        table = re.search('项目电子手册列表.*?<table(.*?)</table>', html,
                          re.S | re.M).group(1)
        # Skip the header row ([1:]).
        rows = re.findall('<tr>(.*?)</tr>', table, re.S | re.M)[1:]
        self.get_comm_info(rows)
def start_crawler(self):
    """Fetch every list page (page 1 has a different URL shape) and
    collect community links."""
    b = AllListUrl(first_page_url=url,
                   request_method='get',
                   analyzer_type='regex',
                   encode='utf-8',
                   page_count_rule='共(.*?)页',
                   headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        # BUG FIX: `i is 1` compared identity, not value; it only worked
        # via CPython's small-int cache and raises SyntaxWarning on modern
        # interpreters — use equality.
        if i == 1:
            all_page_url = 'http://www.xyfcj.com/html/jplp/index.html'
        else:
            all_page_url = ('http://www.xyfcj.com/html/jplp/index_'
                            + str(i) + '.html')
        response = requests.get(all_page_url, headers=self.headers)
        html = response.text
        comm_url_list = re.findall(
            '<a style="COLOR: #000000" target="_blank" href="(.*?)"',
            html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    """Collect community links from each price-list page, logging failures
    per page instead of aborting the whole crawl."""
    pager = AllListUrl(first_page_url=url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule='strong>1/(.*?)<',
                       headers=self.headers)
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        index_url = ('http://www.kmhouse.org/moreHousePriceList.asp?page='
                     + str(page_no))
        try:
            response = requests.get(url=index_url, headers=self.headers)
            html = response.content.decode('gbk')
            links = re.findall("cellspacing='3'.*?<a href='(.*?)'", html)
            self.get_comm_info(links)
        except Exception as e:
            print('page页错误,co_index={},url={}'.format(co_index, index_url),
                  e)
def start_crawler(self):
    """Three passes over the site: gather all community URLs, parse each
    community's base info, then its building tables, and finally queue the
    per-building AJAX house lookups."""
    pager = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='pg.pageCount = (.*?);',
    )
    total = pager.get_page_count()
    all_url_list = []
    for page_no in range(1, int(total) + 1):
        list_url = ('http://www.gafdc.cn/newhouse/houselist.aspx'
                    '?hou=0-0-0-0-0-0-&page=' + str(page_no))
        all_url_list += self.get_comm_url(list_url)
    # Pass 1: iterate all community URLs for base info.
    for rel in all_url_list:
        comm_url = 'http://www.gafdc.cn/newhouse/' + str(
            rel.replace('index', 'base'))
        try:
            self.get_comm_info(comm_url)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
    # Pass 2: building tables.
    all_build_url_list = []
    for rel in all_url_list:
        build_url = 'http://www.gafdc.cn/newhouse/' + str(
            rel.replace('index', 'table'))
        house_url_list = self.get_build_info(build_url)
        if house_url_list:
            all_build_url_list += house_url_list
        else:
            print('楼栋错误,此小区没有楼栋,co_index={},url={}'.format(
                co_index, build_url))
    # Pass 3: build the AJAX request payload per building.
    all_house_url_list = []
    form_data_list = []
    for record in all_build_url_list:
        all_house_url_list.append(
            'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx')
        form_data_list.append(
            {'itemRecord': record[0], 'houseCode': record[1]})
    self.get_house_info(form_data_list)
def start_crawler(self):
    """For each list page, use ProducerListUrl to pull openBldg() targets
    and feed them to get_build_info()."""
    pager = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='共(.*?)页',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = url + '&Page=' + str(page_no)
        producer = ProducerListUrl(page_url=page_url,
                                   request_type='get',
                                   encode='gbk',
                                   analyzer_rules_dict=None,
                                   current_url_rule="eval\('openBldg\((.*?)\)",
                                   analyzer_type='regex',
                                   headers=self.headers)
        self.get_build_info(producer.get_current_page_url())
def start_crawler(self):
    """POST each page number to the search URL and parse the result links."""
    pager = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='页数.*?/(\d+)',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        payload = {
            'page': page_no,
            'keytype': 1,
        }
        res = requests.post(self.start_url, data=payload,
                            headers=self.headers)
        doc = etree.HTML(res.text)
        self.co_parse(doc.xpath("//h3/a"))
def start_crawler(self):
    """Walk the paginated list, tolerating per-page failures."""
    pager = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='下页</a>.*?page=(.*?)"',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        try:
            page_url = url + '?page=' + str(page_no)
            response = requests.get(page_url, headers=self.headers)
            links = re.findall('项目名称:.*?href="(.*?)"', response.text,
                               re.S | re.M)
            self.get_comm_info(links)
        except Exception as e:
            print(
                'page页面错误,co_index={},url={}'.format(
                    co_index, page_url), e)
def start_crawler(self):
    """Collect project links plus their district text from each page and
    pass both lists to get_comm_info()."""
    pager = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='下一页.*?page=(.*?)"',
    )
    total = pager.get_page_count()
    for page_no in range(1, int(total) + 1):
        page_url = ('http://www.sxczfdc.com/pubinfo/More_xm.aspx?page='
                    + str(page_no))
        response = requests.get(page_url, headers=self.headers)
        html = response.text
        comm_url_list = re.findall(
            'style="background-color: .*?(Pub_lpxx.aspx\?DevProjectId=.*?)"',
            html, re.S | re.M)
        area_list = re.findall(
            'style="background-color: .*?center">(.*?)<',
            html, re.S | re.M)
        self.get_comm_info(comm_url_list, area_list)
def start_crawler(self):
    """Replay ASP.NET 'Pager' postbacks: each response seeds the form
    fields for the next page's request."""
    pager = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='1/(\d+)页',
    )
    total = pager.get_page_count()
    formdata = {}
    for page_no in range(1, total + 1):
        # First iteration posts an empty form (server returns page 1).
        res = requests.post(self.url, data=formdata, headers=self.headers)
        doc = etree.HTML(res.text)
        formdata["__EVENTTARGET"] = "Pager"
        formdata["__EVENTARGUMENT"] = str(page_no + 1)
        formdata["__VIEWSTATEGENERATOR"] = "1D9D200C"
        formdata["__VIEWSTATE"] = doc.xpath(
            "//input[@id='__VIEWSTATE']/@value")[0]
        self.get_comm_info(doc.xpath("//h3/a/@href"))