def start_crawler(self):
    for region_code, region_name in self.region.items():
        url = self.start_url + region_code + '.html'
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule=r'共(\d+)页>',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            new_url = url + "?page=" + str(i)
            res = requests.get(new_url, headers=self.headers)
            html = etree.HTML(res.text)
            co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
            for co in co_list:
                comm = Comm(co_index)
                co_url = co.xpath("./dt/h4/a/@href")[0]
                comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                comm.co_address = co.xpath(".//address/text()")[0]
                comm.co_id = re.search(r'\d+', co_url).group(0)
                comm.co_develops = co.xpath("./dd[@class='dev']/a/text()")[0]
                comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                comm.area = region_name
                comm.insert_db()
                detail_url = "http://www.zstmsf.com" + co_url
                self.bu_parse(detail_url, comm.co_id)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='green1">1/(.*?)<',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = 'http://www.tmsf.com/newhouse/property_searchall.htm?&page=' + str(i)
        # Retry until the listing page responds with HTTP 200.
        while True:
            try:
                response = requests.get(all_url, headers=self.headers, timeout=10)
                if response.status_code == 200:
                    break
            except Exception as e:
                print('community list page failed to load, co_index={}, url={}'.format(co_index, all_url), e)
        html = response.text
        comm_url_list = re.findall(r'build_word01" onclick="toPropertyInfo\((.*?)\);', html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'共.*?Count">(\d+)</span>页',
    )
    page = b.get_page_count()
    data = {'__EVENTTARGET': 'navigate$LnkBtnGoto'}
    for i in range(1, int(page) + 1):
        if i == 1:
            res = requests.get(self.start_url, headers=self.headers)
        else:
            data['navigate$txtNewPageIndex'] = i
            res = requests.post(self.start_url, data=data, headers=self.headers)
        con = res.content.decode()
        html = etree.HTML(con)
        # Carry the ASP.NET postback state over to the next page request.
        view_state = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        valid = html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        data['__VIEWSTATE'] = view_state
        data['__EVENTVALIDATION'] = valid
        self.comm_list(html)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule=r'总共<b>(\d+)<',
    )
    page = b.get_page_count()
    formdata = {}
    comm_url_list = []
    for i in range(1, int(page) + 1):
        res = requests.post(self.start_url, data=formdata)
        con = res.content.decode('gbk')
        con = etree.HTML(con)
        view_state = con.xpath("//input[@name='__VIEWSTATE']/@value")[0]
        valid = con.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
        view_state = parse.quote_plus(view_state, encoding='gbk')
        valid = parse.quote_plus(valid, encoding='gbk')
        # Save the current page's postback state as the request parameters for the next page.
        formdata["__VIEWSTATE"] = view_state
        formdata["__EVENTVALIDATION"] = valid
        formdata["__EVENTTARGET"] = 'AspNetPager1'
        formdata["__VIEWSTATEGENERATOR"] = "248CD702"
        formdata["__EVENTARGUMENT"] = str(i + 1)
        formdata["AspNetPager1_input"] = str(i)
        url_list = con.xpath("//tr[@bgcolor='#F5F9FC']/td[@bgcolor='white']/a/@href")
        comm_url_list.extend(url_list)
    self.comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'共(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = self.start_url + '?pageIndex=' + str(i)
        page_res = requests.get(url, headers=self.headers)
        html = etree.HTML(page_res.text)
        comm_info_list = html.xpath("//ul/li/div")
        for comm_info in comm_info_list:
            try:
                co = Comm(co_index)
                co.co_name = comm_info.xpath("./p/a/text()")[0]
                deve = comm_info.xpath("./p[2]/text()")[0]
                addr = comm_info.xpath("./p[3]/text()")[0]
                co.co_develops = re.search('开发商:(.*)', deve).group(1)
                co.co_address = re.search('楼盘地址.*?:(.*)', addr).group(1)
                comm_url = comm_info.xpath("./p/a/@href")[0]
                co.co_id = re.search(r'projectId=(\d+)', comm_url).group(1)
                co.insert_db()
                co_url = 'http://www.bdfdc.net' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                time.sleep(5)
                bu_html = etree.HTML(co_res.text)
                bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
            except Exception as e:
                # log.error("community info error {}".format(e))
                print("community info error {}".format(e))
                continue
            self.bu_info(bu_url_list, co.co_id)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='第1页 / 共(.*?)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = 'http://113.106.199.148/web/presale.jsp?page=' + str(i)
        try:
            self.get_comm_url(all_url)
        except Exception as e:
            print('page error, co_index={}, url={}'.format(co_index, all_url), e)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule=r'共(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = self.start_url + '?page2=' + str(i)
        res = requests.get(url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        comm_url_list = html.xpath("//td/a[@target]/@href")
        self.comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=' 1/(.*?)页',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(int(page)):
        all_page_url = url + '?page=' + str(i)
        response = requests.get(all_page_url, headers=self.headers)
        html = response.text
        comm_detail_url_list = re.findall(r'(/House/ProjectInfo\?ProjectId=.*?)"', html, re.S | re.M)
        self.get_comm_info(comm_detail_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'>><.*?aspx\?p=(.*?)"',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_page_url = url + '?p=' + str(i)
        response = requests.get(all_page_url, headers=self.headers)
        html = response.text
        tree = etree.HTML(html)
        comm_url_list = tree.xpath('//a[@class="sp_zi12c"]/@href')
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule=' 1/(.*?)页',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(int(page)):
        all_page_url = 'http://www.fjnpfdc.com/House/ListCanSell?page=' + str(i)
        response = requests.get(all_page_url)
        html = response.text
        comm_url_list = re.findall('<tr align="center">.*?<a href="(.*?)"', html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'" >(\d+)</a> ',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):  # paginate
        url = "http://www.hufdc.com/presell.jspx?pageno=" + str(i)
        response = requests.get(url, headers=self.headers)
        url_html = etree.HTML(response.text)
        self.comm_parse(url_html)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='1/(.*?)页',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(0, int(page) + 1):
        page_url = 'http://www.fjlyfdc.com.cn/House/Link/YSXXCX.cshtml?pagenumber=' + str(i)
        response = requests.get(page_url)
        html = response.text
        comm_url_list = re.findall('class="c".*?href="(.*?)"', html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'共.*?Count">(\d+)</span>页',
    )
    page = b.get_page_count()
    formdata = {}
    for i in range(1, int(page) + 1):
        # Post the ASP.NET pager target together with the state collected from the previous page.
        formdata["__EVENTTARGET"] = "navigate$LnkBtnGoto"
        formdata["navigate$txtNewPageIndex"] = i
        try:
            res = requests.post(self.url, data=formdata, headers=self.headers)
        except Exception as e:
            print("co_index={}, failed to turn to page {}".format(co_index, i))
            print(e)
            continue
        con = etree.HTML(res.text)
        formdata["__VIEWSTATE"] = con.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        formdata["__EVENTVALIDATION"] = con.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        bu_url_list = con.xpath("//td[@style='width:13%']/a/@href")
        bu_pre = con.xpath("//td[@style='width:13%']/a/text()")
        bu_dev = con.xpath("//td[@style='width:24%']/text()")
        co_name = con.xpath("//td[@style='width:15%']/text()")
        for index in range(len(bu_url_list)):
            bu_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + bu_url_list[index]
            bu_pre_sale = bu_pre[index]
            bo_develops = bu_dev[index]
            bu_co_name = co_name[index]
            try:
                bu_res = requests.get(bu_detail, headers=self.headers)
            except Exception as e:
                print("co_index={}, building page {} is unreachable".format(co_index, bu_detail))
                print(e)
                continue
            bu_con = bu_res.text
            self.get_build_info(bu_pre_sale, bo_develops, bu_co_name, bu_con)
            self.get_house_info(bu_con)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='下页</a>.*?page=(.*?)"',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        try:
            all_page_url = url + '?page=' + str(i)
            response = requests.get(all_page_url, headers=self.headers)
            html = response.text
            comm_url_list = re.findall('项目名称:.*?href="(.*?)"', html, re.S | re.M)
            self.get_comm_info(comm_url_list)
        except Exception as e:
            print('page error, co_index={}, url={}'.format(co_index, all_page_url), e)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='cite>.*?/(.*?)页<',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = self.start_url + '&page=' + str(i)
        res = requests.get(url, headers=self.headers)
        comm_url_list = re.findall(r"window.open\('(.*?)'\)", res.text, re.S | re.M)
        self.comm_info(comm_url_list)
def start_crawler(self):
    for region in self.region_list:
        region_url = self.start_url + region
        b = AllListUrl(
            first_page_url=region_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule=r'1/(\d+)页',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            url = region_url + "&pagenumber=" + str(i)
            res = requests.get(url, headers=self.headers)
            html = etree.HTML(res.text)
            url_list = html.xpath("//tr/td/a/@href")
            self.comm_parse(url_list, region)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='> ..<.*?>(.*?)<',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        page_all_url = 'http://www.dzfgj.com/index.php?m=content&c=index&a=lists&catid=61&page=' + str(i)
        response = requests.get(page_all_url, headers=self.headers)
        html = response.text
        comm_html = re.search('<tbody>.*?</tbody>', html, re.S | re.M).group()
        comm_info_list = re.findall('<tr>.*?</tr>', comm_html, re.S | re.M)
        self.get_comm_info(comm_info_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='> ..<a.*?>(.*?)<',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        page_url = "http://zfbzj.baotou.gov.cn/index.php?m=content&c=permit&a=init&page=" + str(i)
        response = requests.get(page_url, headers=self.headers)
        html = response.text
        comm_url_list = re.findall(
            r'href="(http://zfbzj.baotou\.gov\.cn/index\.php\?m=content&c=permit&a=show&id=.*?)".*?http://zfbzj',
            html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='页次:<b><font color=red>1</font></b>/<b>(.*?)<',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        page_url = 'http://123.131.127.13/xy/dzlist.asp?page=' + str(i)
        time.sleep(5)
        response = requests.get(page_url, headers=self.headers)
        html = response.content.decode('gbk')
        comm_list_html = re.search('项目电子手册列表.*?<table(.*?)</table>', html, re.S | re.M).group(1)
        comm_html_list = re.findall('<tr>(.*?)</tr>', comm_list_html, re.S | re.M)[1:]
        self.get_comm_info(comm_html_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'页数.*?/(\d+)',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        formdata = {
            'page': i,
            'keytype': 1,
        }
        res = requests.post(self.start_url, data=formdata, headers=self.headers)
        html = etree.HTML(res.text)
        url_list = html.xpath("//h3/a")
        self.co_parse(url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='green1">1/(.*?)<',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = 'http://tz.tmsf.com/newhouse/property_searchall.htm'
        data = {'keytype': 1, 'page': i}
        response = requests.post(all_url, data=data, headers=self.headers)
        html = response.text
        comm_url_list = re.findall('<div class="build_txt">.*?<a href="(.*?)"', html, re.S | re.M)
        for comm_url in comm_url_list:
            self.get_comm_detail(comm_url)
def start_crawler(self):
    start_url = self.start_url + "searchSpf.jsp?nowPage=1"
    b = AllListUrl(
        first_page_url=start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'/(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = self.start_url + "searchSpf.jsp?nowPage=" + str(i)
        res = requests.get(url, headers=self.headers)
        html = etree.HTML(res.content.decode())
        url_list = html.xpath("//b/a/@href")
        for comm_temp in url_list:
            try:
                comm_url = self.start_url + comm_temp.replace("./xmxxmainNew", 'xmxx/xmjbxx')
                com_res = requests.get(comm_url, headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</td', con, re.S | re.M).group(1)
                co.co_develops = re.search('开 发 商.*?">(.*?)</td', con, re.S | re.M).group(1)
                co.co_all_house = re.search('总 套 数.*?<td>(.*?)</td', con, re.S | re.M).group(1)
                co.co_green = re.search('绿 化 率.*?<td>(.*?)</td', con, re.S | re.M).group(1)
                co.co_volumetric = re.search('容 积 率.*?<td>(.*?)</td', con, re.S | re.M).group(1)
                try:
                    co.co_build_size = re.search('建设规模.*?" >(.*?)平', con, re.S | re.M).group(1)
                except:
                    co.co_build_size = None
                co.insert_db()
                self.build_parse(co.co_id)
            except Exception as e:
                log.error('{} community error {}'.format(comm_temp, e))
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='共(.*?)页',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        if i == 1:
            all_page_url = 'http://www.xyfcj.com/html/jplp/index.html'
        else:
            all_page_url = 'http://www.xyfcj.com/html/jplp/index_' + str(i) + '.html'
        response = requests.get(all_page_url, headers=self.headers)
        html = response.text
        comm_url_list = re.findall('<a style="COLOR: #000000" target="_blank" href="(.*?)"', html, re.S | re.M)
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='strong>1/(.*?)<',
        headers=self.headers)
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        index_url = 'http://www.kmhouse.org/moreHousePriceList.asp?page=' + str(i)
        try:
            response = requests.get(url=index_url, headers=self.headers)
            html = response.content.decode('gbk')
            comm_url_list = re.findall("cellspacing='3'.*?<a href='(.*?)'", html)
            self.get_comm_info(comm_url_list)
        except Exception as e:
            print('page error, co_index={}, url={}'.format(co_index, index_url), e)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule='pg.pageCount = (.*?);',
    )
    page = b.get_page_count()
    all_url_list = []
    for i in range(1, int(page) + 1):
        all_url = 'http://www.gafdc.cn/newhouse/houselist.aspx?hou=0-0-0-0-0-0-&page=' + str(i)
        comm_url_list = self.get_comm_url(all_url)
        all_url_list += comm_url_list
    # Visit every community URL collected above.
    for i in all_url_list:
        comm_url = 'http://www.gafdc.cn/newhouse/' + str(i.replace('index', 'base'))
        try:
            self.get_comm_info(comm_url)
        except Exception as e:
            print('community error, co_index={}, url={}'.format(co_index, comm_url), e)
    all_build_url_list = []
    for i in all_url_list:
        build_url = 'http://www.gafdc.cn/newhouse/' + str(i.replace('index', 'table'))
        house_url_list = self.get_build_info(build_url)
        if house_url_list:
            all_build_url_list += house_url_list
        else:
            print('building error, this community has no buildings, co_index={}, url={}'.format(co_index, build_url))
    all_house_url_list = []
    form_data_list = []
    for i in all_build_url_list:
        house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
        data = {'itemRecord': i[0], 'houseCode': i[1]}
        all_house_url_list.append(house_url)
        form_data_list.append(data)
    self.get_house_info(form_data_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='共(.*?)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_page_url = url + '&Page=' + str(i)
        p = ProducerListUrl(
            page_url=all_page_url,
            request_type='get',
            encode='gbk',
            analyzer_rules_dict=None,
            current_url_rule=r"eval\('openBldg\((.*?)\)",
            analyzer_type='regex',
            headers=self.headers)
        comm_url_list = p.get_current_page_url()
        self.get_build_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='下一页.*?page=(.*?)"',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_page_url = 'http://www.sxczfdc.com/pubinfo/More_xm.aspx?page=' + str(i)
        response = requests.get(all_page_url, headers=self.headers)
        html = response.text
        comm_url_list = re.findall(r'style="background-color: .*?(Pub_lpxx.aspx\?DevProjectId=.*?)"', html, re.S | re.M)
        area_list = re.findall('style="background-color: .*?center">(.*?)<', html, re.S | re.M)
        self.get_comm_info(comm_url_list, area_list)
def start(self):
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='共(.*?)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_page_url = self.url + '&page=' + str(i)
        response = requests.get(url=all_page_url, headers=self.headers)
        html = response.text
        tree = etree.HTML(html)
        comm_url_list = tree.xpath('//dt[@class="name"]/a/@href')
        area_list = tree.xpath('//dl[@class="houseList_n"]/dd[3]/text()')
        for j in range(len(comm_url_list)):
            url = 'http://www.fzfgj.cn/' + comm_url_list[j]
            try:
                comm = Comm(11)
                comm.area = area_list[j].replace('所属区域:', '')
                self.get_comm_info(url, comm)
            except BaseException as e:
                print('community error, co_index={}, url={}'.format(co_index, url), e)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'1/(\d+)页',
    )
    page = b.get_page_count()
    formdata = {}
    for i in range(1, int(page) + 1):
        # First pass posts an empty form; later passes replay the pager postback state.
        res = requests.post(self.url, data=formdata, headers=self.headers)
        html = etree.HTML(res.text)
        formdata["__EVENTTARGET"] = "Pager"
        formdata["__EVENTARGUMENT"] = str(i + 1)
        formdata["__VIEWSTATEGENERATOR"] = "1D9D200C"
        formdata["__VIEWSTATE"] = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        comm_url_list = html.xpath("//h3/a/@href")
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        page_count_rule=r'<cite>共.*?/(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = "http://www.f0795.cn/house/index-htm-page-" + str(i) + ".html"
        p = ProducerListUrl(
            page_url=url,
            request_type='get',
            encode='utf-8',
            analyzer_rules_dict=None,
            current_url_rule="//ul[@class='list']//div[@class='text']/h3/a/@href",
            analyzer_type='xpath',
            headers=self.headers)
        comm_url_list = p.get_current_page_url()
        self.get_comm_info(comm_url_list)
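# Every crawler above follows the same first step: build an AllListUrl with a regex
# `page_count_rule`, call get_page_count(), then loop over the listing pages. The
# class below is a minimal illustrative sketch of what such a helper presumably does;
# it is an assumption, not the project's actual AllListUrl implementation (which also
# accepts request_method and analyzer_type arguments not modeled here).
import re

import requests


class PageCounter:
    """Hypothetical stand-in for AllListUrl: fetch the first listing page and pull
    the total page count out of it with a regex."""

    def __init__(self, first_page_url, page_count_rule, encode='utf-8', headers=None):
        self.first_page_url = first_page_url
        self.page_count_rule = page_count_rule
        self.encode = encode
        self.headers = headers

    def get_page_count(self):
        # Fetch the first page and decode it with the site's declared encoding.
        res = requests.get(self.first_page_url, headers=self.headers, timeout=10)
        html = res.content.decode(self.encode, errors='ignore')
        match = re.search(self.page_count_rule, html, re.S | re.M)
        # Callers wrap the result in int(), so return the captured string as-is,
        # falling back to a single page when the rule does not match.
        return match.group(1) if match else '1'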