def start_requests(self):
    # Route each start URL to the matching parser: 'chengjiao' pages hold
    # sold listings, 'ershoufang' pages hold for-sale listings.
    for url in self.start_urls:
        if 'chengjiao' in url:
            yield Request(url, callback=self._parse_sold)
        elif 'ershoufang' in url:
            yield Request(url, callback=self._parse_sale)
        else:
            print(url + ' error!!')
def _reload_sold(self, response, sold_houses):
    # Bounded retry: re-request the same URL with a fresh header set,
    # giving up after three download attempts.
    if response.request.meta.get('download_times'):
        download_times = response.request.meta['download_times']
        logger.error(*self.lfm.crawled(
            'Spider', self.name,
            '({0}) downloading again, time:'.format(
                response.request.headers.getRawHeaders('User-Agent')[0]),
            {
                'function': 'attempt {0}'.format(download_times),
                'request': response.request,
                'time': time.clock(),  # note: time.clock() was removed in Python 3.8
            }))
        download_times = download_times + 1
    else:
        download_times = 1
    if download_times < 4:
        return Request(response.url,
                       callback=self._parse_sold,
                       meta={
                           'download_times': download_times,
                           'header_flag': True,
                           'last_header': response.request.headers
                       })
    else:
        logger.error(*self.lfm.crawled(
            'Spider', self.name,
            'retry count exceeded the maximum; assuming this page has no data, time:',
            {
                'function': 'attempt {0}'.format(download_times),
                'request': response.request,
                'time': time.clock(),
            }))
        return None
def start_requests(self):
    self.start_urls = [
        'http://www.cffex.com.cn/sj/ccpm/201810/08/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/09/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/10/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/11/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/12/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/15/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/16/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/17/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/18/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/19/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/22/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/23/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/24/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/25/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/26/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/29/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/30/IH.xml',
        'http://www.cffex.com.cn/sj/ccpm/201810/31/IH.xml'
    ]
    for url in self.start_urls:
        yield Request(url, callback=self._parse)
def start_requests(self): self.start_urls = [ #"https://sh.lianjia.com/xiaoqu/biyun/", "https://sh.lianjia.com/xiaoqu/caolu/" ] for url in self.start_urls: yield Request(url, callback=self._parse)
def start_requests(self):
    start_url = list()
    for i in range(20, 30):
        i = str(i)
        u = self.url + i
        start_url.append(u)
    for url in start_url:
        yield Request(url, callback=self._parse)
def start_requests(self):
    start_url = list()
    for i in range(1, self._maxnum + 1):
        if i == 1:
            url = self._url
        else:
            i = str(i)
            url = self._url + "pg" + i
        start_url.append(url)
    for url in start_url:
        yield Request(url, callback=self._parse, headers=self.headers)
def _parse(self, response):
    selector = etree.HTML(response.body)
    # Total number of communities in this district
    total_number = selector.xpath("/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
    self.total_number_community = total_number
    # Collect the URLs of the towns under this district
    part_zone = selector.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div[2]/a")
    for a in part_zone:
        path = a.get('href')
        name = path.split('/')[-2]
        new_url = urljoin(self.base_url, path)
        yield Request(new_url, callback=self._parse2, meta={"zone_name": name})
def start_request_03():
    start_url = list()
    url = 'https://www.smzdm.com/homepage/json_more?p='
    for i in range(20, 30):
        i = str(i)
        u = url + i
        start_url.append(u)
    for url in start_url:
        #print(url)
        yield Request(url)
def start_requests(self):
    self.start_urls = [
        'https://sh.lianjia.com/xiaoqu/anshan/',  # 157 156
        'https://sh.lianjia.com/xiaoqu/dongwaitan/',  # 144 141
        'https://sh.lianjia.com/xiaoqu/huangxinggongyuan/',  # 159 159
        'https://sh.lianjia.com/xiaoqu/kongjianglu/',
        'https://sh.lianjia.com/xiaoqu/wujiaochang/',
        'https://sh.lianjia.com/xiaoqu/xinjiangwancheng/',
        'https://sh.lianjia.com/xiaoqu/zhoujiazuilu/',
        'https://sh.lianjia.com/xiaoqu/zhongyuan1/'
    ]
    for url in self.start_urls:
        yield Request(url, callback=self._parse)
def _parse(self, response):
    selector = etree.HTML(response.body)
    # Total number of listing pages for the communities in this town
    page_number = selector.xpath(
        "//div[@class='page-box house-lst-page-box']/@page-data")
    self.total_page_number = json.loads(page_number[0])["totalPage"]
    total_xiaoqu_number = selector.xpath(
        "/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
    logger.debug("%s has %d pages in total" % (self.name, self.total_page_number))
    self.result["total_xiaoqu_number"] = [total_xiaoqu_number]
    for i in range(1, self.total_page_number + 1):
        url = response.request.url + 'pg' + str(i)
        yield Request(url, callback=self._parse2, meta={"page_num": i})
def _parse_sold(self, response):
    selector = etree.HTML(response.body)
    try:
        sold_houses = self._xpath_filter(
            selector.xpath("//ul[@class='listContent']")).xpath('./li')
        total_num = selector.xpath(
            '//div[@class="total fl"]/span/text()')[0]
        if int(total_num) == 0:
            # No results on the page: schedule a retry.
            # Note: this method is a generator (it yields below), so a Request
            # returned here is discarded by most frameworks; yielding it may be intended.
            return self._reload_sold(response, sold_houses)
        else:
            self._resolve_sold(sold_houses, response.url)
            if int(total_num) > len(sold_houses):
                if not re.search('pg', response.url):
                    print("sold:" + self.name + ': ' + response.url + ': ' +
                          str(total_num) + "===" + str(len(sold_houses)))
                    page_number = selector.xpath(
                        "//div[@class='page-box house-lst-page-box']/@page-data")
                    total_page_number = json.loads(page_number[0])["totalPage"]
                    base_name = response.url.split('/')[-2]
                    for pg in range(2, total_page_number + 1):
                        url = response.url.replace(
                            base_name, 'pg' + str(pg) + base_name)
                        yield Request(url, callback=self._parse_sold)
                else:
                    pg = re.findall(r'pg\d+', response.url)[0]
                    print("sold:" + self.name + '_' + pg + ': ' + response.url +
                          ': ' + str(total_num) + "===" + str(len(sold_houses)))
        return None
    except Exception as e:
        logger.error(*self.lfm.error(
            'Spider', self.name, 'error while parsing house listings',
            {
                'request': response.request,
                'function': 'total_num={0} sold_houses={1}'.format(
                    int(total_num), len(sold_houses))
            }),
            extra={
                'exception': e,
                'time': time.clock()
            })
        return None
def _parse(self, response):
    selector = etree.HTML(response.body)
    # Collect the name and URL of every district
    all_zone = selector.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")
    for one_zone in all_zone:
        # URL of one district
        path = one_zone.get('href')
        # Skip the districts we do not want to crawl
        if path not in ["/xiaoqu/chongming/", "/xiaoqu/shanghaizhoubian/"]:
            name = path.split('/')[-2]
            new_url = urljoin(self.base_url, path)
            self.all_zones[name] = new_url
            yield Request(new_url, callback=self._parse2,
                          meta={"total_zone_name": name})
def _parse_getAllCommunity(self, response):
    selector = etree.HTML(response.body)
    # Total number of listing pages for the communities in this town
    page_number = selector.xpath(
        "//div[@class='page-box house-lst-page-box']/@page-data")
    self.total_page_number = json.loads(page_number[0])["totalPage"]
    total_xiaoqu_number = selector.xpath(
        "/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0]
    self.result["total_xiaoqu_number"] = [total_xiaoqu_number]
    # logger.critical("%s has %d pages in total" % (self.name, self.total_page_number))
    for i in range(1, self.total_page_number + 1):
        url = self._start_urls[0] + '/pg' + str(i)
        yield Request(url, callback=self._parse_getCommunityInfo,
                      meta={"page_num": i})
def start_requests(self):
    start_url = list()
    for i in range(1, self._maxnum):
        if i == 1:
            url = self._url
        else:
            i = str(i)
            url = self._url + "pg" + i
        start_url.append(url)
    for url in start_url:
        yield Request(
            url,
            callback=self._parse,
            headers=self.headers,
            #meta={"download_redirect":True}
        )
def _parse(self, response):
    #web_body = BeautifulSoup(response.body,"html.parser")
    selector = etree.HTML(response.body)
    total_zone = selector.xpath(
        "/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div/a")
    total_urls = {}
    for a in total_zone:
        path = a.get('href')
        if path not in [
                "/ershoufang/chongming/", "/ershoufang/shanghaizhoubian/",
                "/ershoufang/jinshan/"
        ]:
            name = path.split("/")[-2]
            #print(name)
            new_url = urljoin(self.base_url, path)
            #print(new_url)
            total_urls[name] = new_url
    for name, url in total_urls.items():
        yield Request(url, callback=self._parse2, headers=self.headers,
                      meta={'part_name': name})
def start_requests(self):
    #for url in self.start_urls:
    # self.start_urls is passed to Request whole rather than iterated,
    # so it is presumably a single URL string despite the plural name.
    yield Request(self.start_urls, callback=self._parse)
def start_requests(self):
    self.start_urls = [
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190102.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190103.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190104.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190107.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190108.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190109.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190110.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190111.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190114.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190115.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190116.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190117.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190118.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190121.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190122.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190123.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190124.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190125.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190128.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190129.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190130.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190131.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190201.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190211.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190212.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190213.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190214.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190215.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190218.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190219.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190220.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190221.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190222.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190225.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190226.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190227.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190228.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190301.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190304.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190305.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190306.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190307.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190308.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190311.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190312.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190313.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190314.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190315.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190318.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190319.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190320.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190321.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190322.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190325.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190326.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190327.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190328.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190329.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20190401.dat',
'http://www.shfe.com.cn/data/dailydata/kx/pm20190402.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190403.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190404.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190408.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190409.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190410.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190411.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190412.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190415.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190416.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190417.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190418.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190419.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190422.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190423.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190424.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190425.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190426.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190429.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190430.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190506.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190507.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190508.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190509.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190510.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190513.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190514.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190515.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190516.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190517.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190520.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190521.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190522.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190523.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190524.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190527.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190528.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190529.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190530.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190531.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190603.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190604.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190605.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190606.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190610.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190611.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190612.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190613.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190614.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190617.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190618.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190619.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190620.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190621.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190624.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190625.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190626.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190627.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190628.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190701.dat', 
'http://www.shfe.com.cn/data/dailydata/kx/pm20190702.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190703.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190704.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190705.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190708.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190709.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190710.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190711.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190712.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190715.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190716.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190717.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190718.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190719.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190722.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190723.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190724.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190725.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190726.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190729.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190730.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190731.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190801.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190802.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190805.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190806.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190807.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190808.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190809.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190812.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190813.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190814.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190815.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190816.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190819.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190820.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190821.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190822.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190823.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190826.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190827.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190828.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190829.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190830.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190902.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190903.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190904.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190905.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190906.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190909.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190910.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190911.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190912.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190916.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190917.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190918.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190919.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190920.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190923.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190924.dat', 
'http://www.shfe.com.cn/data/dailydata/kx/pm20190925.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190926.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190927.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20190930.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191008.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191009.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191010.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191011.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191014.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191015.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191016.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191017.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191018.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191021.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191022.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191023.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191024.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191025.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191028.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191029.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191030.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191031.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191101.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191104.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191105.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191106.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191107.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191108.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191111.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191112.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191113.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191114.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191115.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191118.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191119.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191120.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191121.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191122.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191125.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191126.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191127.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191128.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191129.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191202.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191203.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191204.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191205.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191206.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191209.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191210.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191211.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191212.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191213.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191216.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191217.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191218.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191219.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191220.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191223.dat', 'http://www.shfe.com.cn/data/dailydata/kx/pm20191224.dat', 
        'http://www.shfe.com.cn/data/dailydata/kx/pm20191225.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20191226.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20191227.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20191230.dat',
        'http://www.shfe.com.cn/data/dailydata/kx/pm20191231.dat'
    ]
    # self.start_urls = [
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20191008.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20191009.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20191010.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20191011.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20191012.dat',
    #
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181015.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181016.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181017.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181018.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181019.dat',
    #
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181022.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181023.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181024.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181025.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181026.dat',
    #
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181029.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181030.dat',
    #     'http://www.shfe.com.cn/data/dailydata/kx/pm20181031.dat'
    # ]
    for url in self.start_urls:
        yield Request(url, callback=self._parse)
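# A minimal sketch (not part of the original project) of how a dated URL list
# like the one above could be generated instead of hard-coded. It assumes that
# trading days can be approximated as weekdays minus a caller-supplied holiday
# set; the helper name shfe_daily_urls is hypothetical.
from datetime import date, timedelta

def shfe_daily_urls(start, end, holidays=frozenset()):
    # Yield one SHFE daily-data URL per weekday in [start, end], skipping any
    # date in `holidays` (exchange holidays cannot be derived from the calendar alone).
    day = start
    while day <= end:
        if day.weekday() < 5 and day not in holidays:
            yield ('http://www.shfe.com.cn/data/dailydata/kx/pm'
                   + day.strftime('%Y%m%d') + '.dat')
        day += timedelta(days=1)

# Example: urls = list(shfe_daily_urls(date(2019, 1, 2), date(2019, 12, 31)))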
def start_requests(self):
    yield Request(self.start_url, callback=self._parse)
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, callback=self._parse_getAllCommunity)
def start_requests(self):
    yield Request(self.start_urls[0], callback=self._parse_sold)
def _parse_sold(self, response):
    selector = etree.HTML(response.body)
    try:
        sold_houses = self._xpath_filter(
            selector.xpath("//ul[@class='listContent']")).xpath('./li')
        total_num = selector.xpath(
            '//div[@class="total fl"]/span/text()')[0]
        print("sold:" + self.name + ': ' + response.url + ': ' +
              str(total_num) + "===" + str(len(sold_houses)))
        # page_number = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data")
        # if page_number:
        #     total_page_number = json.loads(page_number[0])["totalPage"]
        #     print(total_page_number)
        # else:
        #     print(page_number)
        if int(total_num) == 0:
            return self._reload_sold(response, sold_houses)
        else:
            self._resolve_sold(sold_houses, response.url)
            if int(total_num) > len(sold_houses):
                # self.result_items += len(sold_houses)
                if not re.search('pg', response.url):
                    print("sold:" + self.name + ': ' + response.url + ': ' +
                          str(total_num) + "===" + str(len(sold_houses)))
                    page_number = selector.xpath(
                        "//div[@class='page-box house-lst-page-box']/@page-data")
                    if page_number:
                        total_page_number = json.loads(
                            page_number[0])["totalPage"]
                        base_name = response.url.split('/')[-2]
                        for pg in range(2, total_page_number + 1):
                            url = response.url.replace(
                                base_name, 'pg' + str(pg) + base_name)
                            yield Request(url, callback=self._parse_sold)
                    else:
                        return self._reload_sold(response, sold_houses)
                else:
                    pg = re.findall(r'pg\d+', response.url)[0]
                    print("sold:" + self.name + '_' + pg + ': ' + response.url +
                          ': ' + str(total_num) + "===" + str(len(sold_houses)))
    except Exception as e:
        logger.error(*self.lfm.error(
            'Spider', self.name, 'error while parsing house listings:',
            {
                'request': response.request,
                'function': 'total_num={0} sold_houses={1}'.format(
                    int(total_num), len(sold_houses)),
                'exception': e
            }),
            extra={'time': ', time: %6.3f' % time.clock()})
    if len(self.serect_price) > 0:
        while (len(self.serect_price) != 0):
            item = self.serect_price.popitem()
            url = item[1]['sold_house_url']
            yield Request(url, callback=self._get_secret_price,
                          meta={'title': item[0]})
    return None
# Standalone test harness for the download handler. Request, Setting, Spider1,
# HTTPDownloadHandler, url, headers and request_callback are assumed to be
# defined/imported elsewhere in the project; only the reactor import is standard:
from twisted.internet import reactor


def request_errback(content):
    print("request_and_response errback")
    print(content)
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


request = Request(url=url,
                  callback=request_callback,
                  method='get',
                  headers=headers,
                  errback=request_errback,
                  meta={"download_timeout": 2})
settings = Setting()
spider = Spider1.update_settings(settings)
httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
agent.addBoth(lambda _: reactor.stop())
reactor.run()
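# How the chain above behaves (assuming download_request returns a Twisted
# Deferred, as Scrapy-style download handlers do): agent_print fires when the
# download succeeds, request_errback fires on failure, and the
# addBoth(lambda _: reactor.stop()) step stops the event loop either way, so
# reactor.run() returns once this single request has completed.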
def _parse_sold(self, response):
    selector = BeautifulSoup(response.body, "html.parser")
    try:
        base_xpath = './div[@class="info"]'
        total_num = selector.find('div', class_="total fl").span.text
        sold_houses = selector.find('ul', class_='listContent')
        if int(total_num) == 0:
            if response.request.meta.get('download_times'):
                download_times = response.request.meta['download_times']
                logger.warning(*self.lfm.crawled(
                    'Spider', self.name,
                    '({0}) downloading again, time:'.format(
                        response.request.headers.getRawHeaders('User-Agent')[0]),
                    {
                        'function': 'attempt {0}'.format(download_times),
                        'request': response.request,
                        'time': time.clock(),
                    }))
                download_times = download_times + 1
            else:
                download_times = 1
            if download_times < 4:
                return Request(response.url,
                               callback=self._parse_sold,
                               meta={
                                   'download_times': download_times,
                                   'header_flag': True,
                                   'last_header': response.request.headers
                               })
            else:
                logger.warning(*self.lfm.crawled(
                    'Spider', self.name,
                    'retry count exceeded the maximum; assuming this page has no data',
                    {
                        'function': 'attempt {0}'.format(download_times),
                        'request': response.request,
                        'time': time.clock(),
                    }))
                print("sold:" + self.name + ': ' + response.url + ': ' +
                      str(total_num) + "===" + str(len(sold_houses)))
                return None
        # sold_houses = self._xpath_filter(selector.xpath("//ul[@class='listContent']")).xpath('./li')
        # total_num = selector.xpath('//div[@class="total fl"]/span/text()')
        # total_num = selector.xpath("/html/body/div[5]/div[1]/div[2]/div[1]/span/text()")[0]
        #
        # for sold_house in sold_houses:
        #     sold_title = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="title"]/a/text()'))
        #     print("community name: " + sold_title)
        #
        #     sold_address = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="houseInfo"]/text()'))
        #     print("community address: " + sold_address)
        #
        #     sold_dealDate = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="dealDate"]/text()'))
        #     print("deal date: " + sold_dealDate)
        #
        #     sold_totalPrice = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="address"]/div[@class="totalPrice"]/span/text()'))
        #     print("deal price: " + sold_totalPrice)
        #
        #     sold_unitPrice = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="flood"]/div[@class="unitPrice"]/span/text()'))
        #     print("average deal price: " + sold_unitPrice)
        #
        #     sold_positionInfo = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="flood"]/div[@class="positionInfo"]/text()'))
        #     print("floor info: " + sold_positionInfo)
        #
        #     sold_saleonborad = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="dealCycleeInfo"]/span[@class="dealCycleTxt"]/span[1]/text()'))
        #     # print("listing price: " + sold_saleonborad)
        #
        #     sold_dealcycle = \
        #         self._xpath_filter(sold_house.xpath(base_xpath + '/div[@class="dealCycleeInfo"]/span[@class="dealCycleTxt"]/span[2]/text()'))
        #     print("deal cycle: " + sold_dealcycle)
    except Exception as e:
        print(e)
        # raise Exception(e)
    return None