            time.localtime(time.time()))
        item = dict()
        item['avg_price'] = avg_price
        item['estate'] = estate
        item['area'] = area
        item['layout'] = layout
        item['total_price'] = total_price
        item['crawl_date'] = crawl_date
        item['province'] = response.request.meta['province']
        item['city'] = response.request.meta['city']
        item['district'] = response.request.meta['district']
        item['url'] = response.request.url
        yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    SpiderCore(Fang_Processor(), test=True).set_pipeline(ConsolePipeline()).start()
            time.localtime(time.time()))
        item = dict()
        item['avg_price'] = avg_price
        item['estate'] = estate
        item['area'] = area
        item['layout'] = layout
        item['total_price'] = total_price
        item['crawl_date'] = crawl_date
        item['province'] = response.request.meta['province']
        item['city'] = response.request.meta['city']
        item['district'] = response.request.meta['district']
        item['url'] = response.request.url
        yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    RequestSpider(Fang_Processor()).set_pipeline(ConsolePipeline()) \
        .set_pipeline(TextPipelineFang()).start()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))

from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.request_spider import RequestSpider
from sasila.system_normal.manager import manager
import sasila

spider_car = RequestSpider(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = RequestSpider(Fang_Processor()).set_pipeline(ConsolePipeline())

manager.set_spider(spider_car)
manager.set_spider(spider_fang)

sasila.start()
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @checkResponse
    def process_pic(self, response):
        result = response.m_response.content
        yield pipeItem(['save'], result)

    @checkResponse
    def process_detail(self, response):
        soup = bs(response.m_response.content, 'lxml')
        dd_tail = soup.select('div.zxxwleft p.zxxw2')[0].text \
            .replace('来源: ', '').replace('来源:', '').split(' ')
        date_time = dd_tail[1].strip() + ' ' + dd_tail[2].strip().replace('|', '')
        newsFrom = dd_tail[0].strip()
        result = dict()
        result['date_time'] = date_time
        result['newsFrom'] = newsFrom
        yield pipeItem(['console', 'test'], result)


if __name__ == '__main__':
    SpiderCore(TEST_Processor()).set_pipeline(ConsolePipeline(), 'console') \
        .set_pipeline(PicPipeline(), 'save').set_pipeline(TestPipeline(), 'test').start()
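# The @checkResponse decorator used above guards each callback against
# failed downloads; its definition is not part of this snippet. The sketch
# below is a minimal, hypothetical re-implementation (check_response is an
# assumed name) that simply yields nothing when m_response is missing; the
# real decorator in sasila may do more.
from functools import wraps


def check_response(func):
    @wraps(func)
    def wrapper(self, response):
        # Skip parsing entirely when the download failed.
        if response.m_response is None:
            return
        for item in func(self, response):
            yield item
    return wrapper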
        detail_list = soup.select('div.details li')
        if len(detail_list) == 0:
            soup = bs(response.m_response.content, 'html5lib')
            detail_list = soup.select('div.details li')
        mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
        first_borad_date = detail_list[1].select('span')[0].text
        gear = detail_list[2].select('span')[0].text.split('/')[0]
        displacement = detail_list[2].select('span')[0].text.split('/')[1]
        price = soup.select('div.car-price ins')[0].text.replace('¥', '')
        crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        item = dict()
        item['car'] = car
        item['mileage'] = mileage
        item['first_borad_date'] = first_borad_date
        item['gear'] = gear
        item['displacement'] = displacement
        item['price'] = price
        item['crawl_date'] = crawl_date
        item['province'] = response.request.meta['province']
        item['city'] = response.request.meta['city']
        item['brand'] = response.request.meta['brand']
        item['cars_line'] = response.request.meta['cars_line']
        yield item


if __name__ == '__main__':
    RequestSpider(Car_Processor()).set_pipeline(ConsolePipeline()) \
        .set_pipeline(TextPipelineCar()).start()
result_item["company_man"] = content.select( "td")[1].text.split('\n')[1].strip().replace("企业法人:", "") result_item["company_telephone"] = content.select( "td")[1].text.split('\n')[2].strip().replace("联系方式:", "") result_item["company_address"] = content.select( "td")[1].text.split('\n')[3].strip() if "地址:" in result_item["company_address"]: result_item["company_address"] = result_item[ "company_address"].replace("地址:", "") else: result_item["company_address"] = "" result_item["company_registered_capital"] = content.select( "td")[2].text.strip() result_item["company_registered_time"] = content.select( "td")[3].text.strip() result_item["company_status"] = content.select( "td")[4].text.strip() result_item["source"] = "企查查" result_item["update_time"] = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(time.time())) yield result_item except Exception: print traceback.format_exc() qcc_spider = RequestSpider(QccProcessor(), time_sleep=1).set_pipeline( KafkaPipeline()).set_pipeline(TextPipeline()).set_pipeline( ConsolePipeline()) if __name__ == '__main__': qcc_spider.start()
            result_mobile = result.find(
                lambda tag: tag.name == 'p' and '电话:' in tag.text).text
            m_result = dict()
            m_result['result_name'] = result_name
            m_result['result_mobile'] = result_mobile.replace('电话:', '')
            m_result['city_name'] = response.request.meta['city_name']
            m_result['category1_name'] = response.request.meta['category1_name']
            m_result['category2_name'] = response.request.meta['category2_name']
            yield m_result

        next_page = soup.find(lambda tag: tag.name == 'a' and '下一页' in tag.text)
        if next_page:
            url_splits = response.request.url.split('/')
            url_splits[-1] = next_page['href']
            url = '/'.join(url_splits)
            request = Request(url=url, priority=1, callback=self.process_page_1)
            request.meta['city_name'] = response.request.meta['city_name']
            request.meta['category1_name'] = response.request.meta['category1_name']
            request.meta['category2_name'] = response.request.meta['category2_name']
            yield request


if __name__ == '__main__':
    SpiderCore(Bendibao_Processor(), time_sleep=0.5).set_pipeline(TextPipelineBendibao()) \
        .set_pipeline(ConsolePipeline()).start()
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                if response.is_url(url):
                    yield Request(url=url, callback=self.process2)

    def process2(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield soup.title
            a_list = soup.select("a")
            for a in a_list:
                if "href" in a.attrs:
                    url = response.nice_join(a["href"])
                    if response.is_url(url):
                        yield Request(url=url, callback=self.process2)
        else:
            print(response.request.url)


if __name__ == '__main__':
    RequestSpider(FirstProcessor()).set_pipeline(ConsolePipeline()).start()
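# Items yielded by a processor flow into every pipeline registered with
# set_pipeline(). A minimal custom pipeline is sketched below; it assumes
# the hook is a process_item(item) method, mirroring how ConsolePipeline is
# chained above, and JsonLinesPipeline is a hypothetical class, not part of
# sasila itself.
import io
import json


class JsonLinesPipeline(object):
    def __init__(self, path='items.jl'):
        self.path = path

    def process_item(self, item):
        # Append each crawled item as one JSON object per line.
        line = json.dumps(item, ensure_ascii=False)
        if isinstance(line, bytes):  # Python 2 may hand back a byte string
            line = line.decode('utf-8')
        with io.open(self.path, 'a', encoding='utf-8') as f:
            f.write(line + u'\n')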
    def test_car_processor(self):
        test_pipeline = TestPipeline()
        RequestSpider(Car_Processor(), test=True).set_pipeline(ConsolePipeline()) \
            .set_pipeline(test_pipeline).start()
        self.assertEqual(test_pipeline.result['province'], '上海',
                         'expected the crawled province to be 上海')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))

from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.manager import manager
from sasila import system_web

spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())

manager.set_spider(spider_car)
manager.set_spider(spider_fang)

system_web.start()
        detail_list = soup.select('div.details li')
        if len(detail_list) == 0:
            soup = bs(response.m_response.content, 'html5lib')
            detail_list = soup.select('div.details li')
        mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
        first_borad_date = detail_list[1].select('span')[0].text
        gear = detail_list[2].select('span')[0].text.split('/')[0]
        displacement = detail_list[2].select('span')[0].text.split('/')[1]
        price = soup.select('div.car-price ins')[0].text.replace('¥', '')
        crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        item = dict()
        item['car'] = car
        item['mileage'] = mileage
        item['first_borad_date'] = first_borad_date
        item['gear'] = gear
        item['displacement'] = displacement
        item['price'] = price
        item['crawl_date'] = crawl_date
        item['province'] = response.request.meta['province']
        item['city'] = response.request.meta['city']
        item['brand'] = response.request.meta['brand']
        item['cars_line'] = response.request.meta['cars_line']
        yield item


if __name__ == '__main__':
    SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()) \
        .set_pipeline(TextPipelineCar()).start()
    def test_car_processor(self):
        test_pipeline = TestPipeline()
        SpiderCore(Car_Processor(), test=True).set_pipeline(ConsolePipeline()) \
            .set_pipeline(test_pipeline).start()
        self.assertEqual(len(test_pipeline.result), 11,
                         'expected 11 fields in the crawled item')
    def test_test_processor(self):
        test_pipeline = TestPipeline()
        SpiderCore(TEST_Processor(), test=True).set_pipeline(ConsolePipeline(), 'console') \
            .set_pipeline(PicPipeline(), 'save').set_pipeline(test_pipeline, 'test').start()
        self.assertIn('2017', test_pipeline.result['date_time'])
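# The TestPipeline these tests rely on is defined elsewhere in the repo.
# Judging from the assertions above, it records the last item it receives
# in a result attribute; a minimal sketch consistent with that usage
# follows (process_item is an assumed hook name, and the real class may
# differ).
class TestPipeline(object):
    def __init__(self):
        self.result = dict()

    def process_item(self, item):
        # Keep the most recent item so assertions can inspect its fields.
        self.result = item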
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os

sys.path.append(os.getcwd())

from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.manager import manager
from sasila import system_web

if __name__ == '__main__':
    spider_car = SpiderCore(Car_Processor(), batch_size=100).set_pipeline(ConsolePipeline())
    spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())
    manager.set_spider(spider_car)
    manager.set_spider(spider_fang)
    system_web.start()