from sasila.system_normal.pipeline.text_pipeline import TextPipelineBendibao from sasila.system_normal.processor.base_processor import BaseProcessor from sasila.system_normal.downloader.http.spider_request import Request from sasila.system_normal.utils.decorator import checkResponse if sys.version_info < (3, 0): reload(sys) sys.setdefaultencoding('utf-8') start_requests_temp = [] with open(name='city.txt', mode='r') as fs: lines = fs.readlines() for line in lines: request_temp = Request(url=line.strip().split(',')[0] + 'wangdian/', priority=0) request_temp.meta["city_name"] = line.strip().split(',')[1] start_requests_temp.append(request_temp) class Bendibao_Processor(BaseProcessor): spider_id = 'bendibao_spider' spider_name = 'bendibao_spider' allowed_domains = ['bendibao.com'] start_requests = start_requests_temp @checkResponse def process(self, response): soup = bs(response.m_response.content, 'lxml') category1 = soup.select('div.navlink') for category in category1:
class Fang_Processor(BaseProcessor):
    """Spider for second-hand-housing (ESF) listings on fang.com.

    Crawl path: national city index -> city ESF site -> district listing
    pages (paginated). Yields one dict per listing with price, layout,
    area and location fields.
    """

    spider_id = 'fang_spider'
    spider_name = 'fang_spider'
    allowed_domains = ['fang.com']
    start_requests = [
        Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx',
                priority=0)
    ]

    @checkResponse
    def process(self, response):
        """Parse the city index; queue one request per city belonging to a
        whitelisted province."""
        soup = bs(response.m_response.content, 'lxml')
        # Only these provinces are crawled.
        province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
        province_div_list = soup.select('div#c02 ul li')
        for province_div in province_div_list:
            province_name = province_div.select('strong')[0].text
            # '其他' ("other") is a catch-all group, not a real province.
            if province_name != '其他' and province_name in province_list:
                for city in province_div.select('a'):
                    request = Request(url=city['href'], priority=1,
                                      callback=self.process_page_1)
                    request.meta['province'] = province_name
                    request.meta['city'] = city.text
                    yield request

    @checkResponse
    def process_page_1(self, response):
        """Parse a city landing page; queue one listing request per district."""
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        # The first link is the "all districts" filter, not a district.
        # Guard the pop: an empty selection previously raised IndexError.
        if district_list:
            district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url, priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse a district listing page; yield one item per listing and
        follow the "next page" link recursively."""
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        # Bug fix: the average price lives in the SECOND dl (index 1), so at
        # least two elements are required; the old `> 0` check raised
        # IndexError on pages with a single dl.
        if len(avg_price_list) > 1:
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            # Entries without an estate-name span are ads/placeholders.
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select(
                    'div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date
                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item
        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url, priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
def get_all_page(self, response):
    """Determine the number of result pages for the current province/city
    search and queue one ``get_content`` request per page.

    On a missing response or an anti-bot JS redirect the original request
    is re-yielded for retry.
    """
    # Bug fix: the original fell through after yielding the retry request
    # and then dereferenced the missing response body; return instead.
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    # A JS redirect in the body means the crawler was blocked; retry.
    if '<script>window.location.href=' in response.m_response.content:
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
        return
    soup = bs(response.m_response.content, "lxml")
    try:
        # The pager marks the current position with an <a> whose text is
        # '>'; the sibling after (or, failing that, before) its parent
        # carries the last visible page number.
        marker = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text)
        temp_page = marker.parent.findNextSibling()
        if not temp_page:
            temp_page = marker.parent.findPreviousSibling()
        if temp_page:
            page = temp_page.select_one("a")
            if page:
                total_page = int(page.string.strip().replace("...", ""))
            else:
                total_page = 1
        else:
            total_page = 1
    except Exception:
        # Pager layout changed or absent: assume a single page.
        total_page = 1
    for now_page in range(1, total_page + 1):
        if response.request.meta["city_id"] == "":
            # Province-wide search (no city breakdown available).
            url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                   + response.request.meta["province_id"]
                   + "&p=" + str(now_page) + "&")
        else:
            url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                   + str(now_page) + "&province="
                   + response.request.meta["province_id"]
                   + "&city=" + response.request.meta["city_id"] + "&")
        request = Request(url=url, callback="get_content", priority=2)
        # Propagate location context to the content parser.
        for key in ("city_name", "city_id", "province_name", "province_id"):
            request.meta[key] = response.request.meta[key]
        yield request
class Fang_Shop_Processor(BaseProcessor):
    """Spider for commercial-shop listings on shop.fang.com.

    Uses a hard-coded city-link snippet (the live page's city menu) and
    crawls the whitelisted cities: city site -> district listing pages
    (paginated). Yields one dict per shop listing.
    """

    spider_id = 'fang_shop_spider'
    spider_name = 'fang_shop_spider'
    allowed_domains = ['fang.com']
    start_requests = [Request(url='http://shop.fang.com', priority=0)]

    @checkResponse
    def process(self, response):
        """Queue one request per whitelisted city, taken from the embedded
        city-menu HTML (the live response body is not parsed)."""
        city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南',
                           u'青岛', u'广州', u'东莞'}
        soup = bs('''<a href="http://shop1.fang.com/" style="width:40px;padding:4px 0 4px 8px;">北京</a>
<a href="http://shop.sh.fang.com/" style="width:40px;padding:4px 0 4px 8px;">上海</a>
<a href="http://shop.gz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">广州</a>
<a href="http://shop.sz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">深圳</a>
<a href="http://shop.tj.fang.com/" style="width:40px;padding:4px 0 4px 8px;">天津</a>
<a href="http://shop.cq.fang.com/" style="width:40px;padding:4px 0 4px 8px;">重庆</a>
<a href="http://shop.cd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">成都</a>
<a href="http://shop.suzhou.fang.com/" style="width:40px;padding:4px 0 4px 8px;">苏州</a>
<a href="http://shop.wuhan.fang.com/" style="width:40px;padding:4px 0 4px 8px;">武汉</a>
<a href="http://shop.xian.fang.com/" style="width:40px;padding:4px 0 4px 8px;">西安</a>
<a href="http://shop.dg.fang.com/" style="width:40px;padding:4px 0 4px 8px;">东莞</a>
<a href="http://shop.km.fang.com/" style="width:40px;padding:4px 0 4px 8px;">昆明</a>
<a href="http://shop.hz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">杭州</a>
<a href="http://shop.jn.fang.com/" style="width:40px;padding:4px 0 4px 8px;">济南</a>
<a href="http://shop.wuxi.fang.com/" style="width:40px;padding:4px 0 4px 8px;">无锡</a>
<a href="http://shop.zz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">郑州</a>
<a href="http://shop.nc.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南昌</a>
<a href="http://shop.qd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">青岛</a>
<a href="http://shop.sjz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">石家庄</a>
<a href="http://shop.nanjing.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南京</a>
<a href="http://shop.dl.fang.com/" style="width:40px;padding:4px 0 4px 8px;">大连</a>''', 'lxml')
        city_links = soup.select('a')
        for city in city_links:
            city_name = city.text
            if city_name in city_crawl_list:
                request = Request(url=city['href'], priority=1,
                                  callback=self.process_page_1)
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        """Parse a city shop page; queue one listing request per district."""
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        # First link is the "all districts" filter; guard the pop so an
        # empty selection no longer raises IndexError.
        if district_list:
            district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url, priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse a district shop-listing page; yield one item per listing
        and follow the "next page" link recursively."""
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text
            temp_list = detail.select('p.mt10')[0].text.split('/')
            temp_list = [temp.strip() for temp in temp_list]
            # The description line has four shapes depending on whether the
            # type contains a slash ('购物中心/百货') and whether a floor
            # ('层') is listed; the slash shifts every index by one.
            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                logger.error('unexpective detail_str: ' + detail_str.strip())
                # Bug fix: without this continue the code fell through and
                # raised NameError on the undefined m_type/floor variables.
                continue
            area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '')
            total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date
            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item
        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url, priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
class QccProcessor(BaseProcessor):
    """Spider for qichacha.com company search (keyword: 小额贷款 /
    "micro-loan", URL-encoded in the request strings).

    Crawl path: search page -> province list -> city list per province ->
    page count per search -> one dict per company row.
    """

    spider_id = 'qcc'
    spider_name = 'qcc'
    allowed_domains = ['qichacha.com']
    start_requests = [
        Request(
            url='http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE'
        )
    ]

    def _blocked(self, response):
        """Return True (after logging) when the response is missing or is
        an anti-bot JS redirect; callers should retry and stop parsing."""
        if not response.m_response:
            logger.error(response.request.url)
            return True
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" + response.request.url)
            return True
        return False

    def _copy_meta(self, request, response, keys):
        """Propagate the given meta keys from the parent request."""
        for key in keys:
            request.meta[key] = response.request.meta[key]

    def _total_pages(self, soup):
        """Best-effort extraction of the last page number from the pager;
        defaults to 1 when the pager is absent or cannot be parsed."""
        try:
            # The pager marks the current position with an <a> whose text
            # is '>'; the sibling after (or before) its parent holds the
            # last visible page number, possibly prefixed with '...'.
            marker = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text)
            temp_page = marker.parent.findNextSibling()
            if not temp_page:
                temp_page = marker.parent.findPreviousSibling()
            if temp_page:
                page = temp_page.select_one("a")
                if page:
                    return int(page.string.strip().replace("...", ""))
            return 1
        except Exception:
            # Narrowed from a bare except; any parse failure means "1 page".
            return 1

    def process(self, response):
        """Parse the province filter; queue one city-list request per
        province."""
        # Bug fix: the original yielded the retry request and then fell
        # through to parse a possibly-missing body; return instead.
        if self._blocked(response):
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        province_list = soup.select_one("dl#provinceOld").select(
            "div.pull-left")[1].select("dd a")
        for province in province_list:
            province_name = province.string.strip()
            province_id = province["data-value"].strip()
            request = Request(
                url="http://www.qichacha.com/search_getCityListHtml?province="
                + province_id + "&q_type=1",
                callback="get_city",
                priority=0)
            request.meta["province_name"] = province_name
            request.meta["province_id"] = province_id
            yield request

    def get_city(self, response):
        """Queue page-count requests: one province-wide request when the
        province has no city breakdown, otherwise one per city."""
        if self._blocked(response):
            yield response.request
            return
        if response.m_response.content == "":
            # Empty body: province without a city list; search by province.
            request = Request(
                url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                + response.request.meta["province_id"] + "&",
                callback="get_all_page",
                priority=1)
            request.meta["city_name"] = ""
            request.meta["city_id"] = ""
            self._copy_meta(request, response,
                            ("province_name", "province_id"))
            yield request
        else:
            soup = bs(response.m_response.content, "lxml")
            for city in soup.select("a"):
                city_name = city.string.strip()
                city_id = city["data-value"].strip()
                request = Request(
                    url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                    + response.request.meta["province_id"]
                    + "&city=" + city_id + "&",
                    callback="get_all_page",
                    priority=1)
                request.meta["city_name"] = city_name
                request.meta["city_id"] = city_id
                self._copy_meta(request, response,
                                ("province_name", "province_id"))
                yield request

    def get_all_page(self, response):
        """Determine the number of result pages and queue one get_content
        request per page of the current province/city search."""
        if self._blocked(response):
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        total_page = self._total_pages(soup)
        for now_page in range(1, total_page + 1):
            if response.request.meta["city_id"] == "":
                url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                       + response.request.meta["province_id"]
                       + "&p=" + str(now_page) + "&")
            else:
                url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                       + str(now_page) + "&province="
                       + response.request.meta["province_id"]
                       + "&city=" + response.request.meta["city_id"] + "&")
            request = Request(url=url, callback="get_content", priority=2)
            self._copy_meta(request, response,
                            ("city_name", "city_id",
                             "province_name", "province_id"))
            yield request

    def get_content(self, response):
        """Parse one result page; yield a dict per company row."""
        if self._blocked(response):
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        for content in soup.select("table.m_srchList tbody tr"):
            try:
                cells = content.select("td")
                # Second cell holds name / legal person / phone / address,
                # one per line.
                info_lines = cells[1].text.split('\n')
                result_item = dict()
                result_item["province"] = response.request.meta["province_name"]
                result_item["city"] = response.request.meta["city_name"]
                result_item["company_name"] = info_lines[0].strip()
                result_item["company_man"] = info_lines[1].strip().replace("企业法人:", "")
                result_item["company_telephone"] = info_lines[2].strip().replace("联系方式:", "")
                result_item["company_address"] = info_lines[3].strip()
                if "地址:" in result_item["company_address"]:
                    result_item["company_address"] = result_item[
                        "company_address"].replace("地址:", "")
                else:
                    # Fourth line is not an address; discard it.
                    result_item["company_address"] = ""
                result_item["company_registered_capital"] = cells[2].text.strip()
                result_item["company_registered_time"] = cells[3].text.strip()
                result_item["company_status"] = cells[4].text.strip()
                result_item["source"] = "企查查"
                result_item["update_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                yield result_item
            except Exception:
                # Replaces the Python-2-only `print` statement; a malformed
                # row is logged and skipped rather than aborting the page.
                logger.error(traceback.format_exc())
def get_city(self, response):
    """Queue page-count requests for a province: one province-wide request
    when qichacha returns no city list (empty body), otherwise one request
    per city link in the returned HTML fragment.

    On a missing response or an anti-bot JS redirect the original request
    is re-yielded for retry.
    """
    # Bug fix: the original yielded the retry request and then fell through
    # to dereference the missing response body; return instead.
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    if '<script>window.location.href=' in response.m_response.content:
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
        return
    if response.m_response.content == "":
        # Empty body: the province has no city breakdown; search by
        # province id only.
        request = Request(
            url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
            + response.request.meta["province_id"] + "&",
            callback="get_all_page",
            priority=1)
        request.meta["city_name"] = ""
        request.meta["city_id"] = ""
        request.meta["province_name"] = response.request.meta["province_name"]
        request.meta["province_id"] = response.request.meta["province_id"]
        yield request
    else:
        soup = bs(response.m_response.content, "lxml")
        for city in soup.select("a"):
            city_name = city.string.strip()
            city_id = city["data-value"].strip()
            request = Request(
                url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                + response.request.meta["province_id"]
                + "&city=" + city_id + "&",
                callback="get_all_page",
                priority=1)
            request.meta["city_name"] = city_name
            request.meta["city_id"] = city_id
            request.meta["province_name"] = response.request.meta["province_name"]
            request.meta["province_id"] = response.request.meta["province_id"]
            yield request
def download_pic(self, response):
    """Pull the main image URL out of a detail page and queue it for
    download; a missing response is silently skipped (best-effort)."""
    if not response.m_response:
        return
    page = bs(response.m_response.content, "lxml")
    img_src = page.select_one("div.main-image img").attrs["src"]
    yield Request(url=img_src, callback=self.download, priority=3)
def process_page_3(self, response):
    """Parse one car-listing page: queue a detail request per car and a
    request for the next listing page when a pager link exists."""
    meta_keys = ('province', 'city', 'brand', 'cars_line')

    def build_request(href, priority, callback):
        # Every queued request carries the same location/brand context.
        req = Request(url='http://www.che168.com' + href,
                      priority=priority, callback=callback)
        for key in meta_keys:
            req.meta[key] = response.request.meta[key]
        return req

    soup = bs(response.m_response.content, 'lxml')
    for car_info in soup.select('div#a2 ul#viewlist_ul li a.carinfo'):
        yield build_request(car_info['href'], 4, self.process_page_4)

    next_page = soup.find(
        lambda tag: tag.name == 'a' and '下一页' in tag.text)
    if next_page:
        yield build_request(next_page['href'], 3, self.process_page_3)
class Car_Processor(BaseProcessor):
    """Spider for used-car listings on che168.com.

    Crawl path: home page -> per-city brand list (JSON endpoint) ->
    brand page -> cars-line listing pages (paginated) -> car detail
    pages. Yields one dict per car.
    """

    spider_id = 'car_spider'
    spider_name = 'car_spider'
    allowed_domains = ['che168.com']
    start_requests = [Request(url='http://www.che168.com', priority=0)]

    @checkResponse
    def process(self, response):
        """Parse the city menu on the home page; queue the brand-list JSON
        endpoint for every city."""
        soup = bs(response.m_response.content, 'lxml')
        province_div_list = soup.select(
            'div.city-list div.cap-city > div.fn-clear')
        for province_div in province_div_list:
            province_name = province_div.select('span.capital a')[0].text
            city_list = province_div.select('div.city a')
            for city in city_list:
                city_name = city.text
                # City slug is the first path segment of its link,
                # e.g. '/beijing/...' -> 'beijing'.
                pinyin = city['href'].strip('/').split('/')[0]
                request = Request(
                    url=
                    'http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s'
                    % pinyin,
                    priority=1,
                    callback=self.process_page_1)
                request.meta['province'] = province_name
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        """Parse the brand-list JSON (GB2312-encoded per the decode call);
        queue one request per brand."""
        brand_list = list(
            json.loads(response.m_response.content.decode('gb2312')))
        for brand in brand_list:
            brand_dict = dict(brand)
            brand_name = brand_dict['name']
            url = response.nice_join(brand_dict['url']) + '/'
            request = Request(url=url, priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = brand_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse a brand page; queue one listing request per cars-line
        (model series)."""
        soup = bs(response.m_response.content, 'lxml')
        cars_line_list = soup.select(
            'div#series div.content-area dl.model-list dd a')
        for cars_line in cars_line_list:
            cars_line_name = cars_line.text
            url = 'http://www.che168.com' + cars_line['href']
            request = Request(url=url, priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = cars_line_name
            yield request

    @checkResponse
    def process_page_3(self, response):
        """Parse a listing page; queue one detail request per car and
        follow the '下一页' (next page) link when present."""
        soup = bs(response.m_response.content, 'lxml')
        car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
        for car_info in car_info_list:
            url = 'http://www.che168.com' + car_info['href']
            request = Request(url=url, priority=4,
                              callback=self.process_page_4)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request
        next_page = soup.find(
            lambda tag: tag.name == 'a' and '下一页' in tag.text)
        if next_page:
            url = 'http://www.che168.com' + next_page['href']
            request = Request(url=url, priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request

    @checkResponse
    def process_page_4(self, response):
        """Parse a car detail page; yield one item with mileage, first
        registration date, gearbox, displacement and price."""
        # 'ignore' drops undecodable bytes; the page is GB2312-encoded.
        soup = bs(response.m_response.content.decode('gb2312', 'ignore'),
                  'lxml')
        # A removed/invalid listing returns this redirect stub instead of
        # a detail page (hence the title check below):
        # <html><head><title>Object moved</title></head><body>
        # <h2>Object moved to <a href="/CarDetail/wrong.aspx?errorcode=5&backurl=/&infoid=21415515">here</a>.</h2>
        # </body></html>
        if len(soup.select('div.car-title h2')) != 0:
            car = soup.select('div.car-title h2')[0].text
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                # lxml occasionally yields no detail nodes; re-parse with
                # the more tolerant html5lib parser.
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            # detail_list is positional: 0=mileage, 1=first registration,
            # 2=gearbox/displacement ('/'-separated).
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_borad_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_borad_date'] = first_borad_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date
            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item
def get_page_content(self, response):
    """Parse one index page of the post list and queue a picture-page
    request per post; a missing response is silently skipped."""
    if not response.m_response:
        return
    page = bs(response.m_response.content, 'lxml')
    for post in page.select("div.postlist ul#pins li"):
        post_url = post.select_one("a").attrs["href"]
        yield Request(url=post_url, callback=self.get_pic, priority=1)
def process(self, response):
    """Read the last page number from the pager and queue every index
    page of the category; a missing response is silently skipped."""
    if not response.m_response:
        return
    soup = bs(response.m_response.content, "lxml")
    # The element immediately before the "next" arrow holds the highest
    # page number.
    last_page_tag = soup.select_one("a.next.page-numbers").find_previous_sibling()
    total_page = int(last_page_tag.text)
    page = 1
    while page <= total_page:
        yield Request(url="http://www.mzitu.com/xinggan/page/" + str(page),
                      callback=self.get_page_content)
        page += 1