Exemplo n.º 1
0
from sasila.system_normal.pipeline.text_pipeline import TextPipelineBendibao

from sasila.system_normal.processor.base_processor import BaseProcessor
from sasila.system_normal.downloader.http.spider_request import Request
from sasila.system_normal.utils.decorator import checkResponse

import sys

# Python 2 only: force UTF-8 as the default string encoding so the Chinese
# text handled by the spiders below does not raise UnicodeDecodeError.
# (``sys`` was used here without being imported in this fragment.)
if sys.version_info < (3, 0):
    reload(sys)
    sys.setdefaultencoding('utf-8')

start_requests_temp = []

# Seed requests from city.txt; each line is "<base_url>,<city_name>".
# NOTE: Python 3's built-in open() has no ``name`` keyword (the parameter
# is ``file``), so the filename is passed positionally to stay 2/3-compatible.
with open('city.txt', mode='r') as fs:
    for line in fs:
        # Split once instead of re-splitting for each field.
        parts = line.strip().split(',')
        request_temp = Request(url=parts[0] + 'wangdian/', priority=0)
        request_temp.meta["city_name"] = parts[1]
        start_requests_temp.append(request_temp)


class Bendibao_Processor(BaseProcessor):
    spider_id = 'bendibao_spider'
    spider_name = 'bendibao_spider'
    allowed_domains = ['bendibao.com']
    start_requests = start_requests_temp

    @checkResponse
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        category1 = soup.select('div.navlink')
        for category in category1:
Exemplo n.º 2
0
class Fang_Processor(BaseProcessor):
    """Second-hand-house spider for fang.com.

    Flow: city index -> ``process`` (per-city) -> ``process_page_1``
    (per-district) -> ``process_page_2`` (paginated listings, yields one
    item dict per house).
    """

    spider_id = 'fang_spider'
    spider_name = 'fang_spider'
    allowed_domains = ['fang.com']
    start_requests = [
        Request(url='http://esf.gz.fang.com/newsecond/esfcities.aspx',
                priority=0)
    ]

    @checkResponse
    def process(self, response):
        """Parse the city index and queue one request per wanted city."""
        soup = bs(response.m_response.content, 'lxml')
        # Only cities in these provinces are crawled.
        province_list = {u'四川', u'江苏', u'江西', u'山东', u'广东', u'山西'}
        province_div_list = soup.select('div#c02 ul li')
        for province_div in province_div_list:
            province_name = province_div.select('strong')[0].text
            if province_name != '其他':
                if province_name in province_list:
                    city_list = province_div.select('a')
                    for city in city_list:
                        city_name = city.text
                        url = city['href']
                        request = Request(url=url,
                                          priority=1,
                                          callback=self.process_page_1)
                        request.meta['province'] = province_name
                        request.meta['city'] = city_name
                        yield request

    @checkResponse
    def process_page_1(self, response):
        """Parse a city page and queue one request per district."""
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        # First anchor is the "all districts" link -- skip it.
        district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse one listing page: yield an item per house, then follow
        the "next page" link if present."""
        soup = bs(response.m_response.content, 'lxml')
        avg_price_list = soup.select('div.newcardR dl')
        # The average price lives in the second <dl>; require at least two
        # entries before indexing [1] (the original ``> 0`` guard raised
        # IndexError when exactly one <dl> was present).
        if len(avg_price_list) > 1:
            avg_price = avg_price_list[1].select('dd b')[0].text
        else:
            avg_price = '未知'
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            # Entries without an estate name span are ads/placeholders.
            if len(detail.select('p.mt10 a span')) != 0:
                estate = detail.select('p.mt10 a span')[0].text
                area = detail.select('div.area p')[0].text.replace('㎡', '')
                layout = detail.select('p.mt12')[0].text.split('|')[0].strip()
                total_price = detail.select(
                    'div.moreInfo p.mt5 span.price')[0].text
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Exemplo n.º 3
0
    def get_all_page(self, response):
        """Work out how many result pages exist for this search, then
        queue a ``get_content`` request for every page.

        The page count is scraped from the pager: the sibling of the '>'
        (next) link's parent holds the last page number.  Any parsing
        failure falls back to a single page.
        """
        # NOTE(review): when m_response is missing, the retry request is
        # yielded but execution falls through to the '<script>' check,
        # which dereferences response.m_response (None) -- this likely
        # needs a ``return`` after the yield.
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
        if '<script>window.location.href=' in response.m_response.content:
            # Anti-bot redirect page: log the body and re-queue the request.
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
        else:
            soup = bs(response.m_response.content, "lxml")
            try:
                # The sibling after the '>' link's parent normally carries
                # the last page number.
                temp_page = soup.find(lambda tag: tag.name == 'a' and '>' ==
                                      tag.text).parent.findNextSibling()
                if temp_page:
                    page = temp_page.select_one("a")
                    if page:
                        # Strip the "..." ellipsis before the number.
                        total_page = int(page.string.strip().replace(
                            "...", ""))
                    else:
                        total_page = 1
                else:
                    # No following sibling: on the last pager layout the
                    # page count sits in the previous sibling instead.
                    temp_page = soup.find(
                        lambda tag: tag.name == 'a' and '>' == tag.text
                    ).parent.findPreviousSibling()
                    if temp_page:
                        page = temp_page.select_one("a")
                        if page:
                            total_page = int(page.string.strip().replace(
                                "...", ""))
                        else:
                            total_page = 1
                    else:
                        total_page = 1
            except:
                # Any pager-parsing failure: crawl just the first page.
                total_page = 1

            now_page = 1
            while now_page <= total_page:
                if response.request.meta["city_id"] == "":
                    # Province-wide search (no city filter).
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                        + response.request.meta["province_id"] + "&p=" +
                        str(now_page) + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                else:
                    # City-filtered search.
                    request = Request(
                        url=
                        "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                        + str(now_page) + "&province=" +
                        response.request.meta["province_id"] + "&city=" +
                        response.request.meta["city_id"] + "&",
                        callback="get_content",
                        priority=2)
                    request.meta["city_name"] = response.request.meta[
                        "city_name"]
                    request.meta["city_id"] = response.request.meta["city_id"]
                    request.meta["province_name"] = response.request.meta[
                        "province_name"]
                    request.meta["province_id"] = response.request.meta[
                        "province_id"]
                    yield request
                now_page += 1
Exemplo n.º 4
0
class Fang_Shop_Processor(BaseProcessor):
    """Shop-listing spider for shop.fang.com.

    Flow: hard-coded city index -> ``process`` (per-city) ->
    ``process_page_1`` (per-district) -> ``process_page_2`` (paginated
    listings, yields one item dict per shop).
    """

    spider_id = 'fang_shop_spider'
    spider_name = 'fang_shop_spider'
    allowed_domains = ['fang.com']
    start_requests = [Request(url='http://shop.fang.com', priority=0)]

    @checkResponse
    def process(self, response):
        """Queue one district-index request per city in the crawl list.

        The per-city links are hard-coded below rather than parsed from
        the live index; only cities in ``city_crawl_list`` are followed.
        """
        city_crawl_list = {u'成都', u'南京', u'苏州', u'无锡', u'南昌', u'济南', u'青岛', u'广州', u'东莞'}
        soup = bs('''<a href="http://shop1.fang.com/" style="width:40px;padding:4px 0 4px 8px;">北京</a>
                     <a href="http://shop.sh.fang.com/" style="width:40px;padding:4px 0 4px 8px;">上海</a>
                     <a href="http://shop.gz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">广州</a>
                     <a href="http://shop.sz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">深圳</a>
                     <a href="http://shop.tj.fang.com/" style="width:40px;padding:4px 0 4px 8px;">天津</a>
                     <a href="http://shop.cq.fang.com/" style="width:40px;padding:4px 0 4px 8px;">重庆</a>
                     <a href="http://shop.cd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">成都</a>
                     <a href="http://shop.suzhou.fang.com/" style="width:40px;padding:4px 0 4px 8px;">苏州</a>
                     <a href="http://shop.wuhan.fang.com/" style="width:40px;padding:4px 0 4px 8px;">武汉</a>
                     <a href="http://shop.xian.fang.com/" style="width:40px;padding:4px 0 4px 8px;">西安</a>
                     <a href="http://shop.dg.fang.com/" style="width:40px;padding:4px 0 4px 8px;">东莞</a>
                     <a href="http://shop.km.fang.com/" style="width:40px;padding:4px 0 4px 8px;">昆明</a>
                     <a href="http://shop.hz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">杭州</a>
                     <a href="http://shop.jn.fang.com/" style="width:40px;padding:4px 0 4px 8px;">济南</a>
                     <a href="http://shop.wuxi.fang.com/" style="width:40px;padding:4px 0 4px 8px;">无锡</a>
                     <a href="http://shop.zz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">郑州</a>
                     <a href="http://shop.nc.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南昌</a>
                     <a href="http://shop.qd.fang.com/" style="width:40px;padding:4px 0 4px 8px;">青岛</a>
                     <a href="http://shop.sjz.fang.com/" style="width:40px;padding:4px 0 4px 8px;">石家庄</a>
                     <a href="http://shop.nanjing.fang.com/" style="width:40px;padding:4px 0 4px 8px;">南京</a>
                     <a href="http://shop.dl.fang.com/" style="width:40px;padding:4px 0 4px 8px;">大连</a>''', 'lxml')
        city_list = soup.select('a')
        for city in city_list:
            city_name = city.text
            if city_name in city_crawl_list:
                url = city['href']
                request = Request(url=url, priority=1, callback=self.process_page_1)
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        """Parse a city page and queue one listing request per district."""
        soup = bs(response.m_response.content, 'lxml')
        district_list = soup.select('div.qxName a')
        # First anchor is the "all districts" link -- skip it.
        district_list.pop(0)
        for district in district_list:
            district_name = district.text
            url = response.request.url + district['href']
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = district_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse one listing page: yield an item per shop, then follow
        the "next page" link if present."""
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text
            temp_list = [temp.strip() for temp in detail_str.split('/')]

            # The type line comes in four layouts depending on whether it
            # mentions a mall (购物中心/百货) and/or a floor (层); the mall
            # marker itself contains '/', which shifts the split fields.
            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                # Unrecognized layout: skip this entry.  (The original fell
                # through and raised NameError on the unset fields below.)
                logger.error('unexpective detail_str: ' + detail_str.strip())
                continue

            area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '')
            total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date

            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url, priority=2, callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Exemplo n.º 5
0
class QccProcessor(BaseProcessor):
    """qichacha.com spider for companies matching the keyword 小额贷款.

    Flow: ``process`` (province picker) -> ``get_city`` (cities of a
    province) -> ``get_all_page`` (pagination) -> ``get_content`` (result
    rows as dicts).  Responses that are missing or contain the site's
    anti-bot redirect script are logged and re-queued for retry.
    """

    spider_id = 'qcc'
    spider_name = 'qcc'
    allowed_domains = ['qichacha.com']

    start_requests = [
        Request(
            url=
            'http://www.qichacha.com/search?key=%E5%B0%8F%E9%A2%9D%E8%B4%B7%E6%AC%BE'
        )
    ]

    @staticmethod
    def _inherit_meta(request, response):
        """Copy the four location meta fields from the parent request."""
        for key in ("city_name", "city_id", "province_name", "province_id"):
            request.meta[key] = response.request.meta[key]

    def process(self, response):
        """Parse the province picker and queue a city-list request per
        province."""
        if not response.m_response:
            # No body: log and retry.  The ``return`` keeps us from
            # dereferencing the missing response below (the original fell
            # through and raised AttributeError on None.content).
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            # Anti-bot redirect page: log the body and retry.
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        province_list = soup.select_one("dl#provinceOld").select(
            "div.pull-left")[1].select("dd a")
        for province in province_list:
            province_name = province.string.strip()
            province_id = province["data-value"].strip()
            request = Request(
                url="http://www.qichacha.com/search_getCityListHtml?province="
                + province_id + "&q_type=1",
                callback="get_city",
                priority=0)
            request.meta["province_name"] = province_name
            request.meta["province_id"] = province_id
            yield request

    def get_city(self, response):
        """Queue ``get_all_page`` searches: one province-wide search when
        the province has no city breakdown, otherwise one per city."""
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        if response.m_response.content == "":
            # Empty body: no per-city breakdown; search the whole province.
            request = Request(
                url=
                "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                + response.request.meta["province_id"] + "&",
                callback="get_all_page",
                priority=1)
            request.meta["city_name"] = ""
            request.meta["city_id"] = ""
            request.meta["province_name"] = response.request.meta[
                "province_name"]
            request.meta["province_id"] = response.request.meta["province_id"]
            yield request
        else:
            soup = bs(response.m_response.content, "lxml")
            for city in soup.select("a"):
                city_name = city.string.strip()
                city_id = city["data-value"].strip()
                request = Request(
                    url=
                    "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                    + response.request.meta["province_id"] + "&city=" +
                    city_id + "&",
                    callback="get_all_page",
                    priority=1)
                request.meta["city_name"] = city_name
                request.meta["city_id"] = city_id
                request.meta["province_name"] = response.request.meta[
                    "province_name"]
                request.meta["province_id"] = response.request.meta[
                    "province_id"]
                yield request

    @staticmethod
    def _parse_total_page(soup):
        """Extract the last page number from the pager; default to 1.

        The sibling of the '>' (next) link's parent holds the last page
        number; on some layouts it is the previous sibling instead.
        """
        try:
            temp_page = soup.find(lambda tag: tag.name == 'a' and '>' ==
                                  tag.text).parent.findNextSibling()
            if not temp_page:
                temp_page = soup.find(
                    lambda tag: tag.name == 'a' and '>' == tag.text
                ).parent.findPreviousSibling()
            if temp_page:
                page = temp_page.select_one("a")
                if page:
                    # Strip the "..." ellipsis before the number.
                    return int(page.string.strip().replace("...", ""))
            return 1
        except Exception:  # narrowed from a bare except
            return 1

    def get_all_page(self, response):
        """Work out the page count, then queue one ``get_content``
        request per result page."""
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        total_page = self._parse_total_page(soup)

        now_page = 1
        while now_page <= total_page:
            if response.request.meta["city_id"] == "":
                # Province-wide search (no city filter).
                request = Request(
                    url=
                    "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                    + response.request.meta["province_id"] + "&p=" +
                    str(now_page) + "&",
                    callback="get_content",
                    priority=2)
            else:
                # City-filtered search.
                request = Request(
                    url=
                    "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                    + str(now_page) + "&province=" +
                    response.request.meta["province_id"] + "&city=" +
                    response.request.meta["city_id"] + "&",
                    callback="get_content",
                    priority=2)
            self._inherit_meta(request, response)
            yield request
            now_page += 1

    def get_content(self, response):
        """Parse a result page and yield one company dict per table row."""
        if not response.m_response:
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            logger.error(response.m_response.content + "\n" +
                         response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        content_list = soup.select("table.m_srchList tbody tr")
        for content in content_list:
            try:
                cells = content.select("td")
                # The second cell holds four lines:
                # name / legal person / phone / address.
                info_lines = cells[1].text.split('\n')
                result_item = dict()
                result_item["province"] = response.request.meta[
                    "province_name"]
                result_item["city"] = response.request.meta["city_name"]
                result_item["company_name"] = info_lines[0].strip()
                result_item["company_man"] = info_lines[1].strip().replace(
                    "企业法人:", "")
                result_item["company_telephone"] = info_lines[2].strip(
                ).replace("联系方式:", "")
                result_item["company_address"] = info_lines[3].strip()
                if "地址:" in result_item["company_address"]:
                    result_item["company_address"] = result_item[
                        "company_address"].replace("地址:", "")
                else:
                    result_item["company_address"] = ""
                result_item["company_registered_capital"] = cells[
                    2].text.strip()
                result_item["company_registered_time"] = cells[3].text.strip()
                result_item["company_status"] = cells[4].text.strip()
                result_item["source"] = "企查查"
                result_item["update_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                yield result_item
            except Exception:
                # Malformed row: dump the traceback and keep going.
                # print() works on both Python 2 and 3; the original
                # py2-only ``print`` statement broke the file under Python 3.
                print(traceback.format_exc())
Exemplo n.º 6
0
 def get_city(self, response):
     """Parse the city list for a province and queue ``get_all_page``
     searches -- one province-wide search when no cities come back,
     otherwise one search per city."""
     # NOTE(review): when m_response is missing, the retry request is
     # yielded but execution falls through to the next check, which
     # dereferences response.m_response (None) -- this likely needs a
     # ``return`` after the yield.
     if not response.m_response:
         logger.error(response.request.url)
         yield response.request
     if '<script>window.location.href=' in response.m_response.content:
         # Anti-bot redirect page: log the body and re-queue the request.
         logger.error(response.m_response.content + "\n" +
                      response.request.url)
         yield response.request
     if response.m_response.content == "":
         # Empty body: no per-city breakdown; search the whole province.
         request = Request(
             url=
             "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
             + response.request.meta["province_id"] + "&",
             callback="get_all_page",
             priority=1)
         request.meta["city_name"] = ""
         request.meta["city_id"] = ""
         request.meta["province_name"] = response.request.meta[
             "province_name"]
         request.meta["province_id"] = response.request.meta["province_id"]
         yield request
     else:
         soup = bs(response.m_response.content, "lxml")
         city_list = soup.select("a")
         for city in city_list:
             city_name = city.string.strip()
             city_id = city["data-value"].strip()
             request = Request(
                 url=
                 "http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                 + response.request.meta["province_id"] + "&city=" +
                 city_id + "&",
                 callback="get_all_page",
                 priority=1)
             request.meta["city_name"] = city_name
             request.meta["city_id"] = city_id
             request.meta["province_name"] = response.request.meta[
                 "province_name"]
             request.meta["province_id"] = response.request.meta[
                 "province_id"]
             yield request
Exemplo n.º 7
0
 def download_pic(self, response):
     """Pull the main image URL out of the page and queue its download."""
     if not response.m_response:
         return
     page = bs(response.m_response.content, "lxml")
     image = page.select_one("div.main-image img")
     yield Request(url=image.attrs["src"], callback=self.download, priority=3)
Exemplo n.º 8
0
 def process_page_3(self, response):
     """Listing page: queue one detail request per car, then follow the
     next-page link when present."""
     soup = bs(response.m_response.content, 'lxml')
     meta_keys = ('province', 'city', 'brand', 'cars_line')
     for anchor in soup.select('div#a2 ul#viewlist_ul li a.carinfo'):
         detail_req = Request(url='http://www.che168.com' + anchor['href'],
                              priority=4,
                              callback=self.process_page_4)
         for key in meta_keys:
             detail_req.meta[key] = response.request.meta[key]
         yield detail_req
     pager = soup.find(
         lambda tag: tag.name == 'a' and '下一页' in tag.text)
     if pager:
         follow_req = Request(url='http://www.che168.com' + pager['href'],
                              priority=3,
                              callback=self.process_page_3)
         for key in meta_keys:
             follow_req.meta[key] = response.request.meta[key]
         yield follow_req
Exemplo n.º 9
0
class Car_Processor(BaseProcessor):
    """Used-car spider for che168.com.

    Flow: province/city index -> brand list (JSON) -> car-series pages ->
    paginated listing pages -> per-car detail pages, yielding item dicts.
    """
    spider_id = 'car_spider'
    spider_name = 'car_spider'
    allowed_domains = ['che168.com']
    start_requests = [Request(url='http://www.che168.com', priority=0)]

    @checkResponse
    def process(self, response):
        """Parse the city picker and queue one brand-list request per city."""
        soup = bs(response.m_response.content, 'lxml')
        province_div_list = soup.select(
            'div.city-list div.cap-city > div.fn-clear')
        for province_div in province_div_list:
            province_name = province_div.select('span.capital a')[0].text
            city_list = province_div.select('div.city a')
            for city in city_list:
                city_name = city.text
                # The city slug (pinyin) is the first path segment of the link.
                pinyin = city['href'].strip('/').split('/')[0]
                request = Request(
                    url=
                    'http://www.che168.com/handler/usedcarlistv5.ashx?action=brandlist&area=%s'
                    % pinyin,
                    priority=1,
                    callback=self.process_page_1)
                request.meta['province'] = province_name
                request.meta['city'] = city_name
                yield request

    @checkResponse
    def process_page_1(self, response):
        """Decode the brand-list JSON and queue one request per brand."""
        # This endpoint answers in GB2312, not UTF-8.
        brand_list = list(
            json.loads(response.m_response.content.decode('gb2312')))
        for brand in brand_list:
            brand_dict = dict(brand)
            brand_name = brand_dict['name']
            url = response.nice_join(brand_dict['url']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = brand_name
            yield request

    @checkResponse
    def process_page_2(self, response):
        """Parse a brand page and queue one request per car series."""
        soup = bs(response.m_response.content, 'lxml')
        cars_line_list = soup.select(
            'div#series div.content-area dl.model-list dd a')
        for cars_line in cars_line_list:
            cars_line_name = cars_line.text
            url = 'http://www.che168.com' + cars_line['href']
            request = Request(url=url,
                              priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = cars_line_name
            yield request

    @checkResponse
    def process_page_3(self, response):
        """Listing page: queue detail requests, then follow pagination."""
        soup = bs(response.m_response.content, 'lxml')
        car_info_list = soup.select('div#a2 ul#viewlist_ul li a.carinfo')
        for car_info in car_info_list:
            url = 'http://www.che168.com' + car_info['href']
            request = Request(url=url,
                              priority=4,
                              callback=self.process_page_4)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request
        next_page = soup.find(
            lambda tag: tag.name == 'a' and '下一页' in tag.text)
        if next_page:
            url = 'http://www.che168.com' + next_page['href']
            request = Request(url=url,
                              priority=3,
                              callback=self.process_page_3)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['brand'] = response.request.meta['brand']
            request.meta['cars_line'] = response.request.meta['cars_line']
            yield request

    @checkResponse
    def process_page_4(self, response):
        """Parse one car detail page and yield the item dict."""
        # Pages are GB2312-encoded; undecodable bytes are dropped.
        soup = bs(response.m_response.content.decode('gb2312', 'ignore'),
                  'lxml')
        # Dead listings redirect with this stub body (no div.car-title),
        # which the check below skips:
        # <html><head><title>Object moved</title></head><body>
        # <h2>Object moved to <a href="/CarDetail/wrong.aspx?errorcode=5&amp;backurl=/&amp;infoid=21415515">here</a>.</h2>
        # </body></html>
        if len(soup.select('div.car-title h2')) != 0:
            car = soup.select('div.car-title h2')[0].text
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                # lxml found no detail cells; retry with the more
                # tolerant html5lib parser.
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_borad_date = detail_list[1].select('span')[0].text
            # Third detail cell is "<gear>/<displacement>".
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_borad_date'] = first_borad_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item
Exemplo n.º 10
0
 def get_page_content(self, response):
     """Queue one picture-set request per thumbnail on a listing page."""
     if not response.m_response:
         return
     listing = bs(response.m_response.content, 'lxml')
     for entry in listing.select("div.postlist ul#pins li"):
         link = entry.select_one("a")
         yield Request(url=link.attrs["href"], callback=self.get_pic, priority=1)
Exemplo n.º 11
0
 def process(self, response):
     """Read the pager's last page number and queue every listing page."""
     if not response.m_response:
         return
     doc = bs(response.m_response.content, "lxml")
     last_anchor = doc.select_one("a.next.page-numbers").find_previous_sibling()
     page_count = int(last_anchor.text)
     page = 1
     while page <= page_count:
         yield Request(url="http://www.mzitu.com/xinggan/page/" + str(page), callback=self.get_page_content)
         page += 1