Example #1
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    spider = SpiderCore(Fang_Processor(),
                        test=True).set_pipeline(ConsolePipeline()).start()
Example #2
                crawl_date = time.strftime('%Y-%m-%d',
                                           time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    spider = RequestSpider(Fang_Processor()).set_pipeline(
        ConsolePipeline()).set_pipeline(TextPipelineFang()).start()
Example #3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))
from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.request_spider import RequestSpider
from sasila.system_normal.manager import manager
import sasila

spider_car = RequestSpider(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = RequestSpider(Fang_Processor()).set_pipeline(ConsolePipeline())
manager.set_spider(spider_car)
manager.set_spider(spider_fang)
sasila.start()
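Unlike the single-spider examples, this entry point registers both spiders with Sasila's manager and then starts the framework as a whole, so one process hosts both crawlers.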
Example #4
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @checkResponse
    def process_pic(self, response):
        result = response.m_response.content
        yield pipeItem(['save'], result)

    @checkResponse
    def process_detail(self, response):
        soup = bs(response.m_response.content, 'lxml')

        # The byline reads "来源: <source> <date> <time>|…"; strip the
        # "来源" ("source") label, then split out source and date/time.
        dd_tail = soup.select('div.zxxwleft p.zxxw2')[0].text.replace(
            '来源: ', '').replace('来源:', '').split(' ')
        date_time = dd_tail[1].strip() + ' ' + dd_tail[2].strip().replace(
            '|', '')
        newsFrom = dd_tail[0].strip()

        result = dict()
        result['date_time'] = date_time
        result['newsFrom'] = newsFrom

        yield pipeItem(['console', 'test'], result)


if __name__ == '__main__':
    SpiderCore(TEST_Processor()).set_pipeline(ConsolePipeline(), 'console') \
        .set_pipeline(PicPipeline(), 'save') \
        .set_pipeline(TestPipeline(), 'test').start()
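Here each pipeline is registered under a name ('console', 'save', 'test'), and the processor routes output with pipeItem: the raw image bytes yielded by process_pic go only to the 'save' pipeline, while the parsed fields from process_detail go to both 'console' and 'test'.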
Example #5
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            # Mileage on the page is given in 万公里 (units of 10,000 km).
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_board_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_board_date'] = first_board_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item


if __name__ == '__main__':
    spider = RequestSpider(Car_Processor()).set_pipeline(
        ConsolePipeline()).set_pipeline(TextPipelineCar()).start()
Example #6
                result_item["company_man"] = content.select(
                    "td")[1].text.split('\n')[1].strip().replace("企业法人:", "")
                result_item["company_telephone"] = content.select(
                    "td")[1].text.split('\n')[2].strip().replace("联系方式:", "")
                result_item["company_address"] = content.select(
                    "td")[1].text.split('\n')[3].strip()
                if "地址:" in result_item["company_address"]:
                    result_item["company_address"] = result_item[
                        "company_address"].replace("地址:", "")
                else:
                    result_item["company_address"] = ""
                result_item["company_registered_capital"] = content.select(
                    "td")[2].text.strip()
                result_item["company_registered_time"] = content.select(
                    "td")[3].text.strip()
                result_item["company_status"] = content.select(
                    "td")[4].text.strip()
                result_item["source"] = "企查查"
                result_item["update_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                yield result_item
            except Exception:
                print(traceback.format_exc())


qcc_spider = RequestSpider(QccProcessor(), time_sleep=1).set_pipeline(
    KafkaPipeline()).set_pipeline(TextPipeline()).set_pipeline(
        ConsolePipeline())
if __name__ == '__main__':
    qcc_spider.start()
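Note that qcc_spider is assembled at module level, so the configured spider can be imported from other modules, while start() itself stays behind the __main__ guard.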
Example #7
                # "电话" means "telephone"; strip the label, keep the number.
                result_mobile = result.find(
                    lambda tag: tag.name == 'p' and '电话:' in tag.text).text
                m_result = dict()
                m_result['result_name'] = result_name
                m_result['result_mobile'] = result_mobile.replace('电话:', '')
                m_result['city_name'] = response.request.meta['city_name']
                m_result['category1_name'] = response.request.meta[
                    'category1_name']
                m_result['category2_name'] = response.request.meta[
                    'category2_name']
                yield m_result
            # "下一页" means "next page".
            next_page = soup.find(
                lambda tag: tag.name == 'a' and '下一页' in tag.text)
            if next_page:
                url_splits = response.request.url.split('/')
                url_splits[-1] = next_page['href']
                url = '/'.join(url_splits)
                request = Request(url=url,
                                  priority=1,
                                  callback=self.process_page_1)
                request.meta['city_name'] = response.request.meta['city_name']
                request.meta['category1_name'] = response.request.meta[
                    'category1_name']
                request.meta['category2_name'] = response.request.meta[
                    'category2_name']
                yield request


if __name__ == '__main__':
    SpiderCore(Bendibao_Processor(), time_sleep=0.5).set_pipeline(
        TextPipelineBendibao()).set_pipeline(ConsolePipeline()).start()
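The pagination here rebuilds the next-page URL by swapping the last path segment of the current URL for the relative href of the '下一页' ('next page') link, then re-issues the request with the same meta fields attached.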
Example #8
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                if response.is_url(url):
                    yield Request(url=url, callback=self.process2)

    def process2(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield soup.title
            a_list = soup.select("a")
            for a in a_list:
                if "href" in a.attrs:
                    url = response.nice_join(a["href"])
                    if response.is_url(url):
                        yield Request(url=url, callback=self.process2)
        else:
            print(response.request.url)


if __name__ == '__main__':
    spider = RequestSpider(FirstProcessor()).set_pipeline(
        ConsolePipeline()).start()
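This processor follows every resolvable link and re-enqueues each page through the same callback, so it performs an open-ended crawl across mzitu.com, yielding each page's title to the console pipeline along the way.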
Example #9
 def test_car_processor(self):
     test_pipeline = TestPipeline()
     RequestSpider(Car_Processor(), test=True).set_pipeline(
         ConsolePipeline()).set_pipeline(test_pipeline).start()
     # '上海' = Shanghai; the message reads "crawl result: province is Shanghai".
     self.assertEqual(test_pipeline.result['province'], '上海', '爬取结果,省份为上海')
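The tests in Examples #9, #12 and #13 rely on a TestPipeline that exposes the last processed item as .result. A minimal sketch of such a capture pipeline, assuming Sasila delivers each item to a process_item hook (the hook name is an assumption, not confirmed by these excerpts):

# Hypothetical capture pipeline; process_item is an assumed hook name.
class TestPipeline(object):
    def __init__(self):
        self.result = None  # last item seen; inspected by the assertions above

    def process_item(self, item):
        self.result = item
        return item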
Example #10
File: main.py  Project: tuian/Sasila
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))
from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.manager import manager
from sasila import system_web

spider_car = SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())
manager.set_spider(spider_car)
manager.set_spider(spider_fang)
system_web.start()
Example #11
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            # Mileage on the page is given in 万公里 (units of 10,000 km).
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_board_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_board_date'] = first_board_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item


if __name__ == '__main__':
    SpiderCore(Car_Processor()).set_pipeline(ConsolePipeline()).set_pipeline(
        TextPipelineCar()).start()
Example #12
 def test_car_processor(self):
     test_pipeline = TestPipeline()
     SpiderCore(Car_Processor(), test=True).set_pipeline(
         ConsolePipeline()).set_pipeline(test_pipeline).start()
     # The message reads "crawl result: 11 fields".
     self.assertEqual(len(test_pipeline.result), 11, '爬取结果,11个字段')
Example #13
 def test_test_processor(self):
     test_pipeline = TestPipeline()
     SpiderCore(TEST_Processor(), test=True) \
         .set_pipeline(ConsolePipeline(), 'console') \
         .set_pipeline(PicPipeline(), 'save') \
         .set_pipeline(test_pipeline, 'test').start()
     self.assertIn('2017', test_pipeline.result['date_time'])
Example #14
File: main.py  Project: zhhb/Sasila
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os

sys.path.append(os.getcwd())

from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.spider_core import SpiderCore
from sasila.system_normal.manager import manager
from sasila import system_web

if __name__ == '__main__':
    spider_car = SpiderCore(Car_Processor(),
                            batch_size=100).set_pipeline(ConsolePipeline())
    spider_fang = SpiderCore(Fang_Processor()).set_pipeline(ConsolePipeline())
    manager.set_spider(spider_car)
    manager.set_spider(spider_fang)
    system_web.start()