Example #1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import sys
# import os
#
# sys.path.append(os.path.dirname(os.getcwd()))
from car_processor import Car_Processor
from fang_processor import Fang_Processor
from sasila.system_normal.pipeline.console_pipeline import ConsolePipeline
from sasila.system_normal.spider.request_spider import RequestSpider
from sasila.system_normal.manager import manager
import sasila

spider_car = RequestSpider(Car_Processor()).set_pipeline(ConsolePipeline())
spider_fang = RequestSpider(Fang_Processor()).set_pipeline(ConsolePipeline())
manager.set_spider(spider_car)
manager.set_spider(spider_fang)
sasila.start()
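A note on Example #1: Car_Processor and Fang_Processor are assumed to live in
sibling modules (their internals appear in Examples #2 and #5). For
orientation, a minimal processor following the BaseProcessor pattern from
Examples #4 and #10 might look like the sketch below; the spider id, domain,
and yielded field are illustrative, not taken from the examples.

from bs4 import BeautifulSoup as bs

from base_processor import BaseProcessor
from sasila.system_normal.downloader.http.spider_request import Request


class Demo_Processor(BaseProcessor):
    spider_id = 'demo'  # illustrative id, not from the examples
    spider_name = 'demo'
    allowed_domains = ['example.com']
    start_requests = [Request(url='http://example.com/')]

    def process(self, response):
        # The other examples guard on m_response before parsing, so do the same.
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield {'title': soup.title.string}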
Example #2
                crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
                item = dict()
                item['avg_price'] = avg_price
                item['estate'] = estate
                item['area'] = area
                item['layout'] = layout
                item['total_price'] = total_price
                item['crawl_date'] = crawl_date

                item['province'] = response.request.meta['province']
                item['city'] = response.request.meta['city']
                item['district'] = response.request.meta['district']
                item['url'] = response.request.url
                yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href'])
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['province'] = response.request.meta['province']
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request


if __name__ == '__main__':
    spider = RequestSpider(Fang_Processor()).set_pipeline(
        ConsolePipeline()).set_pipeline(TextPipelineFang()).start()
Example #3
    def save(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            name = soup.select("div.cdiv p")[0].string.strip().split(' ')
            if len(name) > 2:
                province = name[0]
                city = name[1]
                area = name[2]
            elif len(name) > 1:
                province = name[0]
                city = name[0]
                area = name[1]
            else:
                province = name[0]
                city = name[0]
                area = name[0]
            lo = soup.select("div.cdiv p")[1].select("span")[0].string.strip()
            la = soup.select("div.cdiv p")[1].select("span")[1].string.strip()
            data = province + ',' + city + ',' + area + ',' + lo + ',' + la
            print(data)
            # Append one CSV-style line per crawled city.
            with open('city.txt', 'a+') as fs:
                fs.write(data + '\n')


fe_spider = RequestSpider(CityLocationProcessor())
if __name__ == '__main__':
    fe_spider.start()
Example #4
import sys

from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
from sasila.system_normal.spider.request_spider import RequestSpider

from base_processor import BaseProcessor, Rule, LinkExtractor
from sasila.system_normal.downloader.http.spider_request import Request
from bs4 import BeautifulSoup as bs

reload(sys)
sys.setdefaultencoding('utf-8')


class FeProcessor(BaseProcessor):
    spider_id = 'fe'
    spider_name = 'fe'
    allowed_domains = ['58.com']
    start_requests = [Request(url='http://www.58.com/daikuan/changecity/')]

    rules = (
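        # Three crawl rules: city /daikuan/ index pages, /daikuan/pnN/
        # pagination, and the table links handed to save().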
        Rule(LinkExtractor(regex_str=r"http://[a-z]*?.58.com/daikuan/"), priority=0),
        Rule(LinkExtractor(regex_str=r"/daikuan/pn\d+/"), priority=1),
        Rule(LinkExtractor(css_str="table.small-tbimg a.t"), priority=3, callback='save'),
    )

    def save(self, response):
        if response.m_response:
            print(bs(response.m_response.content, 'lxml').title.string)


fe_spider = RequestSpider(FeProcessor()).set_pipeline(PicPipeline())
if __name__ == '__main__':
    fe_spider.start()
Example #5
            detail_list = soup.select('div.details li')
            if len(detail_list) == 0:
                soup = bs(response.m_response.content, 'html5lib')
                detail_list = soup.select('div.details li')
            mileage = detail_list[0].select('span')[0].text.replace('万公里', '')
            first_borad_date = detail_list[1].select('span')[0].text
            gear = detail_list[2].select('span')[0].text.split('/')[0]
            displacement = detail_list[2].select('span')[0].text.split('/')[1]
            price = soup.select('div.car-price ins')[0].text.replace('¥', '')
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['car'] = car
            item['mileage'] = mileage
            item['first_borad_date'] = first_borad_date
            item['gear'] = gear
            item['displacement'] = displacement
            item['price'] = price
            item['crawl_date'] = crawl_date

            item['province'] = response.request.meta['province']
            item['city'] = response.request.meta['city']
            item['brand'] = response.request.meta['brand']
            item['cars_line'] = response.request.meta['cars_line']
            yield item


if __name__ == '__main__':
    spider = RequestSpider(Car_Processor()).set_pipeline(
        ConsolePipeline()).set_pipeline(TextPipelineCar()).start()
Example #6
                result_item["company_man"] = content.select(
                    "td")[1].text.split('\n')[1].strip().replace("企业法人:", "")
                result_item["company_telephone"] = content.select(
                    "td")[1].text.split('\n')[2].strip().replace("联系方式:", "")
                result_item["company_address"] = content.select(
                    "td")[1].text.split('\n')[3].strip()
                if "地址:" in result_item["company_address"]:
                    result_item["company_address"] = result_item[
                        "company_address"].replace("地址:", "")
                else:
                    result_item["company_address"] = ""
                result_item["company_registered_capital"] = content.select(
                    "td")[2].text.strip()
                result_item["company_registered_time"] = content.select(
                    "td")[3].text.strip()
                result_item["company_status"] = content.select(
                    "td")[4].text.strip()
                result_item["source"] = "企查查"
                result_item["update_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                yield result_item
            except Exception:
                print(traceback.format_exc())


qcc_spider = RequestSpider(QccProcessor(), time_sleep=1).set_pipeline(
    KafkaPipeline()).set_pipeline(TextPipeline()).set_pipeline(
        ConsolePipeline())
if __name__ == '__main__':
    qcc_spider.start()
Example #7
    def get_pic(self, response):
        if response.m_response:
            li_soup = bs(response.m_response.content, "lxml")
            # The total page count sits in the sibling just before the
            # "next page" (下一页) link.
            next_link = li_soup.find(
                lambda tag: tag.name == 'a' and '下一页»' in tag.text)
            if next_link is not None:
                total_page = int(next_link.find_previous_sibling().text)
                for page in range(1, total_page + 1):
                    yield Request(url=response.request.url + "/" + str(page),
                                  callback=self.download_pic,
                                  priority=2)

    def download_pic(self, response):
        if response.m_response:
            href = bs(response.m_response.content,
                      "lxml").select_one("div.main-image img").attrs["src"]
            yield Request(url=href, callback=self.download, priority=3)

    def download(self, response):
        if response.m_response:
            if response.m_response.status_code == 200:
                yield response.m_response.content


mzitu_spider = RequestSpider(MezituProcessor()).set_pipeline(PicPipeline())

if __name__ == '__main__':
    mzitu_spider.start()
Example #8
    def test_car_processor(self):
        test_pipeline = TestPipeline()
        RequestSpider(Car_Processor(), test=True).set_pipeline(
            ConsolePipeline()).set_pipeline(test_pipeline).start()
        self.assertEqual(test_pipeline.result['province'], '上海',
                         'crawled province should be 上海 (Shanghai)')
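Example #8 assumes a TestPipeline that captures the crawled item so the
assertion can inspect it; the real class ships with the project's test code.
A minimal stand-in under that assumption (the process_item hook name is a
guess, not confirmed by these examples):

class TestPipeline(object):
    def __init__(self):
        self.result = None

    def process_item(self, item):
        # Keep the last yielded item so the test can assert on it.
        self.result = item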
Example #9
    spider_id = 'test'
    spider_name = 'test'
    allowed_domains = ['mzitu.com']
    start_requests = [Request(url="http://www.mzitu.com/")]

    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        a_list = soup.select("a")
        for a in a_list:
            if "href" in a.attrs:
                url = response.nice_join(a["href"])
                if response.is_url(url):
                    yield Request(url=url, callback=self.process2)

    def process2(self, response):
        if response.m_response:
            soup = bs(response.m_response.content, 'lxml')
            yield soup.title
            a_list = soup.select("a")
            for a in a_list:
                if "href" in a.attrs:
                    url = response.nice_join(a["href"])
                    if response.is_url(url):
                        yield Request(url=url, callback=self.process2)
        else:
            print(response.request.url)


if __name__ == '__main__':
    spider = RequestSpider(FirstProcessor()).set_pipeline(ConsolePipeline()).start()
Example #10
import os
import uuid

from sasila.system_normal.pipeline.pic_pipeline import PicPipeline
from sasila.system_normal.spider.request_spider import RequestSpider
from sasila.system_normal.downloader.http.spider_request import Request
from base_processor import BaseProcessor, Rule, LinkExtractor


class MezituProcessor(BaseProcessor):
    spider_id = 'mzitu'
    spider_name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_requests = [Request(url='http://www.mzitu.com/xinggan/')]

    rules = (
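        # Matched image URLs go straight to save(); the remaining patterns
        # walk post pages, post sub-pages, and the /xinggan/ pagination.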
        Rule(LinkExtractor(
            regex_str=r"http://i.meizitu.net/\d{4}/\d{2}/[0-9a-z]+.jpg"),
             callback="save",
             priority=3),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+"), priority=1),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/\d+/\d+"),
             priority=2),
        Rule(LinkExtractor(regex_str=r"http://www.mzitu.com/xinggan/page/\d+"),
             priority=0),
    )

    def save(self, response):
        if response.m_response:
            if not os.path.exists("img"):
                os.mkdir("img")
            with open("img/" + str(uuid.uuid1()) + ".jpg", 'wb') as fs:
                fs.write(response.m_response.content)
                print("download success!")


if __name__ == '__main__':
    spider = RequestSpider(MezituProcessor(),
                           batch_size=10).set_pipeline(PicPipeline()).start()