Python Spider.__init__示例

编程语言: Python

命名空间/包名称: scrapy

类/类型: Spider

方法/功能: __init__

hotexamples.com的示例: 12

Python Spider.__init__ - 已找到12个示例。以下是从开源项目中提取的、评价最高的 scrapy.Spider.__init__ 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

Spider(30)

log(11)

__init__(8)

close(3)

from_crawler(3)

__del__(1)

domain(1)

download_delay(1)

ignore_urls(1)

name(1)

page_actions(1)

partial_form_request(1)

processor(1)

start_requests(1)

start_urls(1)

示例#1

显示文件

文件： __init__.py 项目： vincent-ferotin/scraping-github

    def __init__(self, *args, **kwargs):
        """Initialize the base spider, then create two empty lists.

        All positional and keyword arguments are forwarded unchanged to
        ``Spider.__init__``. Afterwards ``self.requests`` and
        ``self.responses`` are each bound to a new empty list.
        """
        Spider.__init__(self, *args, **kwargs)

        # Assigned after the base initializer, so both attributes always
        # start out as fresh, empty lists.
        self.requests, self.responses = [], []

示例#2

显示文件

文件： facebook_page.py 项目： fanlens/crawler

 def __init__(
         self,  # pylint: disable=too-many-arguments
         source_id: int,
         since: Optional[TSince] = None,
         include_extensions: str = 'comments',
         api_key: Optional[str] = None,
         progress: Optional[ProgressCallbackBase] = None) -> None:
     """
     Initialize the spider, its mixins, the start URL and the set of
     extensions to crawl.

     :param source_id: source id to crawl, must have type 'facebook'
     :param since: since when to crawl
     :param include_extensions: which extensions to include (comments, reactions) as csv string;
         matching is case-insensitive and whitespace around each entry is ignored
     :param api_key: fanlens api key, will be deprecated
     :param progress: optional progress callback informing external systems
     """
     Spider.__init__(self, FacebookPageSpider.name)
     GenericMixin.__init__(self,
                           source_id=source_id,
                           since=since,
                           api_key=api_key)
     ProgressMixin.__init__(self, progress=progress)
     # self.source, self.limits and self.since are not set in this method;
     # presumably GenericMixin.__init__ provides them -- confirm.
     self.start_urls = [
         page_feed_url(self.source['slug'],
                       limit=self.limits['post'],
                       since=self.since)
     ]
     # Pass the values as logger arguments so the message is only
     # formatted when the record is actually emitted.
     self.logger.info('Crawling page %s since %s',
                      self.source['slug'], self.since)
     # Strip whitespace around each csv entry so that input such as
     # 'comments, reactions' does not silently lose 'reactions'.
     requested = {
         extension_str.strip()
         for extension_str in include_extensions.lower().split(',')
     }
     # Entries that are not in allowed_extensions are dropped.
     self._included_extensions = {
         Extension[extension_str]
         for extension_str in requested.intersection(self.allowed_extensions)
     }

示例#3

显示文件

文件： base.py 项目： jarvisji/ScrapyCrawler

 def __init__(self, op, **kwargs):
     """Store the operation, reset crawl state and initialize the base spider.

     :param op: value stored on the instance as ``self.op``; its meaning is
         not visible from this method.
     :param kwargs: extra keyword arguments forwarded to ``Spider.__init__``.
     """
     self.op = op
     self.reach_limit = False
     self.last_feed_updated_time = None
     # Runs before Spider.__init__; presumably creates the output directory
     # when it is missing -- confirm against make_sure_path_exists.
     self.make_sure_path_exists(self.get_output_dir_path())
     # TODO: why doesn't logging from __init__ work?
     # self.log('Initializing spider...')
     Spider.__init__(self, self.name, **kwargs)

示例#4

显示文件

 def __init__(self, op, **kwargs):
     """Record ``op``, set the initial crawl flags and call the base initializer.

     :param op: value kept as ``self.op``; what it represents is not shown
         in this method.
     :param kwargs: keyword arguments passed through to ``Spider.__init__``.
     """
     self.op = op
     self.reach_limit = False
     self.last_feed_updated_time = None
     # Called before the base initializer; presumably ensures the output
     # directory exists -- verify in make_sure_path_exists.
     self.make_sure_path_exists(self.get_output_dir_path())
     # TODO: find out why logging inside __init__ does not work.
     # self.log('Initializing spider...')
     Spider.__init__(self, self.name, **kwargs)

示例#5

显示文件

文件： tripadvisor_city_runner.py 项目： rodrigoney/desafiohu2

 def __init__(self, city_name, city_id, api, *args, **kwargs):
     """Configure the spider for one city and register the close handler.

     :param city_name: stored as ``self.city_name``.
     :param city_id: stored as ``self.city_id`` and appended to
         ``self.base_url``.
     :param api: converted with ``str()`` and stored as ``self.api_key``.
     :param args: positional arguments forwarded to ``Spider.__init__``.
     :param kwargs: keyword arguments forwarded to ``Spider.__init__``.
     """
     self.api_key = str(api)
     self.city_id = city_id
     self.city_name = city_name
     # base_url is defined outside this method; assumes both it and city_id
     # are strings -- TODO confirm.
     self.base_url += city_id
     self.averages = {}
     self.top10_restaurants = {}
     self.db_manager = DBManager(self)
     Spider.__init__(self, *args, **kwargs)
     # Connect self.spider_closed as the receiver of the spider_closed signal.
     dispatcher.connect(self.spider_closed, signals.spider_closed)

示例#6

显示文件

文件： bignews.py 项目： Shanshan-IC/csf_scraper

    def __init__(self, txt_path=None, *args, **kwargs):
        """Initialize the spider and make sure its text output directory exists.

        :param txt_path: directory stored as ``self.txt_path``. When empty or
            ``None``, a directory named after the spider (``self.name``)
            under the current directory is used.
        :param args: positional arguments forwarded to ``Spider.__init__``.
        :param kwargs: keyword arguments forwarded to ``Spider.__init__``.
        :raises OSError: if the directory cannot be created.
        """
        Spider.__init__(self, *args, **kwargs)

        if not txt_path:
            txt_path = "%s%s%s" % (os.curdir, os.sep, self.name)

        if not os.path.exists(txt_path):
            try:
                # makedirs also creates missing parent directories, which
                # a plain mkdir would fail on.
                os.makedirs(txt_path)
            except OSError:
                # Another process may have created the directory between
                # the existence check and makedirs; only re-raise when the
                # directory really is not there.
                if not os.path.isdir(txt_path):
                    raise

        self.txt_path = txt_path

示例#7

显示文件

文件： mysupermarket.py 项目： hmcc/price-search

 def __init__(self, url):
     """
     Set up the spider to start scraping from the given URL.

     The URL should be the first page of "Savvy Buys" for a supermarket
     and should be read from the app.cfg file. For multiple supermarkets,
     use multiple spiders.

     Arguments:
     url -- a single URL to start from; it becomes the only entry of
            ``self.start_urls``.
     """
     Spider.__init__(self)
     self.start_urls = [url]

示例#8

显示文件

文件： mysupermarket.py 项目： hmcc/price-search

 def __init__(self, url):
     """
     Initialize the base spider and start scraping from a single URL.

     The URL is expected to be the first page of "Savvy Buys" for one
     supermarket, read from the app.cfg file. To cover several
     supermarkets, create one spider per supermarket.

     Arguments:
     url -- the single start URL; stored as a one-element list in
            ``self.start_urls``.
     """
     Spider.__init__(self)
     self.start_urls = [url]

示例#9

显示文件

 def __init__(self,
              source_id: int,
              since: Optional[TSince] = None,
              api_key: Optional[str] = None,
              progress: Optional[ProgressCallbackBase] = None) -> None:
     """
     Initialize the spider and its mixins, then log what will be crawled.

     :param source_id: source id to crawl, must have type 'facebook'
         (NOTE(review): this is TwitterSearchSpider, so 'facebook' looks
         copied from another spider -- confirm the expected source type)
     :param since: since when to crawl
     :param api_key: fanlens api key, will be deprecated
     :param progress: optional progress callback informing external systems
     """
     Spider.__init__(self, name=TwitterSearchSpider.name)
     GenericMixin.__init__(self,
                           source_id=source_id,
                           since=since,
                           api_key=api_key)
     ProgressMixin.__init__(self, progress=progress)
     # Pass the values as logger arguments so the message is only
     # formatted when the record is actually emitted.
     self.logger.info('crawling page %s since %s',
                      self.source['slug'], self.since)

示例#10

显示文件

文件： tripadvisor_hotel_spider.py 项目： Shokesu/TripAdvisorScraper

    def __init__(self, **kwargs):
        '''
        Initialize this instance.
        :param terms: Optional parameter giving the search terms used to
        find hotels on tripadvisor.
        :param locations: Optional parameter giving a location used to find
        hotels on tripadvisor, e.g. "Olite, Navarra" or "Spain"

        If terms is not None, the hotels found by a search by terms are
        scraped.
        If terms is None, the hotels found by a search by location are
        scraped.

        These parameters are not named in the signature; all keyword
        arguments are wrapped in a Config and used to override the global
        configuration.
        '''
        Spider.__init__(self)

        config = GlobalConfig()
        # NOTE(review): this instance attribute shadows any ``log`` attribute
        # inherited from the class -- confirm this is intended.
        self.log = Logger(config.get_path('OUTPUT_SCRAP_LOG'))

        # The log path above is read before the keyword overrides are applied.
        config.override(Config(kwargs))
        config.check()

示例#11

显示文件

文件： pachong.py 项目： yijingping/pycrawler

 def __init__(self):
     """Initialize the base spider and create a PhantomJS web driver."""
     Spider.__init__(self)
     # NOTE(review): nothing in this method releases the driver; presumably
     # it is shut down elsewhere -- confirm.
     self.driver = webdriver.PhantomJS()

示例#12

显示文件

文件： __init__.py 项目： luhaoz/XMCrawlerApp

 def __init__(self):
     """Name the spider after its script, set the class logger and hook the close signal."""
     cls = self.__class__
     Spider.__init__(self, name=cls.script_name())
     # The logger is stored on the class, not on this instance.
     cls.spider_log = logger(cls.script_name())
     # Connect self.spider_closed as the receiver of the spider_closed signal.
     dispatcher.connect(self.spider_closed, signals.spider_closed)

Python Spider.__init__示例

Python Spider.__init__示例