def __init__(self, *args, **kwargs):
    """Initialize the spider and the collections used to record traffic."""
    Spider.__init__(self, *args, **kwargs)
    # Filled in during the crawl with the requests issued and the
    # responses received.
    self.requests = []
    self.responses = []
def __init__(self,  # pylint: disable=too-many-arguments
             source_id: int,
             since: Optional[TSince] = None,
             include_extensions: str = 'comments',
             api_key: Optional[str] = None,
             progress: Optional[ProgressCallbackBase] = None) -> None:
    """
    :param source_id: source id to crawl, must have type 'facebook'
    :param since: since when to crawl
    :param include_extensions: which extensions to include (comments, reactions) as csv string
    :param api_key: fanlens api key, will be deprecated
    :param progress: optional progress callback informing external systems
    """
    Spider.__init__(self, FacebookPageSpider.name)
    GenericMixin.__init__(self, source_id=source_id, since=since, api_key=api_key)
    ProgressMixin.__init__(self, progress=progress)
    self.start_urls = [
        page_feed_url(self.source['slug'], limit=self.limits['post'], since=self.since)
    ]
    # Lazy %-style logging args: the message is only formatted if the
    # INFO level is actually enabled (was eagerly formatted with '%').
    self.logger.info('Crawling page %s since %s', self.source['slug'], self.since)
    # Keep only the requested extensions that this spider supports.
    self._included_extensions = {
        Extension[extension_str]
        for extension_str in set(include_extensions.lower().split(',')).intersection(
            self.allowed_extensions)
    }
def __init__(self, op, **kwargs):
    """Record the operation, reset crawl state, and make sure the output
    directory exists before delegating to the base Spider."""
    self.op = op
    self.reach_limit = False
    self.last_feed_updated_time = None
    self.make_sure_path_exists(self.get_output_dir_path())
    # TODO: why doesn't logging from __init__ work?
    # self.log('Initializing spider...')
    Spider.__init__(self, self.name, **kwargs)
def __init__(self, city_name, city_id, api, *args, **kwargs):
    """Configure the spider for a single city and hook up the
    spider_closed signal.

    :param city_name: human-readable name of the city being scraped
    :param city_id: city identifier appended to the class base_url
    :param api: API key (coerced to str)
    """
    self.api_key = str(api)
    self.city_id = city_id
    self.city_name = city_name
    # Complete the class-level base_url with this city's id.
    self.base_url += city_id
    self.averages = {}
    self.top10_restaurants = {}
    self.db_manager = DBManager(self)
    Spider.__init__(self, *args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, txt_path=None, *args, **kwargs):
    """Initialize the spider and ensure its text output directory exists.

    :param txt_path: directory for the spider's text output; defaults to
        ./<spider name> when falsy.
    """
    Spider.__init__(self, *args, **kwargs)
    if not txt_path:
        # Default to a directory named after the spider in the cwd.
        txt_path = os.path.join(os.curdir, self.name)
    # makedirs(..., exist_ok=True) replaces the racy exists()/mkdir()
    # pair (TOCTOU) and also creates missing parent directories.
    os.makedirs(txt_path, exist_ok=True)
    self.txt_path = txt_path
def __init__(self, url):
    """Point the spider at a single starting URL.

    The URL should be the first page of "Savvy Buys" for a supermarket,
    read from the app.cfg file. For multiple supermarkets, use multiple
    spiders.

    :param url: a single URL to start from.
    """
    Spider.__init__(self)
    self.start_urls = [url]
def __init__(self,
             source_id: int,
             since: Optional[TSince] = None,
             api_key: Optional[str] = None,
             progress: Optional[ProgressCallbackBase] = None) -> None:
    """
    :param source_id: source id to crawl
        (NOTE(review): original said type 'facebook' — looks like a
        copy-paste from the Facebook spider; presumably 'twitter'. Confirm.)
    :param since: since when to crawl
    :param api_key: fanlens api key, will be deprecated
    :param progress: optional progress callback informing external systems
    """
    Spider.__init__(self, name=TwitterSearchSpider.name)
    GenericMixin.__init__(self, source_id=source_id, since=since, api_key=api_key)
    ProgressMixin.__init__(self, progress=progress)
    # Lazy %-style logging args: the message is only formatted if the
    # INFO level is actually enabled (was eagerly formatted with '%').
    self.logger.info('crawling page %s since %s', self.source['slug'], self.since)
def __init__(self, **kwargs):
    """Initialize this instance.

    :param terms: optional search terms used to find hotels on
        tripadvisor.
    :param locations: optional location used to find hotels on
        tripadvisor, e.g. "Olite, Navarra" or "Spain".

    If terms is not None, the hotels found through a term search are
    scraped; if terms is None, the hotels found through a location
    search are scraped.
    """
    Spider.__init__(self)
    configuration = GlobalConfig()
    self.log = Logger(configuration.get_path('OUTPUT_SCRAP_LOG'))
    configuration.override(Config(kwargs))
    configuration.check()
def __init__(self):
    """Create the spider together with its headless PhantomJS driver.

    NOTE(review): PhantomJS support is deprecated in Selenium — consider
    a headless Chrome/Firefox driver.
    """
    Spider.__init__(self)
    self.driver = webdriver.PhantomJS()
def __init__(self):
    """Name the spider after its script, attach a class-level logger and
    connect the spider_closed signal."""
    cls = self.__class__
    Spider.__init__(self, name=cls.script_name())
    cls.spider_log = logger(cls.script_name())
    dispatcher.connect(self.spider_closed, signals.spider_closed)