def __init__(self):
    """Initialize the spider and attach a Selenium RC session driven by Chrome."""
    CrawlSpider.__init__(self)
    self.count = 0
    self.verification_errors = []
    # RC server on localhost:4444; the actual browser is a Chrome webdriver.
    rc = selenium("localhost", 4444, "*firefox", "http://yue.fm/")
    self.selenium = rc
    rc.start(driver=webdriver.Chrome())
def __init__(self, city, num, plusDate):
    """Store the query parameters, then launch the web driver."""
    CrawlSpider.__init__(self)
    self.city = city
    self.num = num
    # Days offset from today for the search date.
    self.plusDate = int(plusDate)
    self.startWebDriver()
def __init__(self):
    # Initialize the base CrawlSpider, then drive a Firefox browser through
    # Selenium RC to load the self-service library page and print the first
    # five entries (number, title, text) it finds.
    CrawlSpider.__init__(self)
    print "szlibspider start"
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox", "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver = Firefox()
    self.selenium.start(driver=ffdriver)
    # self.selenium.start()
    sel = self.selenium
    # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    # Block until the page's AJAX activity settles (ajax_complete helper).
    WebDriverWait(ffdriver,30).until(ajax_complete, "Timeout waiting page to load")
    #Wait for javscript to load in Selenium
    # time.sleep(20)
    # sel.wait_for_condition("condition by js", 20000);
    # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")
    elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
    # Only the first five service points are printed.
    for element in elements[:5]:
        print "%s" % element.find_element_by_class_name("num").text
        print "%s" % element.find_element_by_class_name("title").text
        print "%s" % element.find_element_by_class_name("text").text
        print "---------------"
def __init__(self, root, date, **kwargs):
    # Record a new scrape row in SQLite and seed the crawl from *root*.
    # root -- root URL/domain to scrape; date -- unix timestamp of this run.
    # super(MySpider, self).__init__(*args, **kwargs)
    CrawlSpider.__init__(self, **kwargs)
    domain = get_domain(root)
    self.scrape_domain = domain
    self.unix_date = date
    self.urls_list = []
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    # id is None so SQLite assigns the autoincrement primary key.
    insert_row(c, "INSERT INTO scrapes (id, domain, date) VALUES (?, ?, ?)", (None, domain, self.unix_date))
    self.scrapeid = c.lastrowid
    log.msg("scrapeid = {0}".format(self.scrapeid))
    conn.commit()
    conn.close()
    self.long_count = 0
    self.seed = URL(
        root,
        self.scrapeid,
        self.long_count,
        base={"protocol": "http://", "subdomain": "", "domain": domain, "path": ""},
    )
    self.start_urls = [self.seed.full]
    # Social domains are allowed so outbound share links can be followed.
    self.allowed_domains = [domain, "facebook.com", "twitter.com"]
    self.long_count = self.seed.long_count
def __init__(self, xpath_dict=None, files=None):
    """Set up extraction xpaths and, optionally, crawl URLs listed in files.

    xpath_dict -- mapping of field names to XPath expressions (default: empty)
    files      -- optional file(s) containing start URLs
    """
    CrawlSpider.__init__(self)
    # Fix: the original used a mutable default argument (xpath_dict={}),
    # which is shared across all instances; give each instance its own dict.
    self.xpath_dict = {} if xpath_dict is None else xpath_dict
    self.from_url_file = files
    self.savingPipe = SavingPipeline()
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self):
    """Start a Firefox webdriver using a pre-built profile directory."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Load Firefox with a profile that already has the required add-ons.
    ff_profile = FirefoxProfile(profile_directory="/Library/Python/2.6/site-packages/selenium/webdriver/firefox")
    self.selenium = webdriver.Firefox(ff_profile)
def __init__(self, fromCity, toCity, plusDate):
    """Normalize the city codes, store the date offset, then start the driver."""
    CrawlSpider.__init__(self)
    # City codes are upper-cased to match the site's expected format.
    self.fromCity = fromCity.upper()
    self.toCity = toCity.upper()
    self.plusDate = int(plusDate)
    self.startWebDriver()
def __init__(self, city, plusDate, nightCount):
    """Record city, date offset and night count, then launch the driver."""
    CrawlSpider.__init__(self)
    self.city = city
    self.plusDate = int(plusDate)
    self.nightCount = nightCount
    self.startWebDriver()
def __init__(self, **kwargs):
    # Require a 'species' keyword argument; without it the spider refuses to
    # configure itself.
    species = kwargs.pop("species", None)
    if species is None:
        raise NotConfigured
    self.species = species.lower()
    CrawlSpider.__init__(self, **kwargs)
    # Run spider_closed() when Scrapy fires the close signal.
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
    # Maps each key to the list of entries collected for it.
    self.index = defaultdict(list)
def __init__(self, fromCity, toCity, dateStart, dateEnd):
    """Record the route and the date range, then launch the web driver."""
    CrawlSpider.__init__(self)
    self.fromCity = fromCity
    self.toCity = toCity
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    self.startWebDriver()
def __init__(self, xpath_dict=None, files=None):
    """Set up extraction xpaths, hook the close signal, and optionally crawl
    URLs listed in files.

    xpath_dict -- mapping of field names to XPath expressions (default: empty)
    files      -- optional file(s) containing start URLs
    """
    CrawlSpider.__init__(self)
    # Fix: the original used a mutable default argument (xpath_dict={}),
    # which is shared across all instances; give each instance its own dict.
    self.xpath_dict = {} if xpath_dict is None else xpath_dict
    self.from_url_file = files
    self.savingPipe = SavingPipeline()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self):
    # Start a dated log file, then use Selenium RC + Firefox to walk every
    # district tab on the self-service library page, collect each library's
    # numeric id, and finally visit each library's book-list page and log its
    # category links.
    CrawlSpider.__init__(self)
    log.start(logfile="./log/szlib-%s.log" % strftime("%m%d-%H-%M",localtime(time())), loglevel=log.INFO,logstdout=False)
    log.msg("szlibspider start")
    print "szlibspider start"
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox", "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver = Firefox()
    self.selenium.start(driver=ffdriver)
    # self.selenium.start()
    sel = self.selenium
    # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    WebDriverWait(ffdriver,30).until(ajax_complete, "Timeout waiting page to load")
    #Wait for javscript to load in Selenium
    # time.sleep(20)
    # sel.wait_for_condition("condition by js", 20000);
    # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")
    # District filter links; the first link ("all") is skipped below.
    elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit district_list']/a")
    num = "wijefowaeofjwejf SSL0011"
    selflibs_num = []
    for district in elements[1:]:
        log.msg("%s selflibs:" % district.text)
        log.msg("==================")
        district.click()
        WebDriverWait(ffdriver,30).until(ajax_complete, "Timeout waiting to load selflibs")
        selflibs_elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
        for selflib_ele in selflibs_elements:
            # num = selflib_ele.find_element_by_class_name("num").text
            # textContent is read because .text is empty for hidden elements.
            num = selflib_ele.find_element_by_class_name("num").get_attribute("textContent")
            log.msg("num %s" % num)
            # The last 7 characters are the library id (e.g. "SSL0011").
            selflibs_num.append(num[-7:])
            log.msg("numid %s" % num[-7:] )
            log.msg("%s" % selflib_ele.find_element_by_class_name("title").get_attribute("textContent"))
            log.msg("%s" % selflib_ele.find_element_by_class_name("text").get_attribute("textContent"))
            log.msg("---------------")
        log.msg("------1---------")
    # ffdriver.quit()
    # numstr = unicode("编号","utf-8")
    # numstr = unicode(num,"utf-8")
    # log.msg("numstr is in num? %s" % (numstr in num))
    # log.msg("%s,%s, %s" % (num,num[1], num[-7:]))
    for selflibnum in selflibs_num:
        selflib_url ="http://www.szlib.gov.cn/libraryNetwork/dispSelfLibBook/id-5/%s.html" % selflibnum
        log.msg("selflib url %s" % selflib_url)
        ffdriver.get(selflib_url)
        WebDriverWait(ffdriver,30).until(ajax_complete, "Timeout waiting to load booklist")
        categorys_elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit category']/a")
        for category_ele in categorys_elements[1:]:
            log.msg("%s" % category_ele.text)
def __init__(self, url=None, db_name='tags.db', *args, **kwargs):
    """Initialize the crawler, remember the database name, and queue *url*.

    url     -- starting URL handed to add_url()
    db_name -- SQLite file used by the pipeline (defaults to tags.db)
    """
    CrawlSpider.__init__(self)
    self.db_name = db_name
    # Seed the crawl space with the provided starting URL.
    self.add_url(url)
def __init__(self):
    """Seed start_urls with every XML file discovered on disk."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    #dispatcher.connect(self.spider_opened, signals.spider_opened)
    #dispatcher.connect(self.spider_closed, signals.spider_closed)
    # Every discovered XML file becomes a start URL.
    self.start_urls.extend(self.get_xml_files())
def __init__(self): CrawlSpider.__init__(self) # inizializzo il baseSpider con il metodo originale (sto riscrivendo il metodo '__init__()') self.verificationErrors = [] # --- Disattivare l'apertura del brawser ------------------------------------- # Funziona soltatno con Linux, per via delle dipendenze grafiche... # self.display = Display(visible=0,backend ='xvnb', size=(800, 600)) # self.display = Display(visible=0, size=(800, 600)) # self.display.start() # ---------------------------------------------------------------------------- self.driver = webdriver.Firefox(self.disableImages()) # carico il webdriver con il profilo che crea la funzione 'disableImages()'
def __init__(self): CrawlSpider.__init__(self) print settings.DATABASE['HOST'] conn = MySQLdb.connect(host = settings.DATABASE['HOST'], user = settings.DATABASE['USER'], \ passwd = settings.DATABASE['PASSWORD'], db = settings.DATABASE['DBNAME'], charset = settings.DATABASE['CHARSET']) cursor = conn.cursor() cursor.execute("SELECT crawled_url FROM des_city") parent_url_list = cursor.fetchall() for url in parent_url_list: #print url[0] self.start_urls.append(url[0] + '/jingdian') for url in self.start_urls: print url
def __init__(self, url, itemSelector, spiderID, spiderName="ScrapySinglePageCrawler", **kwargs):
    """Single-page crawler: crawl exactly *url*, extracting items with *itemSelector*."""
    BaseCrawler.__init__(self, [url], spiderName, spiderID, **kwargs)
    CrawlSpider.__init__(self)
    self.url = url
    self.start_urls = [url]
    # Extractor bound to this spider's loader and identity.
    self.item_extractor = ItemExtractor(itemSelector, self.item_loader, SpiderTypes.TYPE_SCRAPY, spiderName, self._id)
def __init__(self, *args, **kwargs):
    """Populate allowed_domains and start_urls from the Domain table."""
    CrawlSpider.__init__(self, *args, **kwargs)
    for entry in Domain.objects.all():
        raw = str(entry.domain)
        # allowed_domains wants the bare host; start_urls keeps the scheme.
        self.allowed_domains.append(raw.replace("http://", "").rstrip("/"))
        self.start_urls.append(raw.rstrip("/"))
def __init__(self):
    # Bind the spider to fixed producer/brand rows (pk=1) and present a
    # pre-forged cookie so the site serves the China (CN) locale.
    self.producer = Producer.objects.get(pk=1)
    self.brand = Brand.objects.get(pk=1)
    # NOTE(review): SessionPersistence embeds what looks like a recorded
    # visitor blob (IP, resolution, OS) — confirm these canned values are
    # still accepted by the site.
    self.forged_cookie = dict(country="CHIM", SialLocaleDef="CountryCode~CN|WebLang~-7|", SessionPersistence="""CLICKSTREAMCLOUD%3A%3DvisitorId%3Danonymous%7CPROFILEDATA%3A%3D avatar%3D%2Fetc%2Fdesigns%2Fdefault%2Fimages%2Fcollab%2Favatar.png%2CauthorizableId%3D anonymous%2CauthorizableId_xss%3Danonymous%2CformattedName%3D%2CformattedName_xss%3D%7C SURFERINFO%3A%3DIP%3D141.247.239.190%2Ckeywords%3D%2Cbrowser%3DUnresolved%2COS%3DMac%20OS %20X%2Cresolution%3D1440x900%7C""", GUID="415dfb24-e4f2-4218-a5d7-b2943d012103|NULL|1380870456876", cmTPSet="Y")
    CrawlSpider.__init__(self)
def __init__(self): CrawlSpider.__init__(self) print settings.DATABASE['HOST'] conn = MySQLdb.connect(host = settings.DATABASE['HOST'], user = settings.DATABASE['USER'], \ passwd = settings.DATABASE['PASSWORD'], db = settings.DATABASE['DBNAME'], charset = settings.DATABASE['CHARSET']) cursor = conn.cursor() cursor.execute("SELECT crawled_url FROM des_city") parent_url_list = cursor.fetchall() for url in parent_url_list: #print url[0] self.start_urls.append(url[0]+'/jingdian') for url in self.start_urls: print url
def __init__(self, fromCity, toCity, dateStart, dateEnd, maxTries = 7):
    # fromCity/toCity arrive as comma-separated records:
    # "city,value,nation,region,cityCode".  The value field gets a "#"
    # suffix — presumably the site's form-token format; TODO confirm.
    (self.fromCity, self.fromValue, self.fromNation, self.fromRegion, self.fromCityCode) = fromCity.split(",")
    self.fromValue += "#"
    print self.fromCity, self.fromValue, self.fromNation, self.fromRegion, self.fromCityCode
    (self.toCity, self.toValue, self.toNation, self.toRegion, self.toCityCode) = toCity.split(",")
    self.toValue += "#"
    print self.toCity, self.toValue, self.toNation, self.toRegion, self.toCityCode
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    # Maximum number of retry attempts for the search.
    self.maxTries = maxTries
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self, url = None, db_name = None, filename= None):
    """Initialize the crawler and queue start URLs from *url* and/or *filename*."""
    CrawlSpider.__init__(self)
    self.db_name = db_name
    if url is not None:
        self.add_url(url)
    # A file may supply additional start URLs, one per line.
    if filename is not None:
        with open(filename) as f:
            for line in f.readlines():
                self.add_url(line)
def __init__(self, url=None, db_name=None, *args, **kwargs):
    """Open (or create) the content database and queue the starting URL."""
    CrawlSpider.__init__(self)
    # Fall back to the default database file when none is supplied.
    if db_name is None:
        db_name = "contents.db"
    self.database = scrapyDatabase(db_name)
    # Ensure the Content table exists before crawling begins.
    self.database.createContentTable('Content')
    self.add_url(url)
def _requests_to_follow(self, response): if getattr(response, "encoding", None) != None: # Server does not set encoding for binary files # Do not try to follow links in # binary data, as this will break Scrapy return CrawlSpider._requests_to_follow(self, response) else: return []
def __init__(self, baseURL, urlGenerator, itemSelector, spiderID, spiderName="ScrapyPageListCrawler", filterPredicate=None, **kwargs):
    """Page-list crawler: start URLs come from *urlGenerator*; extracted
    items are filtered by *filterPredicate*."""
    # BaseCrawler needs at least one URL up front to derive URL_PARAMS.
    BaseCrawler.__init__(self, [baseURL], spiderName, spiderID, **kwargs)
    CrawlSpider.__init__(self)
    self.start_urls = urlGenerator()
    self.item_extractor = FilteringItemExtractor(
        itemSelector,
        self.item_loader,
        SpiderTypes.TYPE_SCRAPY,
        self.name,
        self._id,
        filterPredicate=filterPredicate,
    )
def __init__(self):
    """Seed start_urls from the foundtypes table in MySQL."""
    CrawlSpider.__init__(self)
    db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername, passwd = Config.mysqlpassword, db = Config.mysqldatabase)
    cursor = db.cursor()
    cursor.execute(''' SELECT * FROM foundtypes ''')
    rows = cursor.fetchall()
    for row in rows:
        # row[4] holds the battle.net path.  Fix: the original guard
        # `len(row) >= 4` still allowed an IndexError on 4-column rows;
        # index 4 requires at least 5 columns.
        if len(row) > 4:
            self.start_urls.append('http://battle.net%s' % row[4])
    cursor.close()
    db.close()
def __init__(self):
    # Start a Chrome webdriver (extensions disabled, certificate errors
    # ignored) and run a smoke test against python.org: search for "pycon"
    # and assert that results come back.
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    #create a profile with specific add-ons
    #and do this. Firefox to load it
    ## profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
    ## self.selenium = webdriver.Firefox(profile)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--ignore-certificate-errors')
    # Windows-specific chromedriver location.
    self.selenium = webdriver.Chrome(chrome_options=chrome_options, executable_path=r"C:/Users/home/chromedriver.exe")#webdriver.Firefox()
    self.selenium.get("http://www.python.org")
    assert "Python" in self.selenium.title
    elem = self.selenium.find_element_by_name("q")
    elem.clear()
    elem.send_keys("pycon")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in self.selenium.page_source
def __init__(self, name=None, **kwargs):
    # Configure the year window from kwargs (falling back to 1893-1924) and
    # extend the crawl rules.  NOTE(review): `kwargs['min_year'] or 1893`
    # raises KeyError when the key is absent and treats 0/None/'' as missing
    # — confirm callers always pass both keys.
    log.msg(kwargs, level=log.INFO)
    self.min_year = kwargs['min_year'] or 1893
    self.max_year = kwargs['max_year'] or 1924
    log.msg( self.min_year, level=log.INFO)
    log.msg(self.max_year, level=log.INFO)
    self.set_banned_years()
    self.rules += (
        Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
        Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)), callback='parse_indexed_work'),
        # If it matches none of the above, it is a work and must be parsed!
        Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)), callback='parse_unindexed_work')
    )
    log.msg(self.rules, level=log.INFO)
    # This goes at the bottom BECAUSE SCRAPY WANTS IT THAT WAY (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
    CrawlSpider.__init__(self, name)
def __init__(self, fromArea, fromCity, dateStart, dateEnd):
    # Map the English area keyword to the site's Chinese region label
    # (Europe / Asia / America / Oceania); unknown areas leave fromArea None.
    self.fromCity = fromCity.lower()
    self.fromArea = None
    if fromArea == "europe":
        self.fromArea = u"歐洲"
    elif fromArea in ["asia", "china"]:
        self.fromArea = u"亞洲"
    elif fromArea == "america":
        self.fromArea = u"美洲"
    elif fromArea == "oceania":
        self.fromArea = u"大洋洲"
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    # Accumulators filled in while crawling.
    self.destinationCities = []
    self.tickets = []
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self, name=None, **kwargs):
    # Configure the year window from kwargs (falling back to 1893-1924) and
    # extend the crawl rules.  NOTE(review): `kwargs['min_year'] or 1893`
    # raises KeyError when the key is absent and treats 0/None/'' as missing
    # — confirm callers always pass both keys.
    log.msg(kwargs, level=log.INFO)
    self.min_year = kwargs['min_year'] or 1893
    self.max_year = kwargs['max_year'] or 1924
    log.msg(self.min_year, level=log.INFO)
    log.msg(self.max_year, level=log.INFO)
    self.set_banned_years()
    self.rules += (
        Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
        Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)), callback='parse_indexed_work'),
        # If it matches none of the above, it is a work and must be parsed!
        Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)), callback='parse_unindexed_work'))
    log.msg(self.rules, level=log.INFO)
    # This goes at the bottom BECAUSE SCRAPY WANTS IT THAT WAY (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
    CrawlSpider.__init__(self, name)
def __init__(self):
    """Seed start_urls from the foundtypes table in MySQL."""
    CrawlSpider.__init__(self)
    db = MySQLdb.connect(
        host=Config.mysqlserver,
        user=Config.mysqlusername,
        passwd=Config.mysqlpassword,
        db=Config.mysqldatabase
    )
    cursor = db.cursor()
    cursor.execute(
        """ SELECT * FROM foundtypes """
    )
    rows = cursor.fetchall()
    for row in rows:
        # row[4] holds the battle.net path.  Fix: the original guard
        # `len(row) >= 4` still allowed an IndexError on 4-column rows;
        # index 4 requires at least 5 columns.
        if len(row) > 4:
            self.start_urls.append("http://battle.net%s" % row[4])
    cursor.close()
    db.close()
def __init__(self):
    # init spider
    # Configuration-driven Wayback Machine spider: the start URL and the
    # target domain both come from configrations.ini.
    self.config.read('./configrations.ini')
    self.allowed_domains = ["web.archive.org"]
    self.start_urls = [
        self.config.get('target', 'startUrl'),
    ]
    self.rules = (
        Rule(SgmlLinkExtractor(
            # Match archived URLs of the target domain (dots escaped).
            allow=(r'.*/http://%s/.*' % self.config.get('target', 'domain').replace('.', '\.')),
            deny_extensions='',  # http://www.haogongju.net/art/1690534
            tags=('a', 'area', 'link', 'script', 'img'),
            attrs=('href', 'src'),
        ),
            callback='parse_item',
            follow=True,
        ),
    )
    # call Crawlspider.__init__ to init a real spider
    CrawlSpider.__init__(self)
def start_requests(self):
    # Load price/url extraction patterns from smzdm.ini and per-shop xpath
    # settings from shopping_page.ini, then yield the category listing pages
    # (both domestic "youhui" and overseas "haitao" sections).
    log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)
    smzdm_config = ConfigParser.RawConfigParser()
    smzdm_config.read("configure/smzdm.ini")
    self.price_pattern = re.compile(smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
    self.usd_price_pattern = re.compile(smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
    self.jpy_price_pattern = re.compile(smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
    self.eur_price_pattern = re.compile(smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
    self.head_separator = smzdm_config.get("item_page", "head_separator_pattern").decode("utf-8")
    self.attachment_pattern = re.compile(smzdm_config.get("item_page", "attachment_pattern").decode("utf-8"))
    config_file_name = "configure/shopping_page.ini"
    shopping_config = ConfigParser.RawConfigParser()
    shopping_config.read(config_file_name)
    # One section per supported shop: xpaths + currency, keyed by URL regex.
    for section_name in shopping_config.sections():
        log.msg("Supported url pattern:\t%s" % shopping_config.get(section_name, "url_pattern").decode('utf8'), level=log.DEBUG, spider=SmzdmSpider)
        url_pattern = re.compile(shopping_config.get(section_name, "url_pattern").decode('utf8'))
        title_xpath = shopping_config.get(section_name, "title_xpath")
        price_xpath = shopping_config.get(section_name, "price_xpath")
        price_redudant_pattern = re.compile(shopping_config.get(section_name, "price_redudant_pattern").decode('utf8'))
        description_xpath = shopping_config.get(section_name, "description_xpath")
        description_img_xpath = shopping_config.get(section_name, "description_img_xpath")
        currency = shopping_config.get(section_name, "currency")
        title_img_xpath_list = shopping_config.get(section_name, "title_img_xpath").split(",")
        self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
            price_xpath, price_redudant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list)
    # NOTE(review): the iterator returned by CrawlSpider.start_requests() is
    # discarded, so the base class's start URLs are never requested — confirm
    # this is intentional.
    CrawlSpider.start_requests(self)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/yingertuiche/youhui/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/yingertuiche/haitao/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
def test_crawl_spider(self):
    """CrawlSpider must subclass both Spider and BaseSpider, and its
    instances must pass isinstance checks against both bases."""
    for base in (Spider, BaseSpider):
        assert issubclass(CrawlSpider, base)
        assert isinstance(CrawlSpider(name='foo'), base)
def set_crawler(self, crawler):
    # Attach the crawler, then hook up Redis (must happen after the crawler
    # is set) and read whether pages should be rendered with JavaScript.
    CrawlSpider.set_crawler(self, crawler)
    RedisMixin.setup_redis(self)
    self.renderjs = crawler.settings.get("RENDERJS", False)
def set_crawler(self, crawler): CrawlSpider.set_crawler(self, crawler) #安装redis连接和设置idle signal。 #并且这个函数必须在spider设置特的crawler object(上面的函数)之后调用。 RedisMixin.setup_redis(self)
def __del__(self):
    # Stop the Selenium RC session and report collected verification
    # failures before delegating to the base destructor.
    self.selenium.stop()
    print self.verificationErrors
    CrawlSpider.__del__(self)
def __del__(self):
    # Quit the webdriver session and report collected verification failures
    # before delegating to the base destructor.
    self.selenium.quit()
    print (self.verificationErrors)
    CrawlSpider.__del__(self)
def start_requests(self):
    # Load price/url extraction patterns from smzdm.ini and per-shop xpath
    # settings from shopping_page.ini, then yield the category listing pages
    # (both domestic "youhui" and overseas "haitao" sections).
    log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)
    smzdm_config = ConfigParser.RawConfigParser()
    smzdm_config.read("configure/smzdm.ini")
    self.price_pattern = re.compile(
        smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
    self.usd_price_pattern = re.compile(
        smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
    self.jpy_price_pattern = re.compile(
        smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
    self.eur_price_pattern = re.compile(
        smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
    self.head_separator = smzdm_config.get(
        "item_page", "head_separator_pattern").decode("utf-8")
    self.attachment_pattern = re.compile(
        smzdm_config.get("item_page", "attachment_pattern").decode("utf-8"))
    config_file_name = "configure/shopping_page.ini"
    shopping_config = ConfigParser.RawConfigParser()
    shopping_config.read(config_file_name)
    # One section per supported shop: xpaths + currency, keyed by URL regex.
    for section_name in shopping_config.sections():
        log.msg("Supported url pattern:\t%s" % shopping_config.get(
            section_name, "url_pattern").decode('utf8'), level=log.DEBUG, spider=SmzdmSpider)
        url_pattern = re.compile(
            shopping_config.get(section_name, "url_pattern").decode('utf8'))
        title_xpath = shopping_config.get(section_name, "title_xpath")
        price_xpath = shopping_config.get(section_name, "price_xpath")
        price_redudant_pattern = re.compile(
            shopping_config.get(section_name, "price_redudant_pattern").decode('utf8'))
        description_xpath = shopping_config.get(section_name, "description_xpath")
        description_img_xpath = shopping_config.get(
            section_name, "description_img_xpath")
        currency = shopping_config.get(section_name, "currency")
        title_img_xpath_list = shopping_config.get(
            section_name, "title_img_xpath").split(",")
        self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
            price_xpath, price_redudant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list)
    # NOTE(review): the iterator returned by CrawlSpider.start_requests() is
    # discarded, so the base class's start URLs are never requested — confirm
    # this is intentional.
    CrawlSpider.start_requests(self)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/yingertuiche/youhui/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/yingertuiche/haitao/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
def __init__(self):
    """Initialize the base spider plus the page counter and error list."""
    CrawlSpider.__init__(self)
    self.count = 0
    self.verificationErrors = []
def start_requests(self):
    """Combine scrape and start requests."""
    scrape_reqs = CallbackMixin.scrape_requests(self)
    start_reqs = _CrawlSpider.start_requests(self)
    return itertools.chain(scrape_reqs, start_reqs)
def __init__(self): CrawlSpider.__init__(self) # use any browser you wish self.browser = webdriver.Firefox()
def start_requests(self):
    """CrawlSpider's start_requests() should take precedence over
    SitemapSpider's, so delegate to CrawlSpider explicitly.
    """
    return CrawlSpider.start_requests(self)
def __init__(self, url, allowed_domain):
    # Register the seed URL and its domain (appending to the class-level
    # lists) before initializing the base spider.
    self.start_urls.append(url)
    self.allowed_domains.append(allowed_domain)
    CrawlSpider.__init__(self)
def set_crawler(self, crawler):
    # Attach the crawler first; setup_redis() requires the crawler to be set.
    CrawlSpider.set_crawler(self, crawler)
    RedisMixin.setup_redis(self)
def set_crawler(self, crawler):
    # Attach the crawler, apply spider configuration, and log a message when
    # the spider opens.
    CrawlSpider.set_crawler(self, crawler)
    self.config_spider()
    crawler.signals.connect(self.print_msg, signal=signals.spider_opened)
def parse(self, response):
    """Run CrawlSpider's rule-based parsing, then append app-list results."""
    results = list(CrawlSpider.parse(self, response))
    results.extend(self.parse_applist(response))
    return results
def __init__(self):
    """Start a Selenium RC session aimed at the AZLyrics Gucci Mane page."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    rc_session = selenium("localhost", 4444, "*chrome", "http://www.azlyrics.com/g/guccimane.html")
    self.selenium = rc_session
    rc_session.start()
def __init__(self):
    # Bind the spider to the Abnova producer/brand records and present a
    # cookie selecting the site's Chinese (CN) language variant.
    self.producer = Producer.objects.get(name='Abnova')
    self.brand = Brand.objects.get(name='Abnova')
    self.forged_cookie = dict(CookiesAbnovaSelectLanguage="CN")
    CrawlSpider.__init__(self)
def __init__(self, *args, **kwargs):
    """Initialize the spider with a headless PhantomJS browser."""
    CrawlSpider.__init__(self, *args, **kwargs)
    self.browser = webdriver.PhantomJS()
    # Reference timestamp — presumably used to throttle between pauses;
    # confirm against the pause logic elsewhere in the spider.
    self.prePauseTime = time.time()
def __del__(self):
    # Close the webdriver window and report collected verification failures
    # before delegating to the base destructor.
    self.driver.close()
    print self.verificationErrors
    CrawlSpider.__del__(self)
def __init__(self):
    """Start a Selenium RC session aimed at the test site."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    rc_session = selenium("localhost", 4444, "*chrome", "http://www.try.com")
    self.selenium = rc_session
    rc_session.start()
def __init__(self):
    """Initialize the spider with a movie counter and its upper bound."""
    CrawlSpider.__init__(self)
    self.count = 0
    # Upper bound on the number of movies to crawl.
    self.MAX_MOVIE = 2000
def __init__(self):
    """Start a Selenium RC session aimed at jb51.net."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    rc_session = selenium("localhost", 4444, "*firefox", "http://www.jb51.net")
    self.selenium = rc_session
    rc_session.start()