Example #1
    def __init__(self):
        CrawlSpider.__init__(self)
        self.verification_errors = []
        self.selenium = selenium("localhost", 4444, "*firefox", "http://yue.fm/")
        self.selenium.start(driver=webdriver.Chrome())

        self.count = 0
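
Several of the examples on this page start a legacy Selenium RC client (selenium("localhost", 4444, ...)) next to a WebDriver inside CrawlSpider.__init__. Below is a minimal sketch, assuming only Scrapy and Selenium are installed, of the same pattern using the WebDriver API alone; the class name, start URL and attribute names are illustrative and not taken from any example above.

from scrapy.spiders import CrawlSpider
from selenium import webdriver


class SeleniumCrawlSpider(CrawlSpider):
    """Illustrative spider that owns one browser for the whole crawl."""
    name = "selenium_crawl"
    start_urls = ["http://example.com/"]

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        self.verification_errors = []
        self.count = 0
        # One WebDriver instance replaces the old Selenium RC client.
        self.driver = webdriver.Firefox()

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; release the browser here.
        self.driver.quit()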
Example #2
    def __init__(self, city, num, plusDate):
        self.city = city
        self.num = num
        self.plusDate = int(plusDate)

        CrawlSpider.__init__(self)
        self.startWebDriver()
Example #3
    def __init__(self):
        CrawlSpider.__init__(self)
        print "szlibspider start"
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox", "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver = Firefox()
        self.selenium.start(driver=ffdriver)
        # self.selenium.start()

        sel = self.selenium
        # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting for the page to load")

        # Wait for javascript to load in Selenium
        # time.sleep(20)
        # sel.wait_for_condition("condition by js", 20000);
        # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")

        elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
        for element in elements[:5]:
            print "%s" % element.find_element_by_class_name("num").text
            print "%s" % element.find_element_by_class_name("title").text
            print "%s" % element.find_element_by_class_name("text").text
            print "---------------"
Example #4
    def __init__(self, root, date, **kwargs):
        # super(MySpider, self).__init__(*args, **kwargs)
        CrawlSpider.__init__(self, **kwargs)

        domain = get_domain(root)
        self.scrape_domain = domain
        self.unix_date = date

        self.urls_list = []

        conn = sqlite3.connect(db_file)
        c = conn.cursor()
        insert_row(c, "INSERT INTO scrapes (id, domain, date) VALUES (?, ?, ?)", (None, domain, self.unix_date))
        self.scrapeid = c.lastrowid
        log.msg("scrapeid = {0}".format(self.scrapeid))
        conn.commit()
        conn.close()

        self.long_count = 0
        self.seed = URL(
            root,
            self.scrapeid,
            self.long_count,
            base={"protocol": "http://", "subdomain": "", "domain": domain, "path": ""},
        )
        self.start_urls = [self.seed.full]
        self.allowed_domains = [domain, "facebook.com", "twitter.com"]
        self.long_count = self.seed.long_count
Example #5
 def __init__(self, xpath_dict={}, files=None):
     CrawlSpider.__init__(self)
     self.xpath_dict = xpath_dict
     self.from_url_file = files
     self.savingPipe = SavingPipeline()
     if self.from_url_file:
         self.crawl_from_files()
Example #6
 def __init__(self):
     CrawlSpider.__init__(self)
     self.verificationErrors = []
     # create a profile with specific add-ons
     # and tell Firefox to load it
     profile = FirefoxProfile(profile_directory="/Library/Python/2.6/site-packages/selenium/webdriver/firefox")
     self.selenium = webdriver.Firefox(profile)
Example #7
 def __init__(self, fromCity, toCity, plusDate):
     self.fromCity = fromCity.upper()
     self.toCity = toCity.upper()
     self.plusDate = int(plusDate)
    
     CrawlSpider.__init__(self)
     self.startWebDriver()
Example #8
    def __init__(self, city, plusDate, nightCount):
        self.city = city
        self.plusDate = int(plusDate)
        self.nightCount = nightCount 

        CrawlSpider.__init__(self)       
        self.startWebDriver()
Example #9
 def __init__(self, **kwargs):
     species = kwargs.pop("species", None)
     if species is None:
         raise NotConfigured
     self.species = species.lower()
     CrawlSpider.__init__(self, **kwargs)
     dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
     self.index = defaultdict(list)
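
The example above pops species from **kwargs and raises NotConfigured when it is missing; those keyword arguments are the spider arguments supplied at start-up. A hedged usage sketch follows, in which the spider class, its name and the start URL are illustrative stand-ins rather than code from the example.

from collections import defaultdict

from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotConfigured
from scrapy.spiders import CrawlSpider


class SpeciesSpider(CrawlSpider):
    """Illustrative stand-in for the spider above."""
    name = "species_spider"
    start_urls = ["http://example.com/"]

    def __init__(self, **kwargs):
        species = kwargs.pop("species", None)
        if species is None:
            raise NotConfigured("a 'species' argument is required")
        self.species = species.lower()
        CrawlSpider.__init__(self, **kwargs)
        self.index = defaultdict(list)


if __name__ == "__main__":
    process = CrawlerProcess()
    # Arguments passed here (or via "scrapy crawl species_spider -a species=falco")
    # reach the spider's __init__ as keyword arguments.
    process.crawl(SpeciesSpider, species="Falco")
    process.start()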
Example #10
    def __init__(self, fromCity, toCity, dateStart, dateEnd):
        self.fromCity = fromCity
        self.toCity = toCity
        self.dateStart = int(dateStart)
        self.dateEnd = int(dateEnd)

        CrawlSpider.__init__(self)
        self.startWebDriver()
Example #11
 def __init__(self, xpath_dict={}, files=None):
     CrawlSpider.__init__(self)
     self.xpath_dict = xpath_dict
     self.from_url_file = files
     self.savingPipe = SavingPipeline()
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     if self.from_url_file:
         self.crawl_from_files()
Example #12
    def __init__(self):
        CrawlSpider.__init__(self)
        log.start(logfile="./log/szlib-%s.log" % strftime("%m%d-%H-%M", localtime(time())), loglevel=log.INFO, logstdout=False)
        log.msg("szlibspider start")
        print "szlibspider start"
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox", "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver = Firefox()
        self.selenium.start(driver=ffdriver)
        # self.selenium.start()

        sel = self.selenium
        # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting for the page to load")

        # Wait for javascript to load in Selenium
        # time.sleep(20)
        # sel.wait_for_condition("condition by js", 20000);
        # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")

        elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit district_list']/a")
        num = "wijefowaeofjwejf SSL0011"
        selflibs_num = []
        for district in elements[1:]:
            log.msg("%s selflibs:" % district.text)
            log.msg("==================")
            district.click()
            WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load selflibs")
            selflibs_elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
            for selflib_ele in selflibs_elements:
                # num = selflib_ele.find_element_by_class_name("num").text
                num = selflib_ele.find_element_by_class_name("num").get_attribute("textContent")
                log.msg("num %s" % num)
                selflibs_num.append(num[-7:])
                log.msg("numid %s" % num[-7:])
                log.msg("%s" % selflib_ele.find_element_by_class_name("title").get_attribute("textContent"))
                log.msg("%s" % selflib_ele.find_element_by_class_name("text").get_attribute("textContent"))
                log.msg("---------------")

        log.msg("------1---------")
        # ffdriver.quit()
        # numstr = unicode("编号","utf-8")
        # numstr = unicode(num,"utf-8")
        # log.msg("numstr is in num? %s" % (numstr in num))
        # log.msg("%s,%s, %s" % (num,num[1], num[-7:]))

        for selflibnum in selflibs_num:
            selflib_url = "http://www.szlib.gov.cn/libraryNetwork/dispSelfLibBook/id-5/%s.html" % selflibnum
            log.msg("selflib url %s" % selflib_url)
            ffdriver.get(selflib_url)
            WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load booklist")
            categorys_elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit category']/a")
            for category_ele in categorys_elements[1:]:
                log.msg("%s" % category_ele.text)
Example #13
    def __init__(self, url=None, db_name='tags.db', *args, **kwargs):
        CrawlSpider.__init__(self)

        # db_name defaults to 'tags.db' when not provided
        self.db_name = db_name

        #Define space in which spider can crawl
        #Also define space in which spider begins to crawl
        self.add_url(url)
Example #14
    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []

        #dispatcher.connect(self.spider_opened, signals.spider_opened)
        #dispatcher.connect(self.spider_closed, signals.spider_closed)

        xmlfiles = self.get_xml_files()
        
        for xmlfile in xmlfiles:
            self.start_urls.append(xmlfile)
Example #15
    def __init__(self):
        CrawlSpider.__init__(self)  # initialize the base spider with the original method (we are overriding '__init__()')
        self.verificationErrors = []

        # --- Disable opening the browser ----------------------------------------------
        # Works only on Linux, because of the graphical dependencies...
        # self.display = Display(visible=0, backend='xvnb', size=(800, 600))
        # self.display = Display(visible=0, size=(800, 600))
        # self.display.start()
        # -------------------------------------------------------------------------------
        self.driver = webdriver.Firefox(self.disableImages())  # load the webdriver with the profile created by 'disableImages()'
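
The example above loads Firefox with a profile produced by a disableImages() helper that is not shown. As a guess at what such a helper could look like, here is a minimal sketch that builds a Firefox profile blocking image loading; the function name is assumed, and the preference used is the legacy permissions.default.image switch.

from selenium import webdriver


def disable_images_profile():
    """Hypothetical helper: build a Firefox profile that blocks all images."""
    profile = webdriver.FirefoxProfile()
    # 2 = block all images (legacy Firefox preference)
    profile.set_preference("permissions.default.image", 2)
    return profile


# driver = webdriver.Firefox(disable_images_profile())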
Example #16
 def __init__(self):
     CrawlSpider.__init__(self)
     print settings.DATABASE['HOST']
     conn = MySQLdb.connect(host = settings.DATABASE['HOST'], user = settings.DATABASE['USER'], \
         passwd = settings.DATABASE['PASSWORD'], db = settings.DATABASE['DBNAME'], charset = settings.DATABASE['CHARSET'])
     cursor = conn.cursor()
     cursor.execute("SELECT crawled_url FROM des_city")
     parent_url_list = cursor.fetchall()
     for url in parent_url_list:
         #print url[0]
         self.start_urls.append(url[0] + '/jingdian')
     for url in self.start_urls:
         print url
Example #17
 def __init__(self,
              url,
              itemSelector,
              spiderID,
              spiderName="ScrapySinglePageCrawler",
              **kwargs):
     BaseCrawler.__init__(self, [url], spiderName, spiderID, **kwargs)
     CrawlSpider.__init__(self)
     self.item_extractor = ItemExtractor(itemSelector, self.item_loader,
                                         SpiderTypes.TYPE_SCRAPY,
                                         spiderName, self._id)
     self.url = url
     self.start_urls = [url]
Example #18
	def __init__(self, *args, **kwargs):
		
		'''
		Override the default constructor in order
		to populate the allowed_domains and start_urls
		lists
		'''
		
		CrawlSpider.__init__(self, *args, **kwargs)
		domains = Domain.objects.all()
		for domain in domains:
			self.allowed_domains.append(str(domain.domain).replace("http://","").rstrip("/"))
			self.start_urls.append(str(domain.domain).rstrip("/"))
Example #19
 def __init__(self):
     self.producer = Producer.objects.get(pk=1)
     self.brand = Brand.objects.get(pk=1)
     self.forged_cookie = dict(country="CHIM",
         SialLocaleDef="CountryCode~CN|WebLang~-7|",
         SessionPersistence="""CLICKSTREAMCLOUD%3A%3DvisitorId%3Danonymous%7CPROFILEDATA%3A%3D
         avatar%3D%2Fetc%2Fdesigns%2Fdefault%2Fimages%2Fcollab%2Favatar.png%2CauthorizableId%3D
         anonymous%2CauthorizableId_xss%3Danonymous%2CformattedName%3D%2CformattedName_xss%3D%7C
         SURFERINFO%3A%3DIP%3D141.247.239.190%2Ckeywords%3D%2Cbrowser%3DUnresolved%2COS%3DMac%20OS
         %20X%2Cresolution%3D1440x900%7C""", 
         GUID="415dfb24-e4f2-4218-a5d7-b2943d012103|NULL|1380870456876", 
         cmTPSet="Y")
     CrawlSpider.__init__(self)
Example #20
 def __init__(self):
     CrawlSpider.__init__(self)
     print settings.DATABASE['HOST']
     conn = MySQLdb.connect(host = settings.DATABASE['HOST'], user = settings.DATABASE['USER'], \
         passwd = settings.DATABASE['PASSWORD'], db = settings.DATABASE['DBNAME'], charset = settings.DATABASE['CHARSET'])
     cursor = conn.cursor()
     cursor.execute("SELECT crawled_url FROM des_city")
     parent_url_list = cursor.fetchall()
     for url in parent_url_list:
         #print url[0]
         self.start_urls.append(url[0]+'/jingdian')
     for url in self.start_urls:
         print url
Example #21
    def __init__(self, fromCity, toCity, dateStart, dateEnd, maxTries = 7):
        (self.fromCity, self.fromValue, self.fromNation, self.fromRegion, self.fromCityCode) = fromCity.split(",")
        self.fromValue += "#"
        print self.fromCity, self.fromValue, self.fromNation, self.fromRegion, self.fromCityCode
        (self.toCity, self.toValue, self.toNation, self.toRegion, self.toCityCode) = toCity.split(",")
        self.toValue += "#"
        print self.toCity, self.toValue, self.toNation, self.toRegion, self.toCityCode
        self.dateStart = int(dateStart)
        self.dateEnd = int(dateEnd)

        self.maxTries = maxTries

        CrawlSpider.__init__(self)
        self.startWebDriver()
Example #22
    def __init__(self, url = None, db_name = None, filename= None):
        # Initialising the inherited crawler
        CrawlSpider.__init__(self)
        
        self.db_name = db_name
        # Initialising the variable

        if url is not None:
            self.add_url(url)

        # Reading input from a file
        if filename is not None:
            with open(filename) as f:
                lines = f.readlines()
                for line in lines:          
                    self.add_url(line)
Example #23
    def __init__(self, url=None, db_name=None, *args, **kwargs):
        CrawlSpider.__init__(self)

        #If name was not provided, default to a name
        if db_name is None:
            db_name = "contents.db"

        #Database object
        self.database = scrapyDatabase(db_name)

        #Create Content table if it doesn't exist
        self.database.createContentTable('Content')

        #Define space in which spider can crawl
        #Also define space in which spider begins to crawl
        self.add_url(url)
Example #24
    def _requests_to_follow(self, response):
        # Servers do not set an encoding for binary files. Do not try to
        # follow links in binary data, as this would break Scrapy.
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
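
For context, a hedged sketch of where an override like this could sit in a complete spider; the class name, start URL and rule are illustrative, and _requests_to_follow is a private CrawlSpider hook whose signature may change between Scrapy versions.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BinarySafeSpider(CrawlSpider):
    """Illustrative spider: skip link extraction on responses with no text encoding."""
    name = "binary_safe"
    start_urls = ["http://example.com/"]
    rules = (Rule(LinkExtractor(), follow=True),)

    def _requests_to_follow(self, response):
        # Binary downloads (PDFs, images, ...) expose no encoding attribute;
        # extracting links from them would break the crawl, so return nothing.
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        return []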
Example #25
 def __init__(self,
              baseURL,
              urlGenerator,
              itemSelector,
              spiderID,
              spiderName="ScrapyPageListCrawler",
              filterPredicate=None,
              **kwargs):
     # get a url from the generator for BaseCrawler to be able to get URL_PARAMS
     BaseCrawler.__init__(self, [baseURL], spiderName, spiderID, **kwargs)
     CrawlSpider.__init__(self)
     self.start_urls = urlGenerator()
     self.item_extractor = FilteringItemExtractor(
         itemSelector,
         self.item_loader,
         SpiderTypes.TYPE_SCRAPY,
         self.name,
         self._id,
         filterPredicate=filterPredicate)
Example #26
 def __init__(self):
     CrawlSpider.__init__(self)
     
     db = MySQLdb.connect(host = Config.mysqlserver, user = Config.mysqlusername, 
                          passwd = Config.mysqlpassword, db = Config.mysqldatabase)
     cursor = db.cursor()
     
     cursor.execute('''
         SELECT *
         FROM foundtypes
     ''')
     
     rows = cursor.fetchall()
     
     for row in rows:
         if len(row) > 4:
             self.start_urls.append('http://battle.net%s' % row[4])
     cursor.close()
     db.close()
Example #27
    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # create a profile with specific add-ons
        # and tell Firefox to load it
##        profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
##        self.selenium = webdriver.Firefox(profile)

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('--ignore-certificate-errors')

        self.selenium = webdriver.Chrome(chrome_options=chrome_options, executable_path=r"C:/Users/home/chromedriver.exe")#webdriver.Firefox()
        self.selenium.get("http://www.python.org")
        assert "Python" in self.selenium.title
        elem = self.selenium.find_element_by_name("q")
        elem.clear()
        elem.send_keys("pycon")
        elem.send_keys(Keys.RETURN)
        assert "No results found." not in self.selenium.page_source
Example #28
    def __init__(self, name=None, **kwargs):
      log.msg(kwargs, level=log.INFO)

      self.min_year = kwargs['min_year'] or 1893
      self.max_year = kwargs['max_year'] or 1924

      log.msg( self.min_year, level=log.INFO)
      log.msg(self.max_year, level=log.INFO)

      self.set_banned_years()
      self.rules += (
        Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
        Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)), callback='parse_indexed_work'),
        # If it is none of the above, it is a work and must be parsed!
        Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)), callback='parse_unindexed_work')
      )
      log.msg(self.rules, level=log.INFO)
      # This goes at the bottom BECAUSE THAT IS HOW SCRAPY WANTS IT (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
      CrawlSpider.__init__(self, name)
Example #29
    def __init__(self, fromArea, fromCity, dateStart, dateEnd):
        self.fromCity = fromCity.lower()

        self.fromArea = None
        if fromArea == "europe":
            self.fromArea = u"歐洲"
        elif fromArea in ["asia", "china"]:
            self.fromArea = u"亞洲"
        elif fromArea == "america":
            self.fromArea = u"美洲"
        elif fromArea == "oceania":
            self.fromArea = u"大洋洲"

        self.dateStart = int(dateStart)
        self.dateEnd = int(dateEnd)

        self.destinationCities = []

        self.tickets = []

        CrawlSpider.__init__(self)
        self.startWebDriver()
Example #30
    def __init__(self, name=None, **kwargs):
        log.msg(kwargs, level=log.INFO)

        self.min_year = kwargs['min_year'] or 1893
        self.max_year = kwargs['max_year'] or 1924

        log.msg(self.min_year, level=log.INFO)
        log.msg(self.max_year, level=log.INFO)

        self.set_banned_years()
        self.rules += (
            Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
            Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
            Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
            Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)),
                 callback='parse_indexed_work'),
            # If it is none of the above, it is a work and must be parsed!
            Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)),
                 callback='parse_unindexed_work'))
        log.msg(self.rules, level=log.INFO)
        # This goes at the bottom BECAUSE THAT IS HOW SCRAPY WANTS IT (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
        CrawlSpider.__init__(self, name)
Example #31
    def __init__(self):
        CrawlSpider.__init__(self)

        db = MySQLdb.connect(
            host=Config.mysqlserver, user=Config.mysqlusername, passwd=Config.mysqlpassword, db=Config.mysqldatabase
        )
        cursor = db.cursor()

        cursor.execute(
            """
            SELECT *
            FROM foundtypes
        """
        )

        rows = cursor.fetchall()

        for row in rows:
            if len(row) > 4:
                self.start_urls.append("http://battle.net%s" % row[4])
        cursor.close()
        db.close()
Example #32
    def __init__(self):

        # init spider
        self.config.read('./configrations.ini')

        self.allowed_domains = ["web.archive.org"]
        self.start_urls = [
            self.config.get('target', 'startUrl'),
            ]
        self.rules = (
            Rule(SgmlLinkExtractor(
                    allow=(r'.*/http://%s/.*' % self.config.get('target', 'domain').replace('.', '\.')),
                    deny_extensions='', # http://www.haogongju.net/art/1690534
                    tags=('a', 'area', 'link', 'script', 'img'),
                    attrs=('href', 'src'),
                    ),
                callback='parse_item',
                follow=True,
                ),
            )

        # call CrawlSpider.__init__ to initialize the actual spider
        CrawlSpider.__init__(self)
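
The rules above are built with SgmlLinkExtractor, which later Scrapy releases deprecated and removed. A hedged sketch of an equivalent rule with the current LinkExtractor follows; the allow pattern is a placeholder for the value read from the configuration file.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Placeholder pattern standing in for the value built from the config file.
archive_rule = Rule(
    LinkExtractor(
        allow=(r".*/http://example\.com/.*",),
        deny_extensions=(),  # follow every extension, as deny_extensions='' did above
        tags=("a", "area", "link", "script", "img"),
        attrs=("href", "src"),
    ),
    callback="parse_item",
    follow=True,
)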
Example #33
    def start_requests(self):
        log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)
        smzdm_config = ConfigParser.RawConfigParser()
        smzdm_config.read("configure/smzdm.ini")
        self.price_pattern = re.compile(smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
        self.usd_price_pattern = re.compile(smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
        self.jpy_price_pattern = re.compile(smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
        self.eur_price_pattern = re.compile(smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
        self.head_separator = smzdm_config.get("item_page", "head_separator_pattern").decode("utf-8")
        self.attachment_pattern = re.compile(smzdm_config.get("item_page", "attachment_pattern").decode("utf-8"))

        config_file_name = "configure/shopping_page.ini"
        shopping_config = ConfigParser.RawConfigParser()
        shopping_config.read(config_file_name)

        for section_name in shopping_config.sections():
            log.msg("Supported url pattern:\t%s" % shopping_config.get(section_name, "url_pattern").decode('utf8'), level=log.DEBUG, spider=SmzdmSpider)
            url_pattern = re.compile(shopping_config.get(section_name, "url_pattern").decode('utf8'))
            title_xpath = shopping_config.get(section_name, "title_xpath")
            price_xpath = shopping_config.get(section_name, "price_xpath")
            price_redudant_pattern = re.compile(shopping_config.get(section_name, "price_redudant_pattern").decode('utf8'))
            description_xpath = shopping_config.get(section_name, "description_xpath")
            description_img_xpath = shopping_config.get(section_name, "description_img_xpath")
            currency = shopping_config.get(section_name, "currency")
            title_img_xpath_list = shopping_config.get(section_name, "title_img_xpath").split(",")
            self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
                    price_xpath, price_redudant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list)
        CrawlSpider.start_requests(self)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/yingertuiche/youhui/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/yingertuiche/haitao/p1', meta={'category': 'stroller'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1', meta={'category': 'car_seat'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1', meta={'category': 'lego'}, callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1', meta={'category': 'backpack'}, callback=self.parse_smzdm_list_page)
Example #34
 def test_crawl_spider(self):
     assert issubclass(CrawlSpider, Spider)
     assert issubclass(CrawlSpider, BaseSpider)
     assert isinstance(CrawlSpider(name='foo'), Spider)
     assert isinstance(CrawlSpider(name='foo'), BaseSpider)
Example #35
 def set_crawler(self, crawler):
     CrawlSpider.set_crawler(self, crawler)
     RedisMixin.setup_redis(self)
     self.renderjs = crawler.settings.get("RENDERJS", False)
Example #36
 def set_crawler(self, crawler):
     CrawlSpider.set_crawler(self, crawler)
     # Set up the redis connection and register the idle signal.
     # This function must be called after the spider has been given its crawler object (the function above).
     RedisMixin.setup_redis(self)
Example #37
 def __del__(self):
     self.selenium.stop()
     print self.verificationErrors
     CrawlSpider.__del__(self)
Example #38
 def __del__(self):
     self.selenium.quit()
     print (self.verificationErrors)
     CrawlSpider.__del__(self)
Example #39
    def start_requests(self):
        log.start(logfile=self.__smzdm_log_file,
                  loglevel='INFO',
                  logstdout=False)
        smzdm_config = ConfigParser.RawConfigParser()
        smzdm_config.read("configure/smzdm.ini")
        self.price_pattern = re.compile(
            smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
        self.usd_price_pattern = re.compile(
            smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
        self.jpy_price_pattern = re.compile(
            smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
        self.eur_price_pattern = re.compile(
            smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
        self.head_separator = smzdm_config.get(
            "item_page", "head_separator_pattern").decode("utf-8")
        self.attachment_pattern = re.compile(
            smzdm_config.get("item_page",
                             "attachment_pattern").decode("utf-8"))

        config_file_name = "configure/shopping_page.ini"
        shopping_config = ConfigParser.RawConfigParser()
        shopping_config.read(config_file_name)

        for section_name in shopping_config.sections():
            log.msg("Supported url pattern:\t%s" % shopping_config.get(
                section_name, "url_pattern").decode('utf8'),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            url_pattern = re.compile(
                shopping_config.get(section_name,
                                    "url_pattern").decode('utf8'))
            title_xpath = shopping_config.get(section_name, "title_xpath")
            price_xpath = shopping_config.get(section_name, "price_xpath")
            price_redudant_pattern = re.compile(
                shopping_config.get(section_name,
                                    "price_redudant_pattern").decode('utf8'))
            description_xpath = shopping_config.get(section_name,
                                                    "description_xpath")
            description_img_xpath = shopping_config.get(
                section_name, "description_img_xpath")
            currency = shopping_config.get(section_name, "currency")
            title_img_xpath_list = shopping_config.get(
                section_name, "title_img_xpath").split(",")
            self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
                    price_xpath, price_redudant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list)
        CrawlSpider.start_requests(self)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/yingertuiche/youhui/p1',
            meta={'category': 'stroller'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1',
            meta={'category': 'car_seat'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1',
                               meta={'category': 'lego'},
                               callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1',
            meta={'category': 'backpack'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/yingertuiche/haitao/p1',
            meta={'category': 'stroller'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1',
            meta={'category': 'car_seat'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1',
                               meta={'category': 'lego'},
                               callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1',
            meta={'category': 'backpack'},
            callback=self.parse_smzdm_list_page)
Example #40
 def __init__(self):
     self.count = 0
     CrawlSpider.__init__(self)
     self.verificationErrors = []
Example #41
 def start_requests(self):
     """Combine scrape and start requests."""
     return itertools.chain(CallbackMixin.scrape_requests(self),
                            _CrawlSpider.start_requests(self))
Example #42
 def __init__(self):
     CrawlSpider.__init__(self)
     # use any browser you wish
     self.browser = webdriver.Firefox()
Example #43
 def start_requests(self):
     """CrawlSpider's start_requests() should take precedence over 
     SitemapSpider.
     
     """
     return CrawlSpider.start_requests(self)
Example #44
 def __init__(self, url, allowed_domain):
     self.start_urls.append(url)
     self.allowed_domains.append(allowed_domain)
     CrawlSpider.__init__(self)
Example #45
 def set_crawler(self, crawler):
     CrawlSpider.set_crawler(self, crawler)
     RedisMixin.setup_redis(self)
Example #46
    def set_crawler(self, crawler):

        CrawlSpider.set_crawler(self, crawler)
        self.config_spider()
        crawler.signals.connect(self.print_msg, signal=signals.spider_opened)
Example #47
 def parse(self, response):
     r = list(CrawlSpider.parse(self, response))
     return r + list(self.parse_applist(response))
Example #48
 def parse(self, response):
   r = list(CrawlSpider.parse(self, response))
   return r + list(self.parse_applist(response))
Example #49
 def __init__(self):
     CrawlSpider.__init__(self)
     self.verificationErrors = []
     self.selenium = selenium("localhost", 4444, "*chrome",
                              "http://www.azlyrics.com/g/guccimane.html")
     self.selenium.start()
Example #50
 def __init__(self):
     self.producer = Producer.objects.get(name='Abnova')
     self.brand = Brand.objects.get(name='Abnova')
     self.forged_cookie = dict(CookiesAbnovaSelectLanguage="CN")
     CrawlSpider.__init__(self)
Example #51
 def __init__(self, *args, **kwargs):
     CrawlSpider.__init__(self, *args, **kwargs)
     self.browser = webdriver.PhantomJS()
     self.prePauseTime = time.time()
Example #52
 def __del__(self):
     self.driver.close()
     print self.verificationErrors
     CrawlSpider.__del__(self)
Example #53
 def __init__(self):
     CrawlSpider.__init__(self)
     self.verificationErrors = []
     self.selenium = selenium("localhost", 4444, "*chrome",
                              "http://www.try.com")
     self.selenium.start()
Example #54
 def __init__(self):
     CrawlSpider.__init__(self)
     self.count = 0
     self.MAX_MOVIE = 2000
Example #55
 def __init__(self):
     CrawlSpider.__init__(self)
     self.verificationErrors = []
     self.selenium = selenium("localhost", 4444, "*firefox",
                              "http://www.jb51.net")
     self.selenium.start()