Example #1
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')]
Example #2
 def __init__(self):
     # start logging
     # log.log.defaultObserver = MyObserver()
     # log.log.defaultObserver.start()
     # log.started = False
     log.start()
     pass
Example #3
    def parse_item(self, response):  # Extract data into Items, mainly using XPath and CSS selectors to pull fields from the page
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.box_1 div.sp_13').xpath('text()').extract()[0]
        sites = sel.css('div.centerPadd div.sp_16')
        for site in sites:
            item = GuoShuItem()
            item['siteid'] = self.siteid
            item['sitename'] = self.sitename
            item['name'] = site.css('p a').xpath('text()').extract()[0]
            relative_url = site.css('p a').xpath('@href').extract()[0]
            item['detailurl'] = urlparse.urljoin(
                base_url, relative_url)  #urljoin_rfc(base_url, relative_url)
            item['catalog'] = catalog
            item['guige'] = site.css('.shop').xpath('text()').extract()[0]
            price = site.css('.shop_s2').xpath('text()').extract()
            item['price'] = price[0].split('/')[0].replace("¥", "")
            item['danwei'] = price[0].split('/')[1]
            items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
            # log.msg('item %s' % repr(item).decode("unicode-escape"),level=log.WARNING)

        # info('parsed ' + str(response))
        return items
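
A quick sketch of the price/unit split used above; the sample string is made up, but the field on this site apparently looks like ¥3.50/500g:

# hypothetical value extracted from site.css('.shop_s2')
price_text = u"¥3.50/500g"
price = price_text.split('/')[0].replace(u"¥", "")   # -> u"3.50"
danwei = price_text.split('/')[1]                    # -> u"500g" (the unit)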
Example #4
    def __init__(self, cityid=None, info_log=None):
        if info_log == None:
            raise NotConfigured("HotelScrapy类中: 参数info_log不能为空")

        super(HotelScrapy, self).__init__()

        self.info_log = info_log
        log.start(logfile=self.info_log, loglevel=log.INFO, logstdout=False)

        if cityid == None or cityid == "all":
            citys = CityItem.django_model.objects.all()
        else:
            citys = CityItem.django_model.objects.filter(id=cityid)

        if len(citys) == 0:
            raise NotConfigured("参数cityid:" + cityid + "不存在于表city_city中,请检查")

        for city in citys:
            self.city_entrance_urls.append(city.href)

        if len(self.city_entrance_urls) > 0:
            scrapy_item = ScrapyItem()
            scrapy_item.scrapy_name = self.name
            if scrapy_item.is_existed_scrapy_name() is False:
                scrapy_item.save()

            self.scrapy_batch_item.scrapy_name = self.name
            self.scrapy_batch_item.batch_number = self.batch_number
            self.scrapy_batch_item.status = "scrapy_running"
            self.scrapy_batch_item.save()
Example #5
 def __init__(self):
     log.start(logfile=time.strftime("log/%Y%m%d%H%M%S")+".log",logstdout=False)
     log.msg("initiating crawler...",level=log.INFO)
     self.crawler_id = self.get_crawler_id()
     log.msg("crawler id is %s" % self.crawler_id,level=log.INFO)
     self.r.set('crawler:ip:%s' % self.crawler_id,utils.get_external_ip())
     self.r.set('crawler:port:%s' % self.crawler_id,settings.REDIS_LOCAL_PORT)
     self.r.set('crawler:mapping_port:%s' % self.crawler_id,settings.REDIS_LOCAL_MAPPING_PORT)
     log.msg("crawler ip is %s, port is %d" % (utils.get_external_ip(),settings.REDIS_LOCAL_PORT),level=log.INFO)
     account = self.get_account()
     self.username = account[0]
     self.password = account[1]
     log.msg("crawler account got",level=log.INFO)
     self.r_local.set('crawler:status:%s' % self.crawler_id, 'good')
     self.r_local.set('crawler:update_time:%s' % self.crawler_id, datetime.datetime.utcnow().strftime("%s"))
     log.msg("local crawler status set",level=log.INFO)
     heartbeat_thread = threading.Thread(target=self.maintain_local_heartbeat)
     heartbeat_thread.start()
     log.msg("local crawler heartbeat started",level=log.INFO)
     if platform.system() == "Linux":
         #on linux, use virtual display
         vdisplay = Xvfb()
         vdisplay.start()
     co = ChromeOptions()
     #TODO: Disable image after log in
     #TODO: optimize memory usage
     co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1}})
     #co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1,"images":2,"media":2}})
     self.driver = webdriver.Chrome(chrome_options=co)
     self.driver.set_window_size(640,960)
Example #6
def call_spider(file):
    """
    Crea el spider y ejecuta el reactor. Copia los resultados del crawling a los archivos .json para luego
    transformarlos a los archivos data.json correspondientes.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
        domains = []
        urls = []
        created_files = []
        for u in list_url:
            domain = u.strip('\n')
            url_aux = domain.split("/")
            domain_type = False
            if (len(url_aux) > 1):
                domain = url_aux[0]
                url = "http://" + url_aux[0] + "/datos/data"
                if domain == 'www.paraguay.gov.py':
                    url = "http://" + url_aux[0] + "/datos"
            else:
                url = "http://" + u.strip('\n') + "/data"
                domain_type = True
            print "============= Domain " + domain
            print "============= Start url " + url
            response = requests.get(url + "/data.json")
            if response.status_code == 200:
                filename = FileController.FileController(
                ).save_existing_data_json(response, domain, True)
                created_files.append({
                    'modalidad': 'recolecta',
                    'archivo': filename
                })
            else:
                domains.append(domain)
                urls.append(url)

        spider = DataSpider(domains=domains,
                            start_urls=urls,
                            domain_type=domain_type)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
        reactor.run()  # the script will block here
        """ Copiar los datos a los archivos .json """
        data_spider.copy_items_to_files()
        """ Eliminar archivos temporales """
        FileController.FileController().clean_tmp_files()
        """ Convertir los archivos .json a data.json (formato POD) """
        for domain in domains:
            filename = DataJson.DataJson().convert(domain)
            created_files.append({
                'modalidad': 'data-hunting',
                'archivo': filename
            })

        return created_files
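
A hypothetical way to invoke call_spider, assuming a plain-text file with one domain or URL per line (the file name and its contents are made up, not part of the snippet above):

# urls.txt might contain, one per line:
#   www.paraguay.gov.py
#   datos.example.gov.py/catalogo
created = call_spider("urls.txt")
for entry in created:
    print entry['modalidad'], entry['archivo']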
Example #7
def main():
    spider = DrugSynonymsSpider()
    log.start()
    setup_crawler(spider)
    reactor.run()

    items = []
    with codecs.open('drug_synonyms.txt', encoding='utf8', mode='r') as file:
        synonyms = []
        for line in file:
            if line.startswith('synonyms'):
                line = line[10:]
                synonyms = line.split('|')

            elif line.startswith('name'):
                name = line[6:]
                temp_list = [name]
                temp_list.extend(synonyms)
                items.append(temp_list)

    d = {}

    for line in items:
        for word in line:
            raw = list(line)
            raw.remove(word)
            d[word] = raw

    with codecs.open('drug_synonyms_dictionary.txt', encoding='utf8', mode='w') as file:
        for pair in d.items():
            s = pair[0].strip() + '|'
            for word in pair[1]:
                s += word.strip() + ','
            s += '\n'
            file.write(s)
Example #8
 def __init__(self):
     log.start(settings.LOG_FILE)
     try:
         engine = db_connect()
         self.Session = sessionmaker(bind=engine)
     except Exception as e:
         pass
Example #9
    def __init__(self, category=None, *args, **kwargs):
        self.driver = webdriver.Firefox()
        super(SeleniumCrawlerSpider, self).__init__(*args, **kwargs)
                
        LOG_FILE = "scrapy_%s_%s.log" % (self.name, "now")

        # remove the current log
        # log.log.removeObserver(log.log.theLogPublisher.observers[0])
        # re-create the default Twisted observer which Scrapy checks
        log.log.defaultObserver = log.log.DefaultObserver()
        # start the default observer so it can be stopped
        log.log.defaultObserver.start()
        # trick Scrapy into thinking logging has not started
        log.started = False
        # start the new log file observer
        log.start(LOG_FILE)
        # continue with the normal spider init

        #defining the trip "leg" code (Dublin - Liverpool [18] / Liverpool - Dublin [66])
        dcode="18"
        if category == "dublin":
            dcode="18"
        elif category == "liverpool":
            dcode="66"
        
        self.start_urls = ['https://ssl.directferries.com/ferry/secure/multi_price_detail.aspx?stdc=DF10&grid=0&rfid=%s&psgr=1&curr=1&retn=True' % dcode]

        self.log("Init finished")
Example #10
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
Example #11
    def __init__(self, cityid=None, info_log=None):
        if info_log == None:
            raise NotConfigured("ReviewScrapy类中: 参数info_log不能为空d")

        super(ReviewScrapy, self).__init__()
        self.info_log = info_log
        if cityid == None or cityid == "all":
            self.cityid = cityid
        else:
            city = City.get_city_by_id(cityid)
            if city == None:
                raise NotConfigured("参数cityid:" + cityid + "不存在于表city_city中,请检查")
            else:
                self.cityid = cityid
                self.city_name = city.name_ch

        log.start(logfile=info_log, loglevel=log.INFO, logstdout=False)

        # store scrapy
        scrapy_item = ScrapyItem()
        scrapy_item.scrapy_name = self.name
        if scrapy_item.is_existed_scrapy_name() is False:
            scrapy_item.save()
        # record scrapy status
        self.scrapy_batch_item.scrapy_name = self.name
        self.scrapy_batch_item.batch_number = self.batch_number
        self.scrapy_batch_item.status = "scrapy_running"
        self.scrapy_batch_item.save()
Example #12
 def __init__(self, region=None, letterIn=None, *args, **kwargs):
     super(BwPrivateSpider, self).__init__(*args, **kwargs)
     
     ### region, letterIn, start_urls
     self.region = region
     self.letterIn = letterIn
     self.start_urls = [self.symbollookup_url +
                        "&region=" + self.region +
                        "&letterIn=" + self.letterIn + 
                        "&firstrow=" + self.start_firstrow]
 
     ### setting log file: LOG_ROOT/<spider name>/<region>/<letterIn>/<spider name>-<region>-<letterIn>.log
     log_path = os.path.join(settings.LOG_ROOT, self.name, self.region, self.letterIn)
     if not os.path.isdir(log_path):
         os.makedirs(log_path)
     log_file = os.path.join(log_path, '-'.join([self.name, self.region, self.letterIn]) + '.log')
     if os.path.isfile(log_file):
         os.remove(log_file)
     print "log file: ", log_file
     log.start(logfile=log_file, loglevel=log.INFO, logstdout=False)
     
     ### setting json data output: DATA_ROOT/<spider name>/<region>/<letterIn>/<spider name>-<region>-<letterIn>.json
     data_path = os.path.join(settings.DATA_ROOT, self.name, self.region, self.letterIn)
     if not os.path.isdir(data_path):
         os.makedirs(data_path)
     data_file = os.path.join(data_path, '-'.join([self.name, self.region, self.letterIn]) + '.json')
     if os.path.isfile(data_file):
         os.remove(data_file)
     print "data file: ", data_file
     self.data_file = data_file
Example #13
    def parse_item(self, response):  # Extract data into Items, mainly using XPath and CSS selectors to pull fields from the page
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.cc').xpath('text()').extract()[2]
        # u"品牌:" keeps the replace unicode-safe on the extracted text
        catalog = catalog[catalog.index(u'品牌:'):].replace("\r\n", "").replace(
            u"品牌:", "").lstrip().rstrip()
        item = GuoShuItem()
        item['siteid'] = self.siteid
        item['sitename'] = self.sitename
        item['name'] = sel.css('div.cc h2').xpath('text()').extract()[0]
        item['detailurl'] = base_url
        item['catalog'] = catalog
        item['guige'] = sel.css('div.cc b').xpath('text()').extract()[0]
        price = sel.css('div.cc').xpath(
            './/font[@color="red"]/text()').extract()[0]
        item['price'] = price
        item['danwei'] = item['guige']
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
        # log.msg('item %s' % repr(item).decode("unicode-escape"),level=log.WARNING)

        # info('parsed ' + str(response))
        return items
Example #14
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u""

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name="test", domain="testdomain")
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel="ERROR")
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call("http://testdomain/path?wr=0"),
            call("http://testdomain/path?wr=0&wa=0"),
            call("http://testdomain/path?wr=0&wa=1"),
            call("http://testdomain/path?wr=1"),
            call("http://testdomain/path?wr=1&wa=0"),
            call("http://testdomain/path?wr=1&wa=1"),
            # call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            call("http://testdomain/path?wr=0&wa=1&wr=0"),
            # call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
            call("http://testdomain/path?wr=1&wa=1&wr=0"),
        ]
Example #15
    def __init__(self, category=None, *args, **kwargs):
        self.driver = webdriver.Firefox()
        super(SeleniumCrawlerSpider, self).__init__(*args, **kwargs)

        LOG_FILE = "scrapy_%s_%s.log" % (self.name, "now")

        # remove the current log
        # log.log.removeObserver(log.log.theLogPublisher.observers[0])
        # re-create the default Twisted observer which Scrapy checks
        log.log.defaultObserver = log.log.DefaultObserver()
        # start the default observer so it can be stopped
        log.log.defaultObserver.start()
        # trick Scrapy into thinking logging has not started
        log.started = False
        # start the new log file observer
        log.start(LOG_FILE)
        # continue with the normal spider init

        #defining the trip "leg" code (Dublin - Liverpool [18] / Liverpool - Dublin [66])
        dcode = "18"
        if category == "dublin":
            dcode = "18"
        elif category == "liverpool":
            dcode = "66"

        self.start_urls = [
            'https://ssl.directferries.com/ferry/secure/multi_price_detail.aspx?stdc=DF10&grid=0&rfid=%s&psgr=1&curr=1&retn=True'
            % dcode
        ]

        self.log("Init finished")
Example #16
    def __init__(self, cityid=None, info_log=None):
        if info_log == None:
            raise NotConfigured("ReviewScrapy类中: 参数info_log不能为空d")

        super(ReviewScrapy, self).__init__()
        self.info_log = info_log
        if cityid == None or cityid == "all":
            self.cityid = cityid
        else:
            city = City.get_city_by_id(cityid)
            if city == None:
                raise NotConfigured("参数cityid:" + cityid +
                                    "不存在于表city_city中,请检查")
            else:
                self.cityid = cityid
                self.city_name = city.name_ch

        log.start(logfile=info_log, loglevel=log.INFO, logstdout=False)

        # store scrapy
        scrapy_item = ScrapyItem()
        scrapy_item.scrapy_name = self.name
        if scrapy_item.is_existed_scrapy_name() is False:
            scrapy_item.save()
        # record scrapy status
        self.scrapy_batch_item.scrapy_name = self.name
        self.scrapy_batch_item.batch_number = self.batch_number
        self.scrapy_batch_item.status = "scrapy_running"
        self.scrapy_batch_item.save()
Example #17
    def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        # I suspect web actions may be broken...
        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),

            #call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),

            #call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')
        ]
Example #18
def runspider():
	date = datetime.datetime.utcnow()
	unix_date = calendar.timegm(date.utctimetuple())
	
	route = request.args.get('route')
	domain = request.args.get('domain')
	
	directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
	
	if not os.path.exists(directory):
		os.makedirs(directory)
	
	logfile = open('testlog.log', 'w')
	log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
	log_observer.start()
	log.start(loglevel=logging.DEBUG)
	
	dispatcher.connect(stop_reactor, signal=signals.spider_closed)
	
	spider = MySpider(route, unix_date)
	
	settings_module = importlib.import_module('SiteCrawler.settings')
	settings = CrawlerSettings(settings_module)
	crawler = Crawler(settings)
	
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	
	log.msg('Running reactor...')
	reactor.run()  # the script will block here until the spider is closed
	log.msg('Reactor stopped.')
	return redirect(url_for('choose_graph', domain = domain, date = unix_date))
Example #19
def setup_crawler(
        spider_class,
        **kwargs
    ):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """

    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
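
A hedged usage sketch for the helper above; MySpider and its category keyword are assumptions for illustration, not part of the original code:

items = setup_crawler(MySpider, category="books")  # hypothetical spider class and kwarg
print "%d items collected" % len(items)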
Example #20
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Example #21
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Example #22
def spider_setup():
	spider=Lily_bbs()
	crawler=Crawler(Settings())
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	log.start()
	reactor.run()
Example #23
 def crawl(self, queue_object):
     ch, method, properties, body = yield queue_object.get()
     if body:
         spider = pickle.loads(body)  # loads: body is a raw bytes payload from the queue, not a file
         t = CrawlerProcess(spider)
         log.start()
         t.setup()
     yield ch.basic_ack(delivery_tag=method.delivery_tag)
Example #24
def crawl():
    spider = StackserviceSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here
Example #25
 def __init__(self):
     log.start('logfile')
     self.conn = sqlite3.connect('russia.db')
     self.c = self.conn.cursor()
     query = ''' CREATE TABLE IF NOT EXISTS kremlin(id INTEGER PRIMARY KEY, title TEXT, 
                 body TEXT, keywords TEXT, post_date DATE, 
                 link TEXT) '''
     self.c.execute(query)
Example #26
 def __init__(self,mailer=None):
     super(HemaSpider,self).__init__()
     log.start('d:/3.log', log.WARNING,logstdout=True)
     self.userid_pa = re.compile('uid-(\d+)')
     self.reply_pattern = re.compile('(\d+).*?(\d+)')
     self.post_time_pa = re.compile('</a>.*?(\d+-\d+-\d+).*?(\d+:\d+)',re.S)
     self.mail=mailer
     self.site_id=33
Example #27
def crawl():
	spider = StackserviceSpider()
	crawler = Crawler(Settings())
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	log.start()
	reactor.run() # the script will block here
Example #28
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Example #29
def crawl(spider_class):
    log.start()
    spider = spider_class()
    crawler = Crawler(CrawlerSettings(scrapy_settings_module))
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #30
 def _crawl( self, queue, search ):
     log.start( loglevel = log.DEBUG )
     current_spider = CraigslistSpider()
     if search:
         current_spider.set_search_url( search )
     self.crawler.crawl( current_spider )
     self.crawler.start()
     self.crawler.stop()
     queue.put( current_spider.get_object_list() )
Example #31
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
Example #32
    def __init__(self):
        CrawlSpider.__init__(self)
        log.start(logfile="./log/szlib-%s.log" % strftime("%m%d-%H-%M", localtime(time())), loglevel=log.INFO, logstdout=False)
        log.msg("szlibspider start")
        print "szlibspider start"
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox", "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver = Firefox()
        self.selenium.start(driver=ffdriver)
        # self.selenium.start()

        sel = self.selenium
        # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting page to load")

        # Wait for javascript to load in Selenium
        # time.sleep(20)
        # sel.wait_for_condition("condition by js", 20000);
        # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")

        elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit district_list']/a")
        num = "wijefowaeofjwejf SSL0011"
        selflibs_num = []
        for district in elements[1:]:
            log.msg("%s selflibs:" % district.text)
            log.msg("==================")
            district.click()
            WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load selflibs")
            selflibs_elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
            for selflib_ele in selflibs_elements:
                # num = selflib_ele.find_element_by_class_name("num").text
                num = selflib_ele.find_element_by_class_name("num").get_attribute("textContent")
                log.msg("num %s" % num)
                selflibs_num.append(num[-7:])
                log.msg("numid %s" % num[-7:])
                log.msg("%s" % selflib_ele.find_element_by_class_name("title").get_attribute("textContent"))
                log.msg("%s" % selflib_ele.find_element_by_class_name("text").get_attribute("textContent"))
                log.msg("---------------")

        log.msg("------1---------")
        # ffdriver.quit()
        # numstr = unicode("编号","utf-8")
        # numstr = unicode(num,"utf-8")
        # log.msg("numstr is in num? %s" % (numstr in num))
        # log.msg("%s,%s, %s" % (num,num[1], num[-7:]))

        for selflibnum in selflibs_num:
            selflib_url = "http://www.szlib.gov.cn/libraryNetwork/dispSelfLibBook/id-5/%s.html" % selflibnum
            log.msg("selflib url %s" % selflib_url)
            ffdriver.get(selflib_url)
            WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load booklist")
            categorys_elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit category']/a")
            for category_ele in categorys_elements[1:]:
                log.msg("%s" % category_ele.text)
Example #33
 def runSpider(self):
     dispatcher.connect(SpiderRunner.stop_reactor, signal=signals.spider_closed)
     crawler = self.__crawler
     crawler.crawl(self.__spider)
     crawler.start()
     log.start()
     log.msg("Starting spider...")
     reactor.run()
     log.msg("Stopped spider.")
Example #34
def sequentialCrawling():
    config = Configuration()
    spiders = getSpiderObjects(config)
    settings = get_project_settings()
    for spider in spiders:
        args = [spider, settings]
        runSpider(args)
    log.start()
    reactor.run() # the script will block here until the spider_closed signal was sent
Example #35
 def __init__(self):
     log.start()
     self.conn = MySQLdb.connect(user=settings.MYSQL_DATABASE['user'], \
         passwd=settings.MYSQL_DATABASE['passwd'], \
         db=settings.MYSQL_DATABASE['db'], \
         host=settings.MYSQL_DATABASE['host'], \
         charset=settings.MYSQL_DATABASE['charset'], \
         use_unicode=settings.MYSQL_DATABASE['use_unicode'])
     self.cursor = self.conn.cursor()
Example #36
    def run(self):
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        for spiderName in crawler.spiders.list():
            self.spiderCounter += 1
            self.setupCrawler(spiderName)
 
        reactor.run()
Example #37
    def run(self):
        crawler = Crawler(get_project_settings())
        crawler.configure()
        log.start()
        for spiderName in crawler.spiders.list():
            self.spiderCounter += 1
            self.setupCrawler(spiderName)

        reactor.run()
Example #38
def get_more_entropy():
  spider = TruenetSpider(domain='truenet.co.nz')
  settings = get_project_settings()
  crawler = Crawler(settings)
  crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
  crawler.configure()
  crawler.crawl(spider)
  crawler.start()
  log.start()
  reactor.run()
Example #39
def startspider(name):
    spider = tbs1(name)
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop,signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
    print crawler
Example #40
 def handle(self, *args, **options):
     spider = HHSearchResultsSpider()
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     crawler.start()
     log.start()
     reactor.run() # the script will block here until the spider_closed signal was sent
Example #41
def run_login_spider(seed_url, username, password, db_name, logfile = "results.log"):

    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url = seed_url, username = username, password = password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level = log.INFO)
    reactor.run()
Example #42
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes',
                               output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError(
            'Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path)
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response
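
Assuming do_scrape is registered as a Celery task (its decorator is not shown here), it would typically be queued roughly like this; the spider name is a placeholder:

# enqueue the scrape for a Celery worker to pick up
result = do_scrape.delay('my_spider')
# or call it synchronously, which the MANUAL_RUN job-id fallback above covers
output_path = do_scrape('my_spider')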
Example #43
def setup_crawler(keyword):
    print 'schedule run script is running.........'
    spider = BaiduSpider(keyword=keyword)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
Example #44
def setup_crawler(keywords):
    spider = BaiduSpider(keywords=keywords)
    settings = get_project_settings()
    crawler = Crawler(settings)
    # stop reactor when spider closes
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
Example #45
def run_main():
  log.start()
  InitLog()
  settings = get_project_settings()
  crawler = Crawler(settings)
  spider = JobKeySpider.from_crawler(crawler)
  crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
  crawler.configure()
  crawler.crawl(spider)
  crawler.start(close_if_idle = True)
  reactor.run() # the script will block here until the spider_closed signal was sent
Example #46
  def crawla(self):
	#dispatcher.connect(reactor.stop(), signal=signals.spider_closed)
	spider = Titlespider()
	settings = get_project_settings()
	crawler = Crawler(settings)
	crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	log.start()
	reactor.run()
Example #47
def run():
    spider = thSpider(domain='cn-proxy.com')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here until the spider_closed signal was sent
Example #48
def _cron_kaohsiung():
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = KaohsiungSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
Example #49
    def __init__(self, info_log=None):
        if info_log == None:
            raise NotConfigured("CityScrapy类中: 参数info_log不能为空")

        super(CityScrapy, self).__init__()
        log.start(logfile=info_log, loglevel=log.INFO, logstdout=False)

        scrapy_item = ScrapyItem()
        scrapy_item.scrapy_name = self.name
        if scrapy_item.is_existed_scrapy_name() is False:
            scrapy_item.save()
Example #50
def setup_crawler():
    # spider = FollowAllSpider(domain=domain)
    spider = zackSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    log.start()
    reactor.run()
Example #51
def get_connected_devices(ip_address, password, loglevel="WARNING"):
    spider = ConnectedDeviceSpider(ip_address, password)
    collector = ItemCollector()
    crawler = Crawler(Settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.signals.connect(collector.add_item, signals.item_passed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=loglevel, logstdout=False)
    reactor.run()  # the script will block here
    return collector.items
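
The ItemCollector helper is not shown in this snippet; a minimal sketch of what it presumably looks like (an assumption for illustration, not the original class):

class ItemCollector(object):
    """Collects items handed over by the item_passed signal."""
    def __init__(self):
        self.items = []

    def add_item(self, item):
        self.items.append(item)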
Example #52
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    
    log.start()
    # start engine scrapy/twisted
    crawler.start()
Example #53
def run_spider(origin='', destination='', departure_date='', return_date=''):
    spider = SkyScannerOriginDestinationSpider(
            origin=origin,
            destination=destination,
            departure_date=compact_date(departure_date),
            return_date=compact_date(return_date),
        )
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
Example #54
def main():
    import sys
    sys.path.append("/home/scriptrunner/")
    spider = SexoffSpider(county='ORANGE')
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'scraper.StoreItemsPipeline': 1000})
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
Example #55
    def parse_item(self, response):
        """ This function parses a sample response. Some contracts are mingled
        with this docstring.

        @url http://www.jianshu.com/p/b851e04de659
        @returns items 1 16
        @scrapes  author content title url datetime wordnum views_count
        comments_count likes_count followers_count total_likes_count rank
        """

        item = JianshuItem()
        log.start(logfile='log.txt', loglevel=log.INFO)
        log.msg('RequestURL:%s' % response.url, spider=JSSpider)
        contents = response.xpath('//div[contains(@class, "preview")]')[0]
        item['title'] = contents.xpath(
            'h1[contains(@class,"title")]/text()').extract()[0]
        item['author'] = contents.xpath(
            'div/a[contains(@class,"author-name")]/span/text()').extract()[0]
        item['datetime'] = contents.xpath(
            'div[contains(@class,"author-info")]/span/text()').extract()[1]
        pagecons = response.xpath('//div[contains(@class, "show-content")]/p')
        item['content'] = pagecons.extract()
        item['url'] = response.url
        scriptlists = response.xpath(
            '//script[contains(@data-name,"note")]/text()').extract()
        scriptlist6 = scriptlists[0].strip().split(',')[-6:]
        newscripts = []
        for script in scriptlist6:
            newscripts += script.encode('utf8').split(':')
        newscript = [n.replace('"', '') for n in newscripts]
        newdict = dict(newscript[i:i + 2] for i in range(0, len(newscript), 2))
        item['wordnum'] = newdict.get('wordage')
        item['views_count'] = newdict.get('views_count')
        item['likes_count'] = newdict.get('likes_count')
        item['comments_count'] = newdict.get('comments_count')
        followersandtotallikes = response.xpath(
            '//script[contains(@data-name,"author")]/text()').extract()
        followersandtotallikes2 = followersandtotallikes[0].strip().split(
            ',')[-3:-1]
        newfollowersandtotallikes2 = []
        for followersandlikes in followersandtotallikes2:
            newfollowersandtotallikes2 += followersandlikes.encode(
                'utf8').split(':')
        followerslikes = [
            n.replace('"', '') for n in newfollowersandtotallikes2
        ]
        followerslikesdict = dict(followerslikes[i:i + 2]
                                  for i in range(0, len(followerslikes), 2))
        item['followers_count'] = followerslikesdict.get('followers_count')
        item['total_likes_count'] = followerslikesdict.get('total_likes_count')
        return item
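
The @url, @returns and @scrapes lines in the docstring above are Scrapy spider contracts; they can be exercised without a full crawl via Scrapy's built-in check command, e.g. scrapy check jianshu (the spider name here is a guess).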
Example #56
def sitecrawl(request):
    if 'q' in request.GET:
        q = request.GET['q']

        spider = testspider(domain=q)  # pass the requested domain, not the literal string 'q'
        crawler = Crawler(Settings())
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start()
        reactor.run()
        return render(request, 'sitescrawl.html')
    else:
        return render(request, 'sitescrawl.html')
Example #57
def crawl_resident_advisor():

    global spider_count
    spider_count = 0

    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(linkedin())
    crawler.start()

    log.start()
    log.msg('Running in reactor...')
    reactor.run()  # the script will block here
    log.msg('Reactor stopped.')
Example #58
	def run(self):
		dispatcher.connect(self.stop_reactor, signal=signals.spider_closed)
		spider = PriceSpider(self.str)
		testset = Settings()
		testset.set("ITEM_PIPELINES",{
		    'tutorial.pipelines.MySQLStorePipeline': 1
		})
		crawler = Crawler(testset)
		crawler.configure()
		crawler.crawl(spider)
		crawler.start()
		log.start()
		log.msg('Running reactor...')
		reactor.run(installSignalHandlers=0)  # the script will block here until the spider is closed
		log.msg('Reactor stopped.')