def _do(self):
    """Advance the process counter stored under ``const.PROCESSCNTKEY``.

    The current value is read through ``MemcacheUtil.get``. A missing
    value (``None``) starts the counter at 1; any other value is
    increased by one. The result is written back with
    ``MemcacheUtil.set`` and logged at debug level.

    Returns:
        Always ``True``.
    """
    stored = MemcacheUtil.get(const.PROCESSCNTKEY)
    updated = 1 if stored is None else stored + 1
    MemcacheUtil.set(const.PROCESSCNTKEY, updated)
    logger.debug("process cnt:" + str(updated))
    return True
def _do(self):
    """Increment the counter kept under ``const.PROCESSCNTKEY``.

    Reads the counter via ``MemcacheUtil.get``; when nothing is stored
    (``None``) the counter becomes 1, otherwise it grows by one. The new
    value is stored via ``MemcacheUtil.set`` and emitted to the debug log.

    Returns:
        Always ``True``.
    """
    key = const.PROCESSCNTKEY
    current = MemcacheUtil.get(key)
    # No stored value yet means this is the first increment.
    following = current + 1 if current is not None else 1
    MemcacheUtil.set(key, following)
    logger.debug("process cnt:" + str(following))
    return True
def testGetUrl():
    """Manually exercise page fetching and scanning on one hard-coded URL.

    Calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``, instantiates
    ``SpiderBloomFilter``, fetches the page with ``GetWords.get_content``,
    hands it to ``UrlScan.scanpage`` and passes every returned item to
    ``PyMongoUtil.write``. Finally prints how many items were returned.
    """
    target = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    page = GetWords.get_content(target)
    found = UrlScan.scanpage(page, target, None)
    for entry in found:
        PyMongoUtil.write(entry, [""])
    print(len(found))
def testGetUrl():
    """Fetch one fixed page, scan it, store the results and print the count.

    Order of operations: ``PyMongoUtil.clean()``, ``MemcacheUtil.clean()``,
    a ``SpiderBloomFilter()`` instantiation, ``GetWords.get_content`` on the
    fixed URL, ``UrlScan.scanpage`` on the fetched content, then one
    ``PyMongoUtil.write`` call per scanned item.
    """
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    content = GetWords.get_content(page_url)
    scanned = UrlScan.scanpage(content, page_url, None)
    for item in scanned:
        PyMongoUtil.write(item, [""])
    print(len(scanned))
def trytry():
    """Run one crawl of a hard-coded page in regex mode, then count words.

    Steps, in order:
      1. ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``.
      2. Instantiate ``SpiderBloomFilter``.
      3. Obtain the queue and lock from ``PyPool`` and create a
         ``MyListener``.
      4. Build a ``SpiderStrategy`` for the fixed URL using a ``Regex``
         mode and start ``Spider(...).get_all_words(queue, lock)``.
      5. Call ``listener.listen(lock, queue)``.
      6. Call ``WordCount.calc_count()``.

    Returns:
        None.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()

    # Raw string literal: the pattern contains backslash sequences that are
    # not valid string escapes. As a raw string the pattern value is
    # unchanged, but newer interpreters no longer warn about (or reject)
    # the literal.
    mode = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    strategy = SpiderStrategy(
        "http://www.leakedin.com/tag/emailpassword-dump/",
        2,
        is_out=False,
        pattern=None,
        mode=mode,
    )

    Spider(strategy).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
from SpiderUtils.bloomFilter import SpiderBloomFilter from SpiderUtils.spider import Spider from SpiderUtils.spiderStrategy import SpiderStrategy from Statics.wordCount import WordCount from Utils.logFactory import LogFactory from SpiderUtils.SpiderMode.regexMode import Regex from SpiderUtils.enums import Language from SpiderUtils.getWords import GetWords from PyIO.excelUtil import ExcelUtil from os import path logger = LogFactory.getlogger("main") # clean old data PyMongoUtil.clean() MemcacheUtil.clean() # create bloom filter SpiderBloomFilter() # multitask prepare queue = PyPool.get_queue() lock = PyPool.get_lock() listener = MyListener() def err(): print("please enter the right select") while True:
try: lock.acquire() size = queue.qsize() size = PyPool.limit if size > PyPool.limit else size for num in range(0, size): strategy = queue.get_nowait() ProcessCntIncrease().lock_and_do() self.__pool.apply_async(apply_spider, (strategy, queue, lock)) except Exception, e: logger.error(e) finally: lock.release() try: lock.acquire() length = MemcacheUtil.get(const.PROCESSCNTKEY) size = queue.qsize() if size == 0 and (length is None or length == 0): loop_flag = False else: time.sleep(3) except Exception, e: logger.error(str(e)) finally: lock.release() self.__pool.close() logger.info("start to wait for all processes") self.__pool.join() return
lock.acquire() size = queue.qsize() size = PyPool.limit if size > PyPool.limit else size for num in range(0, size): strategy = queue.get_nowait() ProcessCntIncrease().lock_and_do() self.__pool.apply_async(apply_spider, (strategy, queue, lock)) except Exception, e: logger.error(e) finally: lock.release() try: lock.acquire() length = MemcacheUtil.get(const.PROCESSCNTKEY) size = queue.qsize() if size == 0 and (length is None or length == 0): loop_flag = False else: time.sleep(3) except Exception, e: logger.error(str(e)) finally: lock.release() self.__pool.close() logger.info("start to wait for all processes") self.__pool.join() return