Пример #1
0
 def _do(self):
     """Advance the process counter stored under ``const.PROCESSCNTKEY``.

     The current value is fetched through ``MemcacheUtil``.  When nothing
     is stored yet (``None``) the counter starts at 1; otherwise it is
     the fetched value plus one.  The new value is written back and
     logged at debug level.

     Returns:
         True, unconditionally.
     """
     previous = MemcacheUtil.get(const.PROCESSCNTKEY)
     current = 1 if previous is None else previous + 1
     MemcacheUtil.set(const.PROCESSCNTKEY, current)
     logger.debug("process cnt:" + str(current))
     return True
Пример #2
0
 def _do(self):
     """Increment the counter kept under ``const.PROCESSCNTKEY``.

     Reads the stored value via ``MemcacheUtil.get``; a missing value
     (``None``) is treated as "no count yet" and the counter becomes 1,
     any other value is increased by one.  The result is stored again
     with ``MemcacheUtil.set`` and written to the debug log.

     Returns:
         True, unconditionally.
     """
     stored = MemcacheUtil.get(const.PROCESSCNTKEY)
     updated = stored + 1 if stored is not None else 1
     MemcacheUtil.set(const.PROCESSCNTKEY, updated)
     logger.debug("process cnt:" + str(updated))
     return True
Пример #3
0
    def testGetUrl():
        """Scan one fixed page for URLs, store each result, print the count.

        Steps, in order: clear data via ``PyMongoUtil.clean`` and
        ``MemcacheUtil.clean``, instantiate ``SpiderBloomFilter``, fetch
        the page with ``GetWords.get_content``, pass it to
        ``UrlScan.scanpage``, write every returned item through
        ``PyMongoUtil.write`` and finally print how many items came back.
        """
        page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        page_html = GetWords.get_content(page_url)
        found = UrlScan.scanpage(page_html, page_url, None)

        for entry in found:
            PyMongoUtil.write(entry, [""])
        # Parenthesised form prints the same single value on Python 2.
        print(len(found))
Пример #4
0
    def testGetUrl():
        """Exercise URL scanning on a hard-coded page and persist the hits.

        Existing data is cleared through ``PyMongoUtil.clean`` and
        ``MemcacheUtil.clean`` and a ``SpiderBloomFilter`` is instantiated
        before the page is downloaded and scanned.  Each item returned by
        ``UrlScan.scanpage`` is written with ``PyMongoUtil.write``; the
        number of items is printed at the end.
        """
        target = "http://www.leakedin.com/tag/emailpassword-dump/"

        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        content = GetWords.get_content(target)
        links = UrlScan.scanpage(content, target, None)

        for link in links:
            PyMongoUtil.write(link, [""])
        # Parenthesised form prints the same single value on Python 2.
        print(len(links))
Пример #5
0
def trytry():
    """Run one crawl of the leakedin "emailpassword-dump" tag page.

    Clears data through ``PyMongoUtil.clean`` and ``MemcacheUtil.clean``,
    instantiates ``SpiderBloomFilter``, then starts a ``Spider`` built from
    a ``SpiderStrategy`` that uses a ``Regex`` mode matching
    e-mail-address-like text.  Afterwards ``MyListener.listen`` is called
    with the shared lock and queue, and ``WordCount.calc_count`` runs last.

    Returns:
        None.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()
    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()
    # Raw string: the pattern contains "\-", "\." and "\|", which are not
    # valid string escapes.  A plain literal only works because unknown
    # escapes are passed through, and newer interpreters warn about it.
    # The resulting pattern text is identical.
    email_mode = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    # NOTE(review): the positional 2 is presumably a crawl depth -- confirm
    # against SpiderStrategy's signature.
    strategy = SpiderStrategy(
        "http://www.leakedin.com/tag/emailpassword-dump/",
        2,
        is_out=False,
        pattern=None,
        mode=email_mode)
    Spider(strategy).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
Пример #6
0
def trytry():
    """Crawl the leakedin "emailpassword-dump" tag page once and count words.

    Order of operations: clear data via ``PyMongoUtil.clean`` and
    ``MemcacheUtil.clean``, instantiate ``SpiderBloomFilter``, obtain the
    queue and lock from ``PyPool``, start a ``Spider`` configured with a
    ``Regex`` mode for e-mail-address-like text, let ``MyListener.listen``
    run with the lock and queue, then call ``WordCount.calc_count``.

    Returns:
        None.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()
    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()
    # Raw string keeps the regex backslashes ("\-", "\.", "\|") literal
    # without relying on invalid string escapes being passed through, which
    # newer interpreters warn about.  The pattern text itself is unchanged.
    mode = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    # NOTE(review): the positional 2 is presumably a crawl depth -- confirm
    # against SpiderStrategy's signature.
    strategy = SpiderStrategy("http://www.leakedin.com/tag/emailpassword-dump/",
                              2,
                              is_out=False,
                              pattern=None,
                              mode=mode)
    Spider(strategy).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
Пример #7
0
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.enums import Language
from SpiderUtils.getWords import GetWords
from PyIO.excelUtil import ExcelUtil
from os import path

# Module-level setup: everything below runs at import time, in this order.
logger = LogFactory.getlogger("main")

# Clean old data through both storage helpers before anything else runs.
PyMongoUtil.clean()
MemcacheUtil.clean()

# Create the bloom filter. The instance is not kept here, so the call is
# made only for whatever the constructor itself does.
# NOTE(review): presumably it registers shared state -- confirm in SpiderBloomFilter.
SpiderBloomFilter()

# Multitask preparation: queue and lock obtained from PyPool, plus the
# listener object, all held as module globals.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


def err():
    """Print a fixed line asking the user to enter a valid selection."""
    message = "please enter the right select"
    print(message)


while True:
Пример #8
0
            try:
                lock.acquire()
                size = queue.qsize()
                size = PyPool.limit if size > PyPool.limit else size
                for num in range(0, size):
                    strategy = queue.get_nowait()
                    ProcessCntIncrease().lock_and_do()
                    self.__pool.apply_async(apply_spider, (strategy, queue, lock))
            except Exception, e:
                logger.error(e)
            finally:
                lock.release()

            try:
                lock.acquire()
                length = MemcacheUtil.get(const.PROCESSCNTKEY)
                size = queue.qsize()
                if size == 0 and (length is None or length == 0):
                    loop_flag = False
                else:
                    time.sleep(3)
            except Exception, e:
                logger.error(str(e))
            finally:
                lock.release()

        self.__pool.close()
        logger.info("start to wait for all processes")
        self.__pool.join()
        return
Пример #9
0
                lock.acquire()
                size = queue.qsize()
                size = PyPool.limit if size > PyPool.limit else size
                for num in range(0, size):
                    strategy = queue.get_nowait()
                    ProcessCntIncrease().lock_and_do()
                    self.__pool.apply_async(apply_spider,
                                            (strategy, queue, lock))
            except Exception, e:
                logger.error(e)
            finally:
                lock.release()

            try:
                lock.acquire()
                length = MemcacheUtil.get(const.PROCESSCNTKEY)
                size = queue.qsize()
                if size == 0 and (length is None or length == 0):
                    loop_flag = False
                else:
                    time.sleep(3)
            except Exception, e:
                logger.error(str(e))
            finally:
                lock.release()

        self.__pool.close()
        logger.info("start to wait for all processes")
        self.__pool.join()
        return