Exemplo n.º 1
0
 def get_words(self, url):
     """Fetch the page at *url*, extract its words and hand them to storage.

     The page content comes from ``GetWords.get_content``.  It is split into
     candidates by ``self.catch_words``; each candidate is expanded with
     ``self.analyze`` and the results are flattened into one list, which is
     passed to ``PyMongoUtil.write`` together with the URL.

     The fetch itself happens outside the ``try`` block, so an error raised
     by ``GetWords.get_content`` propagates to the caller.  Any exception
     raised while extracting or writing is logged (URL plus message) and
     swallowed.

     :param url: address of the page to process.
     :return: ``None``.
     """
     html = GetWords.get_content(url)
     try:
         wlist = []
         for wd in self.catch_words(html):
             wlist.extend(self.analyze(wd))
         PyMongoUtil.write(url, wlist)
     except Exception as e:
         # "except ... as e" is valid on Python 2.6+ and Python 3, unlike
         # the legacy "except Exception, e" spelling.
         logger.error(url + " " + str(e))
Exemplo n.º 2
0
    def testGetUrl():
        """Scan one hard-coded page and write every scanned item to storage.

        Steps, in order:

        1. Call ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``.
        2. Instantiate ``SpiderBloomFilter`` (the instance is discarded;
           presumably construction initialises shared state -- TODO confirm).
        3. Fetch the page with ``GetWords.get_content`` and pass the HTML to
           ``UrlScan.scanpage`` along with the page URL.
        4. Write each returned item via ``PyMongoUtil.write`` with a
           placeholder word list ``[""]`` and print how many items were found.
        """
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        # Single source of truth for the page under test (was duplicated).
        page_url = "http://www.leakedin.com/tag/emailpassword-dump/"
        html = GetWords.get_content(page_url)
        # Named "found" rather than "list" so the builtin is not shadowed.
        found = UrlScan.scanpage(html, page_url, None)

        for item in found:
            PyMongoUtil.write(item, [""])
        # Parenthesised form prints the same on Python 2 and is valid on 3.
        print(len(found))
Exemplo n.º 3
0
    def testGetUrl():
        """Scan one hard-coded page and write every scanned item to storage.

        The function first calls ``PyMongoUtil.clean()`` and
        ``MemcacheUtil.clean()`` and instantiates ``SpiderBloomFilter`` (the
        instance is discarded; presumably construction sets up shared state
        -- TODO confirm).  It then fetches the page with
        ``GetWords.get_content``, passes the HTML and the page URL to
        ``UrlScan.scanpage``, writes each returned item through
        ``PyMongoUtil.write`` with a placeholder word list ``[""]`` and
        finally prints the number of items returned.
        """
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        # Keep the target URL in one place instead of repeating the literal.
        target = "http://www.leakedin.com/tag/emailpassword-dump/"
        html = GetWords.get_content(target)
        # Avoid the name "list": it would hide the builtin type.
        scanned = UrlScan.scanpage(html, target, None)

        for entry in scanned:
            PyMongoUtil.write(entry, [""])
        # print(...) with one argument behaves identically on Python 2 and 3.
        print(len(scanned))
Exemplo n.º 4
0
def trytry():
    """Run one crawl of a hard-coded start page and then count words.

    Sequence of calls:

    1. ``PyMongoUtil.clean()`` / ``MemcacheUtil.clean()``, then a
       ``SpiderBloomFilter`` is instantiated and discarded.
    2. A queue and a lock are obtained from ``PyPool``; a ``MyListener`` is
       created.
    3. A ``Regex`` mode is built from a pattern matching an e-mail-like
       token followed by optional ``:``, ``,`` or ``|`` characters and the
       rest of the line.
    4. A ``SpiderStrategy`` for the start URL is built (second positional
       argument ``2``, ``is_out=False``, ``pattern=None``) and handed to
       ``Spider``, whose ``get_all_words`` receives the queue and lock.
    5. ``listener.listen(lock, queue)`` is called, followed by
       ``WordCount.calc_count()``.

    :return: ``None``.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()
    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()
    # Raw string: "\-", "\." and "\|" are not valid str escapes, so the
    # non-raw literal only worked by accident (and warns on modern Python).
    # The resulting pattern text is unchanged.
    r = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    s = SpiderStrategy("http://www.leakedin.com/tag/emailpassword-dump/", 2, is_out=False, pattern=None, mode=r)
    Spider(s).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
Exemplo n.º 5
0
def trytry():
    """Run one crawl of a hard-coded start page and then count words.

    The function calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``,
    instantiates (and discards) a ``SpiderBloomFilter``, obtains a queue and
    a lock from ``PyPool`` and creates a ``MyListener``.  It then builds a
    ``Regex`` mode whose pattern matches an e-mail-like token followed by
    optional ``:``, ``,`` or ``|`` characters and the rest of the line,
    wraps it in a ``SpiderStrategy`` for the start URL, and runs
    ``Spider(...).get_all_words(queue, lock)``.  Afterwards
    ``listener.listen(lock, queue)`` and ``WordCount.calc_count()`` are
    called.

    :return: ``None``.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()
    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()
    # Use a raw string so the regex backslashes ("\-", "\.", "\|") are not
    # treated as (invalid) string escapes; the pattern text is unchanged.
    r = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    s = SpiderStrategy("http://www.leakedin.com/tag/emailpassword-dump/",
                       2,
                       is_out=False,
                       pattern=None,
                       mode=r)
    Spider(s).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
Exemplo n.º 6
0
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.enums import Language
from SpiderUtils.getWords import GetWords
from PyIO.excelUtil import ExcelUtil
from os import path

# Module-level setup.  Everything below runs as a side effect of importing
# or executing this module, in the order written.

# Logger obtained from the project's LogFactory under the name "main".
logger = LogFactory.getlogger("main")

# Clean old data: calls the clean() helpers of the Mongo and Memcache
# utilities before anything else runs.
PyMongoUtil.clean()
MemcacheUtil.clean()

# Create the bloom filter.  The instance is not kept, so construction
# presumably initialises shared state -- TODO confirm.
SpiderBloomFilter()

# Multitask preparation: queue and lock come from PyPool, plus a listener
# instance; all three are bound as module-level names.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


def err():
    """Print a fixed message asking the user to enter a valid selection."""
    message = "please enter the right select"
    print(message)