def get_words(self, url):
    """Fetch ``url``, extract its words and store the analysed result.

    The page content comes from ``GetWords.get_content(url)``. Every item
    returned by ``self.catch_words(html)`` is passed through
    ``self.analyze`` and the combined list is handed to
    ``PyMongoUtil.write(url, wlist)``.

    The fetch happens outside the ``try`` block, so an error raised by
    ``GetWords.get_content`` propagates to the caller. Any exception raised
    while extracting, analysing or writing is logged (the URL followed by
    the exception text) and is not re-raised.

    :param url: address of the page to process.
    """
    html = GetWords.get_content(url)
    try:
        wlist = []
        for wd in self.catch_words(html):
            # analyze() yields an iterable per word; flatten into one list.
            wlist.extend(self.analyze(wd))
        PyMongoUtil.write(url, wlist)
    except Exception as e:
        # "as" form is valid on Python 2.6+ and Python 3; the old
        # "except Exception, e" spelling is a SyntaxError on Python 3.
        logger.error(url + " " + str(e))
def testGetUrl():
    """Scan one fixed tag page for URLs and write each of them out.

    Calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``, instantiates
    ``SpiderBloomFilter`` (the instance is discarded; presumably this sets
    up shared filter state -- confirm), fetches the page with
    ``GetWords.get_content`` and passes the HTML to ``UrlScan.scanpage``.
    Each returned item is written with ``PyMongoUtil.write(item, [""])`` and
    the number of items is printed.
    """
    # Single definition of the page address; it is both fetched and passed
    # to the scanner.
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    html = GetWords.get_content(page_url)
    # Named "urls" rather than "list" so the builtin is not shadowed.
    urls = UrlScan.scanpage(html, page_url, None)
    for found in urls:
        PyMongoUtil.write(found, [""])
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(len(urls))
def testGetUrl():
    """Scan one fixed tag page for URLs and write each of them out.

    Steps, in order: ``PyMongoUtil.clean()``, ``MemcacheUtil.clean()``,
    instantiate ``SpiderBloomFilter`` (result discarded; presumably it
    initialises shared filter state -- confirm), fetch the page via
    ``GetWords.get_content``, scan it with ``UrlScan.scanpage``, write every
    returned item with ``PyMongoUtil.write(item, [""])`` and print how many
    items were returned.
    """
    # The same address is fetched and then passed to the scanner, so keep
    # it in one place.
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    html = GetWords.get_content(page_url)
    # Avoid the name "list": it would hide the builtin inside this function.
    urls = UrlScan.scanpage(html, page_url, None)
    for found in urls:
        PyMongoUtil.write(found, [""])
    # print(...) with one argument behaves identically on Python 2 and 3.
    print(len(urls))
def trytry():
    """Run one spider pass over a fixed tag page and compute word counts.

    Calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``, instantiates
    ``SpiderBloomFilter`` (result discarded), obtains a queue and a lock
    from ``PyPool``, builds a ``SpiderStrategy`` using a ``Regex`` mode,
    runs ``Spider(strategy).get_all_words(queue, lock)``, then
    ``listener.listen(lock, queue)`` and finally ``WordCount.calc_count()``.

    Returns ``None``.
    """
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()

    # Raw string: the pattern text is unchanged, but "\-", "\." and "\|" are
    # no longer invalid string escapes (which warn on modern Python).
    mode = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")
    # NOTE(review): the positional 2 is presumably a crawl depth -- confirm
    # against SpiderStrategy's signature.
    strategy = SpiderStrategy(
        "http://www.leakedin.com/tag/emailpassword-dump/",
        2,
        is_out=False,
        pattern=None,
        mode=mode
    )

    Spider(strategy).get_all_words(queue, lock)
    listener.listen(lock, queue)
    WordCount.calc_count()
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.enums import Language
from SpiderUtils.getWords import GetWords
from PyIO.excelUtil import ExcelUtil
from os import path

# Module-level logger obtained under the name "main".
logger = LogFactory.getlogger("main")

# Clean old data. This runs at module load, before anything else below.
PyMongoUtil.clean()
MemcacheUtil.clean()

# Create the bloom filter. The instance is not kept here.
# NOTE(review): presumably the constructor registers shared state -- confirm.
SpiderBloomFilter()

# Multitask preparation: queue, lock and listener used by the rest of the
# module.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


def err():
    """Print a message asking the user to enter a valid selection."""
    print("please enter the right select")