示例#1
0
class WordCount:
    """Map-reduce word counter over the spider's MongoDB collection.

    Runs a server-side JavaScript map/reduce pair: the mapper emits one
    (word, 1) pair per document's ``content`` field and the reducer sums
    the counts per word into the ``result`` collection.
    """

    logger = LogFactory.getlogger("WordCount")

    # Emits (word, 1) for each spider document; ``this.content`` holds the word.
    mapper = Code("""
        function() {
            emit(this.content, 1);
        }
    """)

    # Sums all emitted counts for one key.
    reducer = Code("""
        function(key, values) {
            var sum = 0;
            values.forEach(function(value) {
                sum += Number(value);
            });
            return sum;
        };
    """)

    @staticmethod
    def calc_count():
        """Run the map-reduce job, writing totals to the ``result`` collection."""
        WordCount.logger.info("start to count words")
        ip = Config.getProperty('mongo', 'addr')
        port = int(Config.getProperty('mongo', 'port'))
        client = pymongo.MongoClient(ip, port)
        try:
            db = client.spiderDB
            collection = db.spider
            collection.map_reduce(WordCount.mapper, WordCount.reducer,
                                  out="result", full_response=True)
        finally:
            # Bug fix: the original never released the connection; close it
            # even when map_reduce raises.
            client.close()
示例#2
0
文件: spider.py 项目: zzmzz/SpiderZ
#!/usr/bin/python
# coding=utf-8
from urlparse import *

from SpiderUtils.getWords import GetWords
from Utils.logFactory import LogFactory
from enums import Language
from getUrls import UrlScan
from spiderStrategy import SpiderStrategy
from SpiderUtils.modeFactory import ModeFactory

logger = LogFactory.getlogger("Spider")


class Spider:
    """Web spider configured from a SpiderStrategy.

    NOTE(review): this class appears truncated in this excerpt — the branch
    handling an explicit ``pattern`` (and out-of-site crawls) is not visible.
    """

    # Class-level defaults; each is shadowed by an instance attribute in
    # __init__ (double-underscore names are name-mangled, i.e. private).
    __isout = False
    __url = ""
    __depth = 1
    __url_pattern = None
    __mode = None

    def __init__(self, strategy=SpiderStrategy()):
        # NOTE(review): the default SpiderStrategy() is evaluated once at
        # import time and shared across all calls — confirm the strategy
        # object is never mutated by a Spider instance.
        self.__isout = strategy.is_out
        self.__url = strategy.url
        self.__depth = strategy.depth
        self.__mode = strategy.mode
        pattern = strategy.pattern
        if pattern is None:
            if strategy.is_out is False:
                # In-site crawl with no explicit pattern: restrict followed
                # links to the start URL's host (netloc).
                r = urlparse(strategy.url)
                self.__url_pattern = r.netloc
示例#3
0
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
from WordSplit.splitAdapter import SplitAdapter

logger = LogFactory.getlogger("ChineseMode")


class ChineseMode(AbstractMode):
    """Language mode that extracts Chinese text and word-splits it."""

    def __init__(self):
        super(ChineseMode, self).__init__()

    def catch_words(self, html):
        """Pull all Chinese character runs out of *html*."""
        return GetWords.get_chinese(html)

    def analyze(self, word):
        """Split one Chinese run into individual words."""
        return SplitAdapter.split(word)
示例#4
0
import time
from abc import ABCMeta, abstractmethod

from Utils.logFactory import LogFactory
from memcacheUtil import MemcacheUtil

logger = LogFactory.getlogger("LockModel")


class LockModel:
    """Template for operations that must run under a memcache-based lock.

    Subclasses implement ``_do``; ``lock_and_do`` spins until it wins the
    lock, runs ``_do``, and always releases the lock afterwards.

    NOTE(review): truncated in this excerpt — the tail of the while loop
    (loop exit / back-off while the lock is held) is not visible.
    """
    __metaclass__ = ABCMeta  # Python 2 ABC registration

    # Class-level default key; overridden per instance in __init__.
    __lock_key = "WRITEKEY"

    def __init__(self, key):
        # Memcache key used as the mutual-exclusion token.
        self.__lock_key = key

    def lock_and_do(self):
        is_loop = True
        while is_loop:
            # print "loop",self.__lock_key
            if MemcacheUtil.get(self.__lock_key) is None:
                # ``add`` is atomic in memcache: only one process wins the
                # insertion, so a successful add means we hold the lock.
                if MemcacheUtil.add(self.__lock_key, True):
                    # logger.debug("get memcache lock: " + self.__lock_key)
                    result = None
                    try:
                        result = self._do()
                    except Exception as e:
                        logger.error(e)
                    finally:
                        # Always release the lock, even when _do raised.
                        MemcacheUtil.delete(self.__lock_key)
示例#5
0
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("ProcessCnt")


class ProcessCntIncrease(LockModel):
    """Under the process-count lock, bump the shared process counter by one."""

    def __init__(self):
        super(ProcessCntIncrease, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        """Increment the cached counter, treating a missing value as zero."""
        current = MemcacheUtil.get(const.PROCESSCNTKEY)
        current = 1 if current is None else current + 1
        MemcacheUtil.set(const.PROCESSCNTKEY, current)
        logger.debug("process cnt:" + str(current))
        return True


class ProcessCntReduce(LockModel):
    """Under the process-count lock, decrement the shared process counter."""

    def __init__(self):
        super(ProcessCntReduce, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        """Decrement the cached counter.

        Bug fix: the original did ``cnt -= 1`` unconditionally, raising
        TypeError when the key is absent (``get`` returns None).  Mirror
        ProcessCntIncrease and treat a missing value as zero.
        """
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        cnt = 0 if cnt is None else cnt - 1
        MemcacheUtil.set(const.PROCESSCNTKEY, cnt)
        # Debug trace kept symmetric with ProcessCntIncrease._do.
        logger.debug("process cnt:" + str(cnt))
示例#6
0
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.enums import Language
from SpiderUtils.getWords import GetWords
from PyIO.excelUtil import ExcelUtil
from os import path

logger = LogFactory.getlogger("main")

# Clean out any state left over from a previous run.
PyMongoUtil.clean()
MemcacheUtil.clean()

# Create the shared bloom filter (instantiated for its side effect only;
# the instance itself is deliberately discarded).
SpiderBloomFilter()

# Multitask prepare: shared queue/lock plus the listener that drains them.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


# NOTE(review): excerpt truncated — err()'s body is not visible here.
def err():
示例#7
0
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
import re
from Utils.htmlUtil import HtmlUtil

logger = LogFactory.getlogger("EnglishMode")


class EnglishMode(AbstractMode):
    """Language mode that extracts English words from raw HTML."""

    def __init__(self):
        super(EnglishMode, self).__init__()

    def catch_words(self, html):
        """Strip HTML tags, then pull lowercase word tokens from the text."""
        raw = HtmlUtil.filter_tags(html)
        return GetWords.get_english(raw)

    def analyze(self, word):
        """Keep *word* only if it is purely alphanumeric-free of digits and
        symbols and longer than 4 characters.

        Returns a (possibly empty) list so callers can ``extend`` uniformly.
        """
        # Fix: raw strings for the regexes — the originals relied on "\d"
        # and "\W" surviving as literal backslash sequences, which is
        # fragile and deprecated in newer Pythons.
        has_digit = re.search(r"\d+", word)
        has_symbol = re.search(r"\W+", word)
        if not has_digit and not has_symbol and len(word) > 4:
            return [word]
        return []
示例#8
0
文件: listener.py 项目: zzmzz/SpiderZ
import time

from ProcessPool.pool import PyPool
from SpiderUtils.spider import Spider
from Utils.logFactory import LogFactory
from PyMemcached.Locks.processCntLock import ProcessCntReduce, ProcessCntIncrease
from PyMemcached.memcacheUtil import MemcacheUtil
from Consts.cacheKeyConstants import const
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("MyListener")


class MyListener:
    """Drains the spider strategy queue and dispatches work to the pool.

    NOTE(review): ``listen`` is truncated in this excerpt — the loop body
    continues beyond the visible lines.
    """

    # Class-level placeholders; __pool is set per instance in __init__.
    __lock = None
    __queue = None
    __wait_cnt = 10

    def __init__(self):
        self.__pool = PyPool.get_pool()

    def listen(self, lock, queue):
        # Pump loop: under the shared lock, take up to PyPool.limit queued
        # strategies per round and account for each spawned task.
        loop_flag = True
        while loop_flag:
            try:
                lock.acquire()
                size = queue.qsize()
                # Never dispatch more than the pool's per-round limit.
                size = PyPool.limit if size > PyPool.limit else size
                for num in range(0, size):
                    strategy = queue.get_nowait()
                    ProcessCntIncrease().lock_and_do()
示例#9
0
文件: test.py 项目: zzmzz/SpiderZ
from Consts.cacheKeyConstants import const
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.enums import Language
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.getUrls import UrlScan
from SpiderUtils.getWords import GetWords
import urllib, htmllib, formatter
logger = LogFactory.getlogger("test")
import re
from bs4 import BeautifulSoup
from PyIO.pyMongoUtil import PyMongoUtil


class Test:
    # NOTE(review): excerpt truncated — the scanpage(...) call below is cut
    # off mid-argument-list.
    @staticmethod
    def testGetUrl():
        # Reset persistent state so the test starts from a clean slate.
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        html = GetWords.get_content(
            "http://www.leakedin.com/tag/emailpassword-dump/")
        # NOTE(review): 'list' shadows the builtin of the same name.
        list = UrlScan.scanpage(
示例#10
0
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory
from SpiderUtils.bloomFilter import SpiderBloomFilter

logger = LogFactory.getlogger("BloomFilterLock")


class BloomFilterLock(LockModel):
    """Checks a URL against the shared bloom filter under the URL write lock."""

    __url = None  # URL under test

    def __init__(self, url):
        self.__url = url
        super(BloomFilterLock, self).__init__(const.URLWRITEKEY)

    def _do(self):
        """Return False for an already-seen URL, True for a fresh one."""
        if not SpiderBloomFilter.exists(self.__url):
            logger.debug("access url: " + self.__url)
            return True
        logger.debug("dup url: " + self.__url)
        return False
示例#11
0
文件: test.py 项目: gandaruvu/SpiderZ
from Consts.cacheKeyConstants import const
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.enums import Language
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.getUrls import UrlScan
from SpiderUtils.getWords import GetWords
import urllib, htmllib, formatter
logger = LogFactory.getlogger("test")
import re
from bs4 import BeautifulSoup
from PyIO.pyMongoUtil import PyMongoUtil

class Test:
    # NOTE(review): excerpt truncated — the for-loop below has no visible body.
    @staticmethod
    def testGetUrl():
        # Reset persistent state before exercising the URL scanner.
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()

        html = GetWords.get_content("http://www.leakedin.com/tag/emailpassword-dump/")
        # NOTE(review): 'list' shadows the builtin list type.
        list = UrlScan.scanpage(html,"http://www.leakedin.com/tag/emailpassword-dump/",None)

        for l in list:
示例#12
0
#!/usr/bin/python
# coding=utf-8
from abc import ABCMeta, abstractmethod
import sys
from SpiderUtils.getWords import GetWords
from Utils.logFactory import LogFactory
from PyIO.pyMongoUtil import PyMongoUtil

reload(sys)
sys.setdefaultencoding('utf-8')

logger = LogFactory.getlogger("Mode")


class AbstractMode:
    """Abstract language mode: fetch a page, extract words, persist them.

    Subclasses implement ``catch_words`` (raw token extraction from HTML)
    and ``analyze`` (per-token refinement into final words).
    """
    __metaclass__ = ABCMeta  # Python 2 ABC registration

    def __init__(self):
        pass

    def get_words(self, url):
        """Fetch *url*, extract and analyze its words, write them to Mongo.

        Returns the fetched HTML regardless of whether extraction succeeded;
        extraction failures are logged, not raised (best-effort pipeline).
        """
        html = GetWords.get_content(url)
        try:
            words = self.catch_words(html)
            wlist = []
            for wd in words:
                wlist.extend(self.analyze(wd))
            PyMongoUtil.write(url, wlist)
        except Exception as e:
            # Fix: the Py2-only ``except Exception, e`` spelling replaced by
            # the ``as`` form, valid on Python 2.6+ and Python 3.
            logger.error(url + " " + str(e))
        return html
示例#13
0
import time

from ProcessPool.pool import PyPool
from SpiderUtils.spider import Spider
from Utils.logFactory import LogFactory
from PyMemcached.Locks.processCntLock import ProcessCntReduce, ProcessCntIncrease
from PyMemcached.memcacheUtil import MemcacheUtil
from Consts.cacheKeyConstants import const
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("MyListener")


class MyListener:
    __lock = None
    __queue = None
    __wait_cnt = 10

    def __init__(self):
        self.__pool = PyPool.get_pool()

    def listen(self, lock, queue):
        loop_flag = True
        while loop_flag:
            try:
                lock.acquire()
                size = queue.qsize()
                size = PyPool.limit if size > PyPool.limit else size
                for num in range(0, size):
                    strategy = queue.get_nowait()
                    ProcessCntIncrease().lock_and_do()
示例#14
0
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("ProcessCnt")

class ProcessCntIncrease(LockModel):
    """Locked increment of the shared process counter in memcache."""

    def __init__(self):
        super(ProcessCntIncrease, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        """Add one to the cached counter; a missing key counts as zero."""
        value = MemcacheUtil.get(const.PROCESSCNTKEY)
        if value is None:
            value = 0
        value += 1
        MemcacheUtil.set(const.PROCESSCNTKEY, value)
        logger.debug("process cnt:" + str(value))
        return True


class ProcessCntReduce(LockModel):
    """Locked decrement of the shared process counter.

    NOTE(review): truncated in this excerpt — the write-back after the
    decrement is not visible.  Also note ``cnt -= 1`` raises TypeError when
    the key is absent (``get`` returns None); the increase counterpart
    guards against that case.
    """

    def __init__(self):
        super(ProcessCntReduce, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        cnt -= 1
示例#15
0
#!/usr/bin/python
# coding=utf-8
from bs4 import BeautifulSoup
from PyMemcached.Locks.bloomFilterLock import BloomFilterLock
from Utils.logFactory import LogFactory
import htmllib, formatter, re

logger = LogFactory.getlogger("UrlScan")


class UrlScan:
    """Extracts candidate links from an HTML page, de-duplicating them
    through the shared bloom filter."""

    @staticmethod
    def scanpage(html, url, pattern=None):
        """Parse *html* (fetched from *url*) and return its unseen anchor links.

        pattern: regex each link must match; defaults to 'http' (any
        absolute link) once the first None is seen.
        Returns the list of links the bloom filter has not recorded before;
        on a parse failure the exception is only logged (implicit None).
        """
        try:
            # Record the page URL itself in the bloom filter first.
            BloomFilterLock(url).lock_and_do()
            results = []
            # NOTE(review): 'format' shadows the builtin of the same name.
            format = formatter.AbstractFormatter(formatter.NullWriter())
            ptext = htmllib.HTMLParser(format)
            ptext.feed(html)
            for link in ptext.anchorlist:
                if pattern is None:
                    pattern = 'http'
                r = re.findall(pattern, link)
                if r is None or len(r) == 0:
                    continue
                # lock_and_do returns True only for URLs not seen before.
                if BloomFilterLock(link).lock_and_do():
                    results.append(link)
            return results
        except Exception, e:
            logger.error("catch urls exception url: " + url + " error: " + str(e))
示例#16
0
文件: getWords.py 项目: zzmzz/SpiderZ
#!/usr/bin/python
# coding=utf-8
import re
import sys
import chardet
from Utils.logFactory import LogFactory
import urllib2
import zlib

reload(sys)
sys.setdefaultencoding('utf-8')

logger = LogFactory.getlogger("GetWords")


class GetWords:
    """Fetches pages and extracts word tokens per language.

    NOTE(review): truncated in this excerpt — __get_unicode_content and the
    other extractors referenced elsewhere are defined beyond the visible
    lines.
    """

    # Browser-like User-Agent so sites don't reject the crawler.
    # Bug fix: the original read ``headers = headers = {...}`` — a redundant
    # double binding collapsed to a single assignment.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}

    @staticmethod
    def get_chinese(html):
        """Return all runs of CJK ideographs found in *html*.

        Raises Exception when no Chinese text is found at all.
        """
        raw = GetWords.__get_unicode_content(html)
        # u"" with \u escapes is byte-for-byte equivalent to the original
        # ur"" literal on Python 2 and, unlike ur"", is valid Python 3 syntax.
        words = re.findall(u"[\u4e00-\u9fa5]+", raw)
        if len(words) == 0:
            raise Exception("cannot find any words")
        return words

    @staticmethod
    def get_english(html):
        """Return the lowercase word tokens (``\\w+`` runs) of *html*."""
        return re.findall(r"\w+", str.lower(html))
示例#17
0
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
from WordSplit.splitAdapter import SplitAdapter

logger = LogFactory.getlogger("KoreanMode")


class KoreanMode(AbstractMode):
    """Language mode for Korean text: extracted runs need no further split."""

    def __init__(self):
        super(KoreanMode, self).__init__()

    def catch_words(self, html):
        """Collect Korean character runs from *html*."""
        return GetWords.get_korean(html)

    def analyze(self, word):
        """Korean extraction already yields words; wrap as a singleton list."""
        return [word]