Exemplo n.º 1
0
    def __init__(self, pool, maxHostID, monitorInterval=2):
        """Create the SPM-side mailbox monitor for *pool*.

        pool            -- storage pool object; spUUID and
                           storage_repository are read from it.
        maxHostID       -- highest host ID served; sizes the outgoing mail
                           buffer (MAILBOX_SIZE per host).
        monitorInterval -- seconds between mailbox polls (default 2).

        Raises RuntimeError if the pool's inbox or outbox file does not
        exist.  Starts a background monitor thread before returning.
        """
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        # Half the configured pool size; the other half is presumably used
        # by another component -- TODO confirm.
        tpSize = config.getint('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        #  *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
        #                      versa *** #
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                           "exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                               "not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        # Start with zeroed in/out mail buffers.
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        # dd commands using direct I/O to bypass the page cache on the
        # shared mailbox files.
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                       "%s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            # Best-effort: a dirty outbox is logged but not fatal.
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")

        t = concurrent.thread(self.run, name="mailbox.SPMMonitor",
                              logger=self.log.name)
        t.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
Exemplo n.º 2
0
    def __init__(self, args=None):
        """Initialise the crawler from a Strategy-style *args* object.

        args -- configuration object exposing url, max_depth, max_count,
                concurrency, timeout, cookies, ssl_verify, same_host,
                same_domain, keyword and dbFile.  Defaults to a fresh
                Strategy(); the previous ``args=Strategy()`` default was
                evaluated once at definition time and shared between all
                calls (mutable-default bug).
        """
        if args is None:
            args = Strategy()
        self.url = args.url
        self.max_depth = args.max_depth  # maximum crawl depth
        self.max_count = args.max_count  # maximum number of pages to crawl
        self.concurrency = args.concurrency  # number of worker threads
        self.timeout = args.timeout  # request timeout
        self.cookies = args.cookies  # cookies
        self.ssl_verify = args.ssl_verify  # ssl verification flag
        self.same_host = args.same_host  # restrict crawl to the same host
        self.same_domain = args.same_domain  # restrict crawl to the same domain

        self.currentDepth = 1  # current crawl depth, counted from 1
        self.keyword = args.keyword  # keyword (console default encoding)

        self.threadPool = ThreadPool(args.concurrency)  # worker pool

        self.visitedHrefs = set()  # links already visited
        self.unvisitedHrefs = deque()  # links still to visit
        self.unvisitedHrefs.append(args.url)  # seed with the start url
        self.isCrawling = False  # whether the crawler is running

        # Cache file derived from the start url.
        self.file = BASEDIR + '/cache/crawler/' + genFilename(
            self.url) + '.txt'

        # NOTE(review): the original comment flagged this line as
        # problematic; confirm args always provides dbFile.
        self.database = Database(args.dbFile)  # database handle

        self.lock = Lock()
Exemplo n.º 3
0
    def saveProxies(self):
        """Re-check every stored proxy ip and update the database.

        Working proxies are marked available (1), locked ones marked
        unavailable (0), dead ones deleted.  Prints a summary when done.
        """
        # Create and start 30 checker threads.
        threadPool = ThreadPool(30)
        threadPool.startThreads()

        # Read the stored proxy ips.
        #databases = database.DatabaseProxyIp()

        proxyip = self.proxyip_db.readData()
        # Queue a check task for each proxy (first column is the address).
        for proxy in proxyip:
            threadPool.putTask(self.checkclientUrl, proxy[0])
            #threadPool.putTask(self.checkProxy, proxy[0])
            #flag,proxy = checkProxy(proxy[0])
        # Drain the results: update the database on success, mark locked
        # proxies unavailable, delete failures.
        ip_fail = 0
        ip_ok = 0
        ip_lock = 0
        while threadPool.getTaskLeft():
            flag, proxy = threadPool.getTaskResult()
            print flag, proxy
            if flag == 'ok':
                #print 'ok ', proxy
                self.proxyip_db.updateData(1, proxy)
                ip_ok = ip_ok + 1
            elif flag == 'lock':
                self.proxyip_db.updateData(0, proxy)
                ip_lock = ip_lock + 1
            else:
                self.proxyip_db.delData(proxy)
                ip_fail = ip_fail + 1

        print '====> available ip: ', ip_ok, ' , lock ip: ', ip_lock, ' , fail ip: ', ip_fail, ' <===='
        threadPool.stopThreads()
Exemplo n.º 4
0
 def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
     """Create the HSM-side mailbox monitor for host *hostID*.

     inbox/outbox    -- paths of the shared mailbox files.
     hostID          -- this host's id; selects its block slot in the
                        mailbox files.
     queue           -- queue object saved for incoming events.
     monitorInterval -- seconds between mailbox polls.

     Starts a background monitor thread before returning.
     """
     # Save arguments
     # Half the configured pool size; the other half is presumably used
     # by another component -- TODO confirm.
     tpSize = config.getint('irs', 'thread_pool_size') / 2
     waitTimeout = 3
     maxTasks = config.getint('irs', 'max_tasks')
     self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
     self._stop = False
     self._flush = False
     self._queue = queue
     self._activeMessages = {}
     self._monitorInterval = monitorInterval
     self._hostID = int(hostID)
     # Free/used message slots within this host's mailbox section.
     self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
     self._outgoingMail = EMPTYMAILBOX
     self._incomingMail = EMPTYMAILBOX
     # TODO: add support for multiple paths (multiple mailboxes)
     self._spmStorageDir = config.get('irs', 'repository')
     # dd commands restricted to this host's blocks: direct I/O, offset
     # by hostID * BLOCKS_PER_MAILBOX (skip for reads, seek for writes).
     self._inCmd = [
         constants.EXT_DD, 'if=' + str(inbox), 'iflag=direct,fullblock',
         'bs=' + str(BLOCK_SIZE), 'count=' + str(BLOCKS_PER_MAILBOX),
         'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
     ]
     self._outCmd = [
         constants.EXT_DD, 'of=' + str(outbox), 'iflag=fullblock',
         'oflag=direct', 'conv=notrunc', 'bs=' + str(BLOCK_SIZE),
         'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
     ]
     self._init = False
     self._initMailbox()  # Read initial mailbox state
     self._msgCounter = 0
     self._sendMail()  # Clear outgoing mailbox
     self._thread = concurrent.thread(self.run,
                                      name="mailbox/hsm",
                                      logger=self.log.name)
     self._thread.start()
Exemplo n.º 5
0
 def __init__(self, ip, port, initial_nodes, period):
     """Set up the discovery service with its peer list, announce period
     and a two-worker thread pool."""
     super(DiscoveryService, self).__init__(
         name='Discovery', ip=ip, port=port)
     self.nodes = initial_nodes
     self.period = period
     self.threadpool = ThreadPool(2)
Exemplo n.º 6
0
 def __init__(self, url, depth, threadNum, dbfile, key):
     """Crawler state: work queues, thread pool, storage and depth limits."""
     # Queues: urls waiting to be fetched, and fetched html awaiting parsing.
     self.urlQueue = Queue()
     self.htmlQueue = Queue()
     # Visited urls and links not yet visited.
     self.readUrls = []
     self.links = []
     # Worker-thread count and the pool built from it.
     self.threadNum = threadNum
     self.threadPool = ThreadPool(self.threadNum)
     # Database file name and the storage object wrapping it.
     self.dbfile = dbfile
     self.dataBase = SaveDataBase(self.dbfile)
     # Seed the url queue with the start url.
     self.urlQueue.put(url)
     # Keyword, decoded with the console's default encoding.
     self.key = key.decode(getdefaultlocale()[1])
     # Maximum depth, current depth (counted from 1) and running flag.
     self.depth = depth
     self.currentDepth = 1
     self.state = False
Exemplo n.º 7
0
 def __init__(self, url, depth, threadNum, dbfile, key):
     """Initialise crawler state: queues, thread pool, database and limits.

     (Original comments were mojibake-encoded Chinese; rewritten in
     English.)
     """
     # Queue of urls waiting to be fetched.
     self.urlQueue = Queue()
     # Queue of fetched html waiting to be parsed.
     self.htmlQueue = Queue()
     # Urls already visited.
     self.readUrls = []
     # Links not yet visited.
     self.links = []
     # Number of worker threads.
     self.threadNum = threadNum
     # Database file name.
     self.dbfile = dbfile
     # Storage object for the database.
     self.dataBase = SaveDataBase(self.dbfile)
     # Thread pool of the requested size.
     self.threadPool = ThreadPool(self.threadNum)
     # Seed the url queue with the start url.
     self.urlQueue.put(url)
     # Keyword, decoded with the console's default encoding.
     self.key = key.decode(getdefaultlocale()[1])
     # Maximum crawl depth.
     self.depth = depth
     # Current crawl depth, counted from 1.
     self.currentDepth = 1
     # Whether the crawler is currently running.
     self.state = False
Exemplo n.º 8
0
def main():
    try:
        f = open(r'ip.txt', 'rb')
        ip = ''
        for line in f.readlines():
            final_ip = line.strip('\n')
            for i in get_ip_list(final_ip):
                print i
                ip += str(i).strip() + '\n'
        with open(r'scan_ip.txt', 'w') as ff:
            ff.write(ip)
        data = []
        items = portscan()  # 进行masscan跑端口
        dataList = {}

        for i in items:
            i = i.split('|')
            if i[1] not in dataList:
                dataList[str(i[1])] = []
            dataList[str(i[1])].append(i[0])
        for i in dataList:
            if len(dataList[i]) >= 50:
                for port in dataList[i]:
                    items.remove(str(port) + '|' + str(i))  # 删除超过50个端口的
        pool = ThreadPool(20, 1000)
        pool.start(
            NmapScan,
            items,
            data,
        )
    except Exception as e:
        print e
        pass
Exemplo n.º 9
0
def Principal():
    thread = ThreadPool(10)
    while True:
        cnx, end = s.accept()
        print "o seguinte endereço se conectou: " + end[0]
        clientes.append(cnx)
        thread.insert_job(novo_cliente, cnx, end)
Exemplo n.º 10
0
 def __init__(self,
              tpSize=config.getint('irs', 'thread_pool_size'),
              waitTimeout=3,
              maxTasks=config.getint('irs', 'max_tasks')):
     """Create the task manager and its worker thread pool.

     tpSize      -- number of pool threads; read with getint for
                    consistency with the other mailbox constructors
                    (getfloat yielded a float thread count).
     waitTimeout -- seconds a worker waits for a task before re-checking.
     maxTasks    -- maximum number of queued tasks allowed in the pool.

     NOTE: the config-derived defaults are evaluated once, at definition
     time.
     """
     self.storage_repository = config.get('irs', 'repository')
     self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
     # Known tasks by id, plus tasks created before being queued.
     self._tasks = {}
     self._unqueuedTasks = []
Exemplo n.º 11
0
 def testThreadPool(self):
     """Every worker submitted to the pool must have finished once the
     pool's context exits."""
     workers = []
     with ThreadPool(10) as pool:
         for i in range(200):
             worker = MockWorker(None, None, None, None, f"Thread {i}")
             workers.append(worker)
             pool.addWorker(worker)
     for worker in workers:
         self.assertFalse(worker.is_alive())
Exemplo n.º 12
0
 def __init__(self, args):
     """Build crawler state from the parsed command-line *args*."""
     # Depth limits and overall page budget.
     self.depth = args.depth
     self.maxWebPages = args.maxWebPages
     self.currentDepth = 1
     # Persistence and worker pool.
     self.database = database(args.dbFile)
     self.threadPool = ThreadPool(args.threadNum)
     # Url bookkeeping, seeded with the start url.
     self.visitUrls = set()
     self.unvisitedUrls = deque()
     self.unvisitedUrls.append(args.url)
     self.isCrawling = False
Exemplo n.º 13
0
 def __init__(self, ip, port, nodes, timeout, directory,
              file_transfer_service):
     """Set up the file-check service on *ip*:*port* with a two-worker
     pool and a bounded client-socket timeout."""
     super(CheckFileService, self).__init__(
         name='CheckFile', ip=ip, port=port)
     self.nodes = nodes
     self.directory = directory
     self.file_transfer_service = file_transfer_service
     self.threadpool = ThreadPool(2)
     # Keep blocking socket operations from hanging forever.
     self.client_socket.settimeout(timeout)
Exemplo n.º 14
0
 def __init__(self, args):
     """Initialise the crawler from parsed arguments."""
     # Depth bookkeeping: limit from args, current level counts from 1.
     self.depth = args.depth
     self.currentDepth = 1
     # Keyword decoded with the console's default encoding.
     self.keyword = args.keyword.decode(getdefaultlocale()[1])
     # Storage and worker pool.
     self.database = Database(args.dbFile)
     self.threadPool = ThreadPool(args.threadNum)
     # Link sets, seeded with the start url.
     self.visitedHrefs = set()
     self.unvisitedHrefs = deque()
     self.unvisitedHrefs.append(args.url)
     self.isCrawling = False
Exemplo n.º 15
0
def clientThreadMain():
    ''' Cria-se 20 threads pre-alocadas'''
    thread = ThreadPool(20)
    ''' Laco principal do servidor '''
    while True:

        conexao, endereco = server.accept()

        print endereco[0] + " conectou!"
        ''' Quando um cliente se conecta, eh adicionado a uma lista de clientes (usado para o broadcast) '''
        clientes.append(conexao)
        thread.insert_job(newClient, conexao, endereco)
Exemplo n.º 16
0
 def __init__(self, ip, port, node_list, directory, timeout):
     """Wire up the check-file and file-transfer sub-services.

     NOTE(review): the *port* parameter is unused; the sub-services bind
     to the hard-coded ports 3001 and 3002.
     """
     self.directory = directory
     self.threadpool = ThreadPool(2)
     # Check-file service listens on 3001 and knows the peer nodes.
     self.check_file_service = CheckFileService(
         ip=ip, port=3001, nodes=node_list,
         timeout=timeout, directory=directory)
     # Transfer service serves files from *directory* on 3002.
     self.file_transfer_service = FileTransferService(
         ip=ip, port=3002, directory=directory)
Exemplo n.º 17
0
def main():
    """Start the discovery and file services on a three-thread pool and
    wait for them to finish."""
    node_list = ["0.0.0.0", "localhost"]
    discovery_service = DiscoveryService(
        ip="127.0.0.1", port=3000, initial_nodes=node_list, period=5)
    file_service = FileService(
        ip="127.0.0.1", port=3001, node_list=node_list,
        directory='files/', timeout=5)

    main_pool = ThreadPool(3)
    main_pool.add_task(discovery_service.start_service)
    main_pool.add_task(file_service.start_service)
    main_pool.wait_completion()
Exemplo n.º 18
0
 def start(self):
     """Walk the url provider and hand each non-excluded url to a pool
     worker; fetch errors are logged and the crawl continues."""
     with ThreadPool(self.max_jobs) as pool:
         for url_to_visit in self.urls_provider:
             # Guard clause: skip urls ruled out by the exclusion list.
             if self.exclusions.isExcluded(url_to_visit):
                 continue
             logging.info(f"visiting url {url_to_visit.value}...")
             try:
                 # Respect the configured working hours before fetching.
                 self._waitUntilWorkingHour()
                 worker = Worker(self.user_agent,
                                 self.sentenceProcessor,
                                 self.urlProcessor,
                                 self.webSiteInfoProvider,
                                 self.MINIMUM_WORDS_PER_SENTENCE,
                                 url_to_visit.value)
                 pool.addWorker(worker)
             except Exception as ex:
                 logging.error(f"Error fetching url {url_to_visit.value}")
                 logging.error(ex)
Exemplo n.º 19
0
 def __init__(self, args):
     """Build the crawler from parsed command-line arguments."""
     # Maximum crawl depth; current depth counts from 1.
     self.depth = args.depth
     self.currentDepth = 1
     # Database handle and worker pool.
     self.database = Database(args.dbFile)
     self.threadPool = ThreadPool(args.threadNum)
     # Visited set plus the queue of pages still to fetch, seeded with
     # the start url.
     self.visitedHrefs = set()
     self.unvisitedHrefs = deque()
     self.url = args.url
     self.unvisitedHrefs.append(args.url)
     # Flag set once crawling actually starts.
     self.isCrawling = False
Exemplo n.º 20
0
 def __init__(self, args):
     """Initialise the crawler from parsed arguments *args*."""
     # Maximum page depth to crawl.
     self.depth = args.depth
     # Initial crawl depth, counted from 1.
     self.currentDepth = 1
     # Search keyword, decoded with the console's default encoding.
     self.keyword = args.keyword.decode(getdefaultlocale()[1])
     # Database handle.
     self.database = Database()
     # Thread pool of the requested size.
     self.threadPool = ThreadPool(args.threadNum)
     # Links already visited.
     self.visitedHrefs = set()
     # Links waiting to be visited.
     self.unvisitedHrefs = deque()
     # Seed with the first url to visit.
     self.unvisitedHrefs.append(args.url)
     # Whether the crawler has started working.
     self.isCrawling = False
Exemplo n.º 21
0
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()

    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')

    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
Exemplo n.º 22
0
    def __init__(self, dbName, threadNum, logLevel, startUrls, depth, keyword, downloadMode):
        """Initialise the downloader: queues, thread pool and settings.

        dbName       -- database file name.
        threadNum    -- number of download worker threads.
        logLevel     -- logging verbosity.
        startUrls    -- initial urls to crawl.
        depth        -- maximum crawl depth.
        keyword      -- search keyword.
        downloadMode -- download-dispatch mode.
        """
        self.__threadNum = threadNum
        self.__startUrls = startUrls
        self.__depth = depth
        self.__keyword = keyword
        self.__downloadMode = downloadMode
        self.__dbName = dbName
        self.__logLevel = logLevel

        self.__exitEvent = threading.Event()
        # Url queue holds nodes waiting to be downloaded.
        self.__urlQueue = Queue.Queue()
        # Html queue holds downloaded nodes waiting to be parsed.
        self.__htmlQueue = Queue.Queue()
        # Data queue holds parsed nodes ready for the database.
        self.__dataQueue = Queue.Queue()
        # Per-download-module work queues.
        self.__downloadQueueList = []
        # Thread pool: download workers plus two extra threads --
        # presumably parser and storer; TODO confirm.
        self.__threadPool = ThreadPool(threadNum + 2)
        self.__downloadingFlag = 0
Exemplo n.º 23
0
def main(server, fileManager):
    """Run a two-thread worker pool serving /var/www/html via *server*."""
    worker_args = (server, handler, '/var/www/html', fileManager)
    pool = ThreadPool(thread_number=2, target=worker, args=worker_args)
    pool.start()
Exemplo n.º 24
0
 def __init__(self, ip, port, directory):
     """File-transfer service bound to *ip*:*port*, serving *directory*
     with a two-worker thread pool."""
     super(FileTransferService, self).__init__(
         name='FileTransfer', ip=ip, port=port)
     self.threadpool = ThreadPool(2)
     self.directory = directory