Пример #1
0
def test():
    """Queue a single ``test_job`` on a ``ThreadPool(10)`` and wait on the pool.

    Prints a start marker, adds one job (index 0, second argument
    ``index * 0.001``), calls ``wait_for_complete()`` and prints an end marker.
    """
    print('start testing')
    pool = ThreadPool(10)
    job_indexes = range(1)  # exactly one job, index 0
    for index in job_indexes:
        pool.add_job(test_job, index, index * 0.001)
    pool.wait_for_complete()
    print('end testing')
Пример #2
0
def test():
    """Run one ``test_job`` through a freshly created ``ThreadPool(10)``.

    The job receives its index and the index scaled by 0.001; the function
    prints a marker before queueing and after ``wait_for_complete()`` returns.
    """
    print('start testing')
    workers = ThreadPool(10)
    for n in range(1):
        scaled = n*0.001
        workers.add_job(test_job, n, scaled)
    workers.wait_for_complete()
    print('end testing')
Пример #3
0
class Interface(object):
    """Reads queue / thread-pool settings, opens the pipe file and owns a ThreadPool.

    Attribute access is lenient: reading an attribute that does not exist
    yields ``None`` instead of raising ``AttributeError`` (see
    ``__getattribute__``).
    """

    def __init__(self):
        self._read_config()
        self._init_threadpool()

    def _read_config(self):
        """Load the QUEUE / THREADPOOL settings and open the pipe file."""
        self.pipe_file = Base.get_config("QUEUE", "PIPE_FILE")
        self.queue_size = Base.get_config("QUEUE", "QUEUE_SIZE")
        self.thread_pool_num = Base.get_config("THREADPOOL", "NUM")
        # Opened read/write, non-blocking, created if missing; closed in __del__.
        self.pipe_fd = os.open(self.pipe_file, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)

    def _init_threadpool(self):
        """Create the pool from the configured thread count and queue size."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)

    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep ten seconds, print ``ok``."""
        print(filepath)
        time.sleep(10)
        print("ok")

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; any other exception
        (including ``KeyboardInterrupt`` / ``SystemExit``) now propagates
        instead of being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None

    def __del__(self):
        """Close the pipe descriptor if one was opened and not closed yet."""
        fd = self.pipe_fd
        if fd is None:
            # _read_config never got as far as os.open (or we already closed).
            return
        # Forget the descriptor first so a second call cannot close it twice.
        self.pipe_fd = None
        os.close(fd)
Пример #4
0
class MessageBus(object):
    """Message bus.

    Sends messages and bridges the bot with its commands:

    * distributes received messages to the group members;
    * hands message commands to the matching command handler;
    * lets command handlers send back or broadcast command results.
    """

    def __init__(self, bot_jid, stream):
        self.bot_jid = bot_jid
        self._stream = stream
        self.cmd_handler = CommandHandler(message_bus=self)
        self.admin_cmd_handler = AdminCMDHandler(message_bus=self)
        self._thread_pool = ThreadPool(5)
        self._thread_pool.start()         # start the thread pool
        self.logger = get_logger()

    def make_message(self, to, typ, body):
        """Build a message sent from the bot.

        `to`   - recipient JID
        `typ`  - message type; anything other than 'normal', 'chat',
                 'groupchat' or 'headline' is replaced by 'normal'
        `body` - message body
        """
        if typ not in ['normal', 'chat', 'groupchat', 'headline']:
            typ = 'normal'
        return Message(from_jid=self.bot_jid, to_jid=to, stanza_type=typ,
                       body=body)

    def send_to_admin(self, stanza, body):
        """Send a message to every administrator (recorded in the history)."""
        for admin in ADMINS:
            self.send_message(stanza, admin, body, True)

    def send_private_msg(self, stanza, to, body):
        """Send a private message, prefixed with the sender's nick."""
        frm = stanza.from_jid
        nick = get_nick(frm)
        body = "[%s 悄悄对你说] %s" % (nick, body)
        self.send_message(stanza, to, body, True)

    def send_message(self, stanza, to, body, log = False):
        """Send a message.

        `stanza` - the message stanza being answered / forwarded
        `to`     - recipient; when the recipient is not online the message is
                   appended to the stored offline messages instead
        `body`   - message body
        `log`    - record the message in the history
        """
        if log:
            add_history(stanza.from_jid, to, body)
        if is_online(to):
            mode = get_info('mode', to)
            # Online recipients only receive the message in 'talk' mode
            # (or when no mode is stored at all).
            if mode == 'talk' or not mode:
                if isinstance(to, (str, unicode)):
                    to = JID(to)
                self.logger.debug("send '{0}' to {1!r}".format(body, to))
                typ = stanza.stanza_type
                self._stream.send(self.make_message(to, typ, body))
        else:
            body = NOW() + ' ' + body
            self.logger.debug("store offline message'{0}' for {1!r}"
                                    .format(body, to))
            offline_message = get_info('offline_message', to, '')
            offline_message += '\n' +  body
            add_info('offline_message', offline_message, to)

    def send_offline_message(self, stanza):
        """Deliver the stored offline messages to the stanza's sender."""
        show = stanza.show
        frm = stanza.from_jid
        offline_message = get_info('offline_message', frm)
        if offline_message:
            offline_message = "离线期间的消息:\n" + offline_message
            m = self.make_message(frm, 'normal', offline_message)
            self._stream.send(m)
            # NOTE(review): set_online is only reached when offline messages
            # exist -- confirm that is intended.
            set_online(frm, show)
            add_info('offline_message', '', frm)

    def send_all_msg(self, stanza, body):
        """Send a message to all members except the sender.

        A few plain-text bodies are shortcuts that are turned into commands
        instead of being broadcast.
        """
        text = body.strip()
        if cityid(text):
            return self.send_command(stanza, '-_tq ' + text)
        if text == 'help':
            return self.send_command(stanza, '-help')
        if text == 'ping':
            return self.send_command(stanza, '-_ping')
        mode = get_info('mode', stanza.from_jid)
        if mode == 'quiet':
            body = u'你处于{0},请使用-cd命令切换到 {1} '\
                    u'后发言'.format(MODES[mode], MODES['talk'])
            return self.send_back_msg(stanza, body)

        add_history(stanza.from_jid, 'all', body)
        members = get_members(stanza.from_jid)
        # Only members whose channel equals the sender's channel get the message.
        current = get_info('channel', stanza.from_jid, 'main')
        members = [m for m in members
                   if get_info('channel', m, 'main') == current]
        self.logger.info("{0} send message {1} to {2!r}"
                            .format(stanza.from_jid, body, members))
        nick = get_nick(stanza.from_jid)
        body = "[{0}] {1}".format(nick, body)
        for member in members:
            self.send_message(stanza, member, body)

    def send_back_msg(self, stanza, body):
        """Send a reply to the bare JID of the stanza's sender."""
        to = stanza.from_jid.bare().as_string()
        typ = stanza.stanza_type
        self._stream.send(self.make_message(to, typ, body))

    def send_sys_msg(self, stanza, body):
        """Send a system message to every member."""
        for member in get_members():
            self.send_message(stanza, member, body)

    def send_command(self, stanza,  body):
        """Handle a command.

        The command is run on the thread pool so it does not block the caller;
        senders whose e-mail is in ADMINS use the admin command handler.
        """
        email = get_email(stanza.from_jid)
        self.logger.info("{0} run command {1}".format(stanza.from_jid, body))
        if email in ADMINS:
            target = self.admin_cmd_handler._run_cmd
        else:
            target = self.cmd_handler._run_cmd
        self._thread_pool.add_job(target, stanza, body)

    def send_status(self, statustext, to = None):
        """Send a presence carrying `statustext`, addressed to `to` when given."""
        if to:
            to_jid = JID(to)
            p = Presence(status=statustext, to_jid=to_jid)
        else:
            p = Presence(status=statustext)
        self._stream.send(p)

    def send_subscribe(self, jid):
        """Send a 'subscribed' presence followed by a 'subscribe' presence."""
        p1 = Presence(from_jid=self.bot_jid, to_jid=jid,
                      stanza_type='subscribe')
        p = Presence(from_jid=self.bot_jid, to_jid=jid,
                     stanza_type='subscribed')
        self._stream.send(p)
        self._stream.send(p1)

    def send_unsubscribe(self, jid):
        """Send an 'unsubscribed' presence followed by an 'unsubscribe' presence.

        Uses ``self.bot_jid`` as the sender: the attribute ``my_jid`` that was
        referenced here before is never assigned on this class.
        """
        p1 = Presence(from_jid=self.bot_jid, to_jid=jid,
                      stanza_type='unsubscribe')
        p = Presence(from_jid=self.bot_jid, to_jid=jid,
                     stanza_type='unsubscribed')
        self._stream.send(p)
        self._stream.send(p1)
Пример #5
0
class Interface(object):
    """Job front end: owns a ThreadPool and can re-host a question's images.

    Attribute access is lenient: reading an attribute that does not exist
    yields ``None`` instead of raising ``AttributeError`` (see
    ``__getattribute__``).
    """

    # URLs matching this pattern are treated as already hosted and are skipped.
    img_url_exp = re.compile(r'http://qdimg.okjiaoyu.cn/[\S\s]*')
    # Filled with (bucket name, file name) to build the public URL.
    qiniu_prefix = 'http://%s.okjiaoyu.cn/%s'

    def __init__(self):
        self._read_config()
        self._init_threadpool()
        self._get_pid()

    def _read_config(self):
        """Read the queue size and thread count from the Configer."""
        configer = Configer()
        self.queue_size = configer.get_configer('QUEUE', 'queue_size')
        self.thread_pool_num = configer.get_configer('THREADPOOL', 'num')

    def _init_threadpool(self):
        """Create the pool from the configured thread count and queue size."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def _get_pid(self):
        """Remember the current process id (used by ``kill``)."""
        self.pid = os.getpid()

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    @staticmethod
    def _is_image_node(node):
        """Return True when ``node`` is a dict whose 'type' is 'image'."""
        return isinstance(node, dict) and node.get('type') == 'image'

    @staticmethod
    def _image_nodes(item):
        """Yield the image dicts found in one content item.

        A dict item contributes the image entries of its 'group' list (if
        any) and then itself; a list item contributes its image entries.
        Anything else (e.g. plain strings) contributes nothing.  Non-dict
        children are skipped in both cases instead of raising.
        """
        if isinstance(item, dict):
            for child in item.get('group', ()):
                if Interface._is_image_node(child):
                    yield child
            if Interface._is_image_node(item):
                yield item
        elif isinstance(item, list):
            for child in item:
                if Interface._is_image_node(child):
                    yield child

    def local_img(self, string):
        """Re-host the images of one question and store the rewritten content.

        ``string`` is the question id (converted with ``int``).  Every image
        node of the JSON content is passed to ``_upload_qiniu``; when that
        returns a new URL the node is updated in place and the old URL is
        replaced in the HTML text.  Both collections are written back only
        if at least one URL changed.
        """
        from gl import LOG
        LOG.info('start local img,question id [%s]' % string)
        question_id = int(string)
        mongo = Mongo()
        mongo.connect('resource')
        mongo.select_collection('mongo_question_json')
        json_doc = mongo.find_one({'question_id': question_id}, {'content': 1})
        mongo.select_collection('mongo_question_html')
        # NOTE(review): str() is applied to whatever find_one returns, not to
        # its 'content' field -- confirm this is the intended HTML text.
        html = str(mongo.find_one({'question_id': question_id},
                                  {'content': 1}))
        if not json_doc:
            return

        content = json_doc['content']
        update_flag = False
        for items in content.values():
            for item in items:
                for node in self._image_nodes(item):
                    ori_url = node['value']
                    qiniu_url = self._upload_qiniu(ori_url)
                    if qiniu_url:
                        node['value'] = qiniu_url
                        html = html.replace(ori_url, qiniu_url)
                        update_flag = True

        if not update_flag:
            return
        mongo.select_collection('mongo_question_json')
        json_effected = mongo.update_many({'question_id': question_id},
                                          {'$set': {'content': content}})
        mongo.select_collection('mongo_question_html')
        html_effected = mongo.update_many({'question_id': question_id},
                                          {'$set': {'content': html}})
        LOG.info('mongo update successful json[%d] -- html[%d]' %
                 (json_effected, html_effected))

    def _upload_qiniu(self, ori_url):
        """Download ``ori_url`` and upload it to the 'qdimg' bucket.

        Returns the new URL on success, or ``None`` when the URL already
        matches ``img_url_exp`` or the upload reports an error.  A falsy
        result from ``upload_data`` is treated as success.
        """
        from gl import LOG
        LOG.info('Original Image Url [%s]' % ori_url)
        if self.img_url_exp.match(ori_url):
            return None

        # File name: md5 of the URL plus everything from the last '.' onwards.
        suffix = ori_url[ori_url.rfind('.'):]
        qiniu_file_name = md5(ori_url).hexdigest() + suffix

        LOG.info('Open Refer Imgage[%s]' % ori_url)

        response = urllib2.urlopen(urllib2.Request(ori_url))
        try:
            img_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        res = QiniuWrap().upload_data('qdimg', qiniu_file_name, img_data)
        if res:
            LOG.error('upload qiniu error [%s]' % res)
            return None

        qiniu_url = self.qiniu_prefix % ('qdimg', qiniu_file_name)
        LOG.info('[%s] local [%s] successful' % (ori_url, qiniu_url))
        return qiniu_url

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)
        # Original note: notify idc_api that transcoding has finished
        # (nothing here does that).

    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep 100 seconds, print ``ok``."""
        print(filepath)
        time.sleep(100)
        print('ok')

    def kill(self):
        """Send SIGKILL to the process id recorded by ``_get_pid``."""
        os.kill(self.pid, signal.SIGKILL)

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; any other exception
        now propagates instead of being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None
Пример #6
0
class Crawler():
    """Breadth-first crawler that fetches one depth level at a time on a thread pool."""

    def __init__(self, myconfig):
        self.myconfig = myconfig
        self.status = ""
        self.cur_depth = 0
        # Worker pool sized from the configuration.
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # URLs already visited; a set is not thread-safe, hence the lock.
        self.visited_urls = set()
        self.visited_urls_lock = threading.Lock()
        # URLs of the current depth that still have to be visited.
        self.will_visited_urls = deque([myconfig.url])
        # URLs discovered during the current depth (visited at the next one).
        self.temp_q = deque()
        MyLogger(myconfig.logfile, myconfig.loglevel)
        self.db = Db()

    def start(self):
        """Crawl level by level until the configured depth is reached or status is "stop"."""
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                # Queueing is cheap: this runs on the calling thread and only
                # adds jobs, the real work happens in the pool's threads.
                while self.will_visited_urls:
                    next_url = self.will_visited_urls.popleft()
                    self.thread_pool.add_job(self.handler, next_url)
                # TODO (from the original author): the workers are only
                # signalled after every URL of this level has been queued,
                # which is coarse; signalling as soon as the number of queued
                # URLs reaches the initial thread count would save time.
                #
                # Tell the pool's threads to start this round of fetching.
                self.thread_pool.event_do_job()
                # Step aside so the worker threads get time to run.
                time.sleep(3)
            except Empty:
                # Nothing left to visit.
                logging.info("no url right now")
            finally:
                # The level is only finished once every worker is idle again,
                # i.e. the number of waiting threads equals the configured
                # thread count; poll every ten seconds until then.
                while (self.thread_pool.get_thread_waiting_num()
                       != self.myconfig.threadnum):
                    time.sleep(10)
                # This depth is done.
                self.cur_depth += 1
                logging.info("crawler depth now is %s" % str(self.cur_depth))
                if self.cur_depth > self.myconfig.depth:
                    break
                # Links found at this level become the next level's work list.
                self.will_visited_urls, self.temp_q = self.temp_q, deque()

        # Every depth has been crawled, or the crawler was stopped.
        self.thread_pool.stop_threads()
        logging.info("crawler exit")
        return

    def handler(self, url):
        """Fetch ``url``, store it when it contains the key, and queue its new links."""
        page = self.get_html_content(url)
        if page in ("", None):
            # No content could be fetched: nothing to do.
            return
        # Remember that this URL has been visited.
        self.add_url_to_visited(url)
        if page.find(self.myconfig.key) >= 0:
            self.db.save_data(url, self.myconfig.key, page)
        try:
            links = self.get_hrefs(page, url)
        except StandardError as se:
            # The links could not be extracted.
            logging.error("error: %s" % (se))
            print(se)
            return
        if not links:
            return
        # Park the links in temp_q; they are visited after this depth is done.
        for candidate in links:
            # Final check: skip anything visited or already queued.
            if self.is_url_visited(candidate) \
                    or candidate in self.will_visited_urls \
                    or candidate in self.temp_q:
                continue
            self.temp_q.append(candidate)
Пример #7
0
class Interface(object):
    """Job front end: owns a ThreadPool and can re-host a question's images.

    Reading an attribute that does not exist yields ``None`` instead of
    raising ``AttributeError`` (see ``__getattribute__``).
    """

    # URLs matching this pattern are treated as already hosted and are skipped.
    img_url_exp = re.compile(r'http://qdimg.okjiaoyu.cn/[\S\s]*')
    # Filled with (bucket name, file name) to build the public URL.
    qiniu_prefix = 'http://%s.okjiaoyu.cn/%s'

    def __init__(self):
        self._read_config()
        self._init_threadpool()
        self._get_pid()

    def _read_config(self):
        """Read the queue size and thread count from the Configer."""
        configer = Configer()
        self.queue_size = configer.get_configer('QUEUE', 'queue_size')
        self.thread_pool_num = configer.get_configer('THREADPOOL', 'num')

    def _init_threadpool(self):
        """Create the pool from the configured thread count and queue size."""
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def _get_pid(self):
        """Remember the current process id (used by ``kill``)."""
        self.pid = os.getpid()

    def write(self, string):
        """Print ``string`` to standard output."""
        print(string)

    @staticmethod
    def _is_image_node(node):
        """Return True when ``node`` is a dict whose 'type' is 'image'."""
        return isinstance(node, dict) and node.get('type') == 'image'

    @staticmethod
    def _image_nodes(item):
        """Yield the image dicts found in one content item.

        A dict item contributes the image entries of its 'group' list (if
        any) and then itself; a list item contributes its image entries.
        Anything else (e.g. plain strings) contributes nothing.  Non-dict
        children are skipped in both cases instead of raising.
        """
        if isinstance(item, dict):
            for child in item.get('group', ()):
                if Interface._is_image_node(child):
                    yield child
            if Interface._is_image_node(item):
                yield item
        elif isinstance(item, list):
            for child in item:
                if Interface._is_image_node(child):
                    yield child

    def local_img(self, string):
        """Re-host the images of one question and store the rewritten content.

        ``string`` is the question id (converted with ``int``).  Every image
        node of the JSON content is passed to ``_upload_qiniu``; when that
        returns a new URL the node is updated in place and the old URL is
        replaced in the HTML text.  Both collections are written back only
        if at least one URL changed.
        """
        from gl import LOG
        LOG.info('start local img,question id [%s]' % string)
        question_id = int(string)
        mongo = Mongo()
        mongo.connect('resource')
        mongo.select_collection('mongo_question_json')
        json_doc = mongo.find_one({'question_id': question_id}, {'content': 1})
        mongo.select_collection('mongo_question_html')
        # NOTE(review): str() is applied to whatever find_one returns, not to
        # its 'content' field -- confirm this is the intended HTML text.
        html = str(mongo.find_one({'question_id': question_id}, {'content': 1}))
        if not json_doc:
            return

        content = json_doc['content']
        update_flag = False
        for items in content.values():
            for item in items:
                for node in self._image_nodes(item):
                    ori_url = node['value']
                    qiniu_url = self._upload_qiniu(ori_url)
                    if qiniu_url:
                        node['value'] = qiniu_url
                        html = html.replace(ori_url, qiniu_url)
                        update_flag = True

        if not update_flag:
            return
        mongo.select_collection('mongo_question_json')
        json_effected = mongo.update_many({'question_id': question_id},
                                          {'$set': {'content': content}})
        mongo.select_collection('mongo_question_html')
        html_effected = mongo.update_many({'question_id': question_id},
                                          {'$set': {'content': html}})
        LOG.info('mongo update successful json[%d] -- html[%d]' % (json_effected, html_effected))

    def _upload_qiniu(self, ori_url):
        """Download ``ori_url`` and upload it to the 'qdimg' bucket.

        Returns the new URL on success, or ``None`` when the URL already
        matches ``img_url_exp`` or the upload reports an error.  A falsy
        result from ``upload_data`` is treated as success.
        """
        from gl import LOG
        LOG.info('Original Image Url [%s]' % ori_url)
        if self.img_url_exp.match(ori_url):
            return None

        # File name: md5 of the URL plus everything from the last '.' onwards.
        suffix = ori_url[ori_url.rfind('.'):]
        qiniu_file_name = md5(ori_url).hexdigest() + suffix

        LOG.info('Open Refer Imgage[%s]' % ori_url)

        response = urllib2.urlopen(urllib2.Request(ori_url))
        try:
            img_data = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        res = QiniuWrap().upload_data('qdimg', qiniu_file_name, img_data)
        if res:
            LOG.error('upload qiniu error [%s]' % res)
            return None

        qiniu_url = self.qiniu_prefix % ('qdimg', qiniu_file_name)
        LOG.info('[%s] local [%s] successful' % (ori_url, qiniu_url))
        return qiniu_url

    def transcode(self, string):
        """Queue ``_transcode(string)`` on the thread pool."""
        self.pool.add_job(self._transcode, string)
        # Original note: notify idc_api that transcoding has finished
        # (nothing here does that).

    def _transcode(self, filepath):
        """Placeholder job: print the path, sleep 100 seconds, print ``ok``."""
        print(filepath)
        time.sleep(100)
        print('ok')

    def kill(self):
        """Send SIGKILL to the process id recorded by ``_get_pid``."""
        os.kill(self.pid, signal.SIGKILL)

    def __getattribute__(self, name):
        """Return the attribute, or ``None`` when it does not exist.

        Only ``AttributeError`` is translated to ``None``; any other exception
        now propagates instead of being silently swallowed.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            return None
Пример #8
0
import os
import sys
from thread_pool import ThreadPool


def myTest(s1, s2):
    """Job body for the pool demo: print ``s1`` and ``s2`` on one line (Python 2 print statement)."""
    print s1, s2


# Demo driver: create ThreadPool(3), queue three myTest jobs with the string
# pairs ("0", "10"), ("1", "11"), ("2", "12"), then call begin_to_finish().
tp = ThreadPool(3)
for i in xrange(3):
    tp.add_job(myTest, "%d" % i, "%d" % (i + 10))
tp.begin_to_finish()
Пример #9
0
import os
import sys
from thread_pool import ThreadPool

def myTest(s1, s2):
    """Print the two arguments on a single line; used as the job queued on the pool below."""
    print s1,s2

# Queue myTest three times on a ThreadPool(3); each job gets the loop index
# and the index plus ten, both as strings.  begin_to_finish() is called last.
tp = ThreadPool(3)
for i in xrange(3):
    tp.add_job(myTest, "{0}".format(i), "{0}".format(i + 10))
tp.begin_to_finish()