示例#1
0
class InputThread(threading.Thread):
    """Daemon thread holding the beanstalk connections of an input worker.

    This class body defines construction and shutdown only.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember the tube names.

        Args:
            beanstalk_conf: mapping providing ``'host'``, ``'port'``,
                ``'input_tube'`` and ``'output_tube'``.
            log: logger; when falsy a ``LogHandler("i_input_thread")`` is
                created instead.
            process_pool: task pool; ``stop()`` queries its
                ``get_task_num()`` and ``thread_local_constructors``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.wlock = threading.Lock()

    def stop(self):
        """Stop accepting work, wait for queued tasks, then close the input connection.

        Polls the pool every 5 seconds until it reports no pending tasks.
        If a ``'processor'`` thread-local constructor is registered, its
        ``save_status()`` is called before the input connection is closed.
        Any exception is logged rather than propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while self.process_pool.get_task_num() != 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)

            # The pool is drained. Previously a pool without a registered
            # 'processor' made this method spin forever without sleeping;
            # now the status save is simply skipped in that case.
            constructors = self.process_pool.thread_local_constructors
            if 'processor' in constructors:
                processor = constructors['processor'][1][1]
                self.log.warning("prepare call scheduler_processor to stop scheduler")
                processor.save_status()

            self.beanstalk.__del__()  # close the connection; no more input is accepted
        except Exception as e:
            self.log.error("stop input_thread fail:%s" % e)
示例#2
0
    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Initialise the selector thread state and its outgoing beanstalk client.

        Args:
            log: logger stored as ``self.log``.
            selector_conf: selector settings, stored unchanged.
            beanstalk_conf: mapping providing ``'host'``, ``'port'`` and
                ``'output_tube'``.
            scheduler: optional scheduler object, stored unchanged.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log
        self.scheduler = scheduler

        # Download statistics.
        self.site_static = {}
        self.download_req_num = 0
        self.downloader_num = 0

        # Selector configuration.
        self.selector_conf = selector_conf

        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        host = beanstalk_conf['host']
        port = beanstalk_conf['port']
        self.out_beanstalk = PyBeanstalk(host, port)
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()
示例#3
0
文件: tool.py 项目: mylove1/crawler-2
def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured input tube.

    Args:
        beanstalk_conf: mapping providing ``'host'``, ``'port'`` and
            ``'input_tube'``.
        log: logger used for the success and failure messages.
        rsp: response object; its ``url`` attribute is used in log lines.

    A failing ``put`` is logged at error level together with the exception
    and is not re-raised.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        # %-formatting instead of '+': concatenation raised TypeError inside
        # the handler whenever url/tube was not a str, hiding the real error.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%s' %
                  (rsp.url, tube, e))
示例#4
0
 def __init__(self, beanstalk_conf, log):
     """Create the record queue and connect to beanstalk.

     Args:
         beanstalk_conf: mapping providing ``'host'`` and ``'port'``; it is
             also kept as ``self.beanstalk_conf``.
         log: logger stored as ``self._log``.
     """
     self._log = log
     self._queue = Queue()
     self.beanstalk_conf = beanstalk_conf
     host = beanstalk_conf['host']
     port = beanstalk_conf['port']
     self.beanstalk = PyBeanstalk(host, port)
     threading.Thread.__init__(self)
     self.daemon = True
     self.running = True
示例#5
0
def deliver_req(host='172.18.180.223', port=11300,
                tube='online_download_req', idle_sleep=6):
    """Forward queued requests to a beanstalk tube forever.

    Args:
        host: beanstalk host (defaults to the previously hard-coded address).
        port: beanstalk port.
        tube: name of the tube the serialized requests are put on.
        idle_sleep: seconds to wait when the in-memory queue is empty.

    Each item taken from ``index_queue`` is a ``(priority, reqs)`` pair;
    ``reqs`` is serialized with ``req_to_string`` and put on ``tube``.
    """
    out_beanstalk = PyBeanstalk(host, port)
    while True:
        try:
            _priority, reqs = index_queue.get_nowait()
        except Empty:
            # Back off while idle. The sleep used to sit after `continue`
            # and was unreachable, so an empty queue spun at full CPU.
            time.sleep(idle_sleep)
            continue
        req_str = req_to_string(reqs)
        out_beanstalk.put(tube, req_str)
示例#6
0
 def put_beanstalkd(self, tube_name, obj):
     """Serialize ``obj`` and put it on ``tube_name``.

     Args:
         tube_name: name of the destination tube.
         obj: object handed to ``self.to_string`` for serialization.

     Nothing is sent when serialization yields ``None``. On a socket error
     the connection is rebuilt (the current message is not retried); any
     other failure is logged with its traceback. No exception raised by
     ``put`` is propagated.
     """
     str_page_info = self.to_string(obj)
     if str_page_info is None:
         # presumably to_string signals a failed serialization with None;
         # do not hand that to the queue client.
         self._log.error(
             'beanstalk put tube{} skipped, nothing to send'.format(tube_name))
         return
     try:
         self.beanstalk.put(tube_name, str_page_info)
         self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
     except SocketError as e:
         # Format the exception itself: `e.message` is deprecated since
         # Python 2.6 and does not exist on Python 3.
         self._log.error('beanstalk connect failed, {}'.format(e))
         self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                      self.beanstalk_conf['port'])
     except Exception:
         self._log.error('beanstalk put tube{} error {}'.format(
             tube_name, str(traceback.format_exc())))
    def __init__(self, log, conf):
        """Store the configuration and open the outgoing beanstalk connection.

        Args:
            log: logger; must not be ``None``.
            conf: dict providing ``'type_extractor_map'``,
                ``'smart_proxy_url'`` and a ``'beanstalk_conf'`` mapping with
                ``'host'``, ``'port'`` and ``'output_tube_scheduler'``.
        """
        self.log = log
        self.conf = conf

        assert log is not None
        assert isinstance(conf, dict)

        self.type_extractor_map = self.conf['type_extractor_map']
        self.smart_proxy_url = self.conf['smart_proxy_url']

        # Outgoing queue towards the scheduler.
        queue_conf = self.conf['beanstalk_conf']
        host = queue_conf['host']
        port = queue_conf['port']
        self.out_beanstalk = PyBeanstalk(host, port)
        self.output_tube_scheduler = queue_conf['output_tube_scheduler']
示例#8
0
class PutBeanstaldServer(threading.Thread):
    """Daemon thread that drains an in-memory queue into beanstalk tubes.

    Producers call ``save_record`` with a dict holding ``'tube_name'`` and
    ``'obj'``; the thread serializes ``obj`` and puts it on that tube.
    """

    def __init__(self, beanstalk_conf, log):
        """Create the record queue and connect to beanstalk.

        Args:
            beanstalk_conf: mapping providing ``'host'`` and ``'port'``; also
                consulted by ``get_tube_by_name``.
            log: logger used for all messages.
        """
        self._queue = Queue()
        self._log = log
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.beanstalk_conf = beanstalk_conf
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

    def to_string(self, page_info):
        """Return ``page_info`` serialized to a string, or ``None`` on failure.

        Any serialization error is logged and reported as ``None``. Only
        ``EOFError`` was handled before, so every other error escaped into
        ``run`` and terminated the thread.
        """
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryServerProtocol(tMemory_b)
            page_info.write(tBinaryProtocol_b)
            return tMemory_b.getvalue()
        except Exception:
            self._log.warning("cann't write data to string")
            return None

    def put_beanstalkd(self, tube_name, obj):
        """Serialize ``obj`` and put it on ``tube_name``.

        Nothing is sent when serialization failed. On a socket error the
        connection is rebuilt (the current message is not retried); any other
        failure is logged with its traceback.
        """
        str_page_info = self.to_string(obj)
        if str_page_info is None:
            # Serialization failed: do not hand None to the queue client.
            self._log.error(
                'beanstalk put tube{} skipped, nothing to send'.format(
                    tube_name))
            return
        try:
            self.beanstalk.put(tube_name, str_page_info)
            self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
        except SocketError as e:
            # Format the exception itself: `e.message` is deprecated since
            # Python 2.6 and does not exist on Python 3.
            self._log.error('beanstalk connect failed, {}'.format(e))
            self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                         self.beanstalk_conf['port'])
        except Exception:
            self._log.error('beanstalk put tube{} error {}'.format(
                tube_name, str(traceback.format_exc())))

    def run(self):
        """Process queued records until ``running`` is cleared.

        ``get()`` blocks, so a cleared flag takes effect only after the next
        record has been handled.
        """
        while self.running:
            record = self._queue.get()
            self._build_record_and_put(record)

    def get_tube_by_name(self, tube_name):
        """Return the configured value for ``tube_name``, or ``None`` if absent."""
        return self.beanstalk_conf.get(tube_name, None)

    def _build_record_and_put(self, data):
        """Put ``data['obj']`` on ``data['tube_name']``; skip records missing either."""
        tube_name = data.get('tube_name', None)
        if not tube_name:
            return
        obj = data.get('obj', None)
        if not obj:
            return
        self.put_beanstalkd(tube_name, obj)

    def save_record(self, data):
        """Queue ``data`` for the background thread."""
        self._queue.put(data)
示例#9
0
 def __init__(self, conf):
     """Store the configuration and connect to MongoDB and beanstalk.

     Args:
         conf: mapping providing ``'log'``, ``'beanstalk_conf'`` (with
             ``'host'`` and ``'port'``) and ``'webpage_db'`` (with
             ``'host'``, ``'port'``, ``'db'``, ``'username'`` and
             ``'password'``).

     A failure while connecting is logged, not raised; the affected client
     attribute then stays ``None``.
     """
     self.log = conf['log']
     self.conf = conf
     self.beanstalk_conf = conf['beanstalk_conf']
     # Define both attributes up front so a failed connection leaves them
     # as None instead of missing.
     self.mongo_client_web = None
     self.beanstalk_client = None
     try:
         webpage_db = self.conf['webpage_db']
         self.mongo_client_web = PyMongo(
             webpage_db['host'], webpage_db['port'], webpage_db['db'],
             webpage_db['username'], webpage_db['password'])
         self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                             self.beanstalk_conf['port'])
     except Exception:
         # Not a bare `except:` -- KeyboardInterrupt/SystemExit must pass.
         self.log.error(traceback.format_exc())
示例#10
0
    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the input and output beanstalk connections.

        Args:
            beanstalk_conf: mapping providing ``'host'``, ``'port'``,
                ``'input_tube'`` and ``'output_tube'``.
            log: logger; when falsy a ``LogHandler("i_input_thread")`` is
                created instead.
            process_pool: task pool, stored unchanged.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        # Two separate connections to the same server.
        host = beanstalk_conf['host']
        port = beanstalk_conf['port']
        self.beanstalk = PyBeanstalk(host, port)
        self.out_beanstalk = PyBeanstalk(host, port)
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        self.log = log or LogHandler("i_input_thread")
        self.process_pool = process_pool
        self.wlock = threading.Lock()
示例#11
0
 def __init__(self, conf, convert, select_handler):
     """Keep the collaborators and connect to beanstalk.

     Args:
         conf: mapping providing ``'log'`` and a ``'beanstalk_conf'``
             mapping with ``'host'`` and ``'port'``.
         convert: stored unchanged as ``self.convert``.
         select_handler: stored unchanged as ``self.select_handler``.
     """
     self.conf = conf
     self.log = conf['log']
     self.convert = convert
     self.select_handler = select_handler
     host = conf.get('beanstalk_conf').get('host')
     port = conf.get('beanstalk_conf').get('port')
     self.beanstalk = PyBeanstalk(host, port)
示例#12
0
    def __init__(self, conf, processor=None, proc_name=None):
        """Connect to beanstalk and build the topic -> output tube table.

        Args:
            conf: mapping whose ``'beanstalk_conf'`` entry provides
                ``'host'``, ``'port'``, ``'input_tube'`` and
                ``'output_tube'`` (a tube name or a list of tube
                definitions).
            processor: task processor; required despite the ``None`` default.
            proc_name: label kept only for logging.

        Raises:
            Exception: if ``processor`` is ``None``.
        """
        threading.Thread.__init__(self)
        self.running = True
        self.proc_name = proc_name  # Only for logging
        beanstalk_conf = conf['beanstalk_conf']
        self.input_tube = beanstalk_conf['input_tube']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']

        # Tube definition format, e.g.
        #   ["default_out", "only_special_out:1,2,3:exclusive",
        #    "special_out:4", ":5:exclusive"]
        # Intended routing according to the original note:
        #   topic 1,2,3 -> only_special_out only
        #   topic 4     -> special_out and default_out
        #   topic 5     -> no queue at all
        #   any other   -> default_out
        self.topic_output_tubes = {'default': []}
        # isinstance instead of `type(...) == list` so list subclasses are
        # parsed as definition lists too.
        if isinstance(self.output_tube, list):
            for tube_def in self.output_tube:
                elements = [part.strip() for part in tube_def.split(':')]
                if len(elements) < 2:
                    # Plain tube name without topic ids.
                    self.topic_output_tubes['default'].append(elements[0])
                    continue
                tube_name = elements[0]
                topic_ids = [int(a.strip()) for a in elements[1].split(',')]
                exclusive = len(elements) == 3 and elements[2] == 'exclusive'
                for topic_id in topic_ids:
                    self.topic_output_tubes.setdefault(topic_id, []).append(
                        (tube_name, exclusive))
        else:
            self.topic_output_tubes['default'].append(self.output_tube)

        # NOTE(review): `log` is not a parameter; presumably a module-level
        # logger -- confirm it is defined where this class lives.
        self.log = log
        if processor is None:
            log.error("Processor not given !")
            raise Exception("Processor not given !")
        else:
            self.processor = processor
示例#13
0
 def __init__(self, site, url_pattern=".*", test=True, parser_id=None):
     """Open the beanstalk and MongoDB clients and record the arguments.

     Args:
         site: stored as ``self.site``.
         url_pattern: stored as ``self.url_pattern``.
         test: stored as ``self.test``.
         parser_id: stored as ``self.parser_id``.
     """
     global logger
     # Connections and the output buffer (bounded to 1000 entries).
     self.beanstalk = PyBeanstalk("Crawler-Downloader1:Crawler-Downloader2", 11300)
     self.output_tube = 'download_rsp'
     self.output_queue = queue.Queue(maxsize=1000)
     self.mongo_client = pymongo.MongoClient('Crawler-DataServer1', 40042)
     # Arguments kept as given.
     self.site, self.url_pattern = site, url_pattern
     self.test, self.parser_id = test, parser_id
示例#14
0
def thrput_task():
    """Push 100 rounds of generated requests onto the ``download_req`` tube.

    In every round one request is built with ``getreq`` for each key of
    ``pro``, serialized and put on the tube; the elapsed time since the start
    is printed after each round. Errors for single requests are printed and
    do not stop the run. The transport from ``getclient`` is always closed.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    _client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for i in pro.keys():
                try:
                    req = getreq(proa=i)
                    str_page_info = to_string(req)
                    beanstalk.put(input_tube, str_page_info)
                except Exception as e:
                    print(e)
            print('usetime:{}'.format(time.time() - start))
    finally:
        # Closed even if a round raises something unexpected; it used to
        # leak in that case.
        closetransport(transport)
示例#15
0
    def __init__(self, conf, processor, proc_name=None):
        """Open the beanstalk connections and create the worker pool.

        Args:
            conf: mapping providing ``'beanstalk_conf'`` (``'host'``,
                ``'port'``, ``'input_tube'``, ``'output_tube'``), ``'log'``
                and ``'server'`` (optional ``'process_thread_num'``,
                default 1).
            processor: task processor; must not be ``None``.
            proc_name: accepted but not used here.

        Raises:
            Exception: if ``processor`` is ``None``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        host = conf['beanstalk_conf']['host']
        port = conf['beanstalk_conf']['port']
        self.beanstalk = PyBeanstalk(host, port)
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'],
                                         conf['beanstalk_conf']['port'])
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.output_tube = conf['beanstalk_conf']['output_tube']

        # Fall back to a dedicated logger when the configured one is falsy.
        self.log = conf['log'] or LogHandler("i_input_thread")

        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        first_arg = conf['server'].get("process_thread_num", 1)
        third_arg = int(conf['server'].get("process_thread_num", 1))
        self.processor_pool = ThreadPool(first_arg, {}, third_arg)
        self.wlock = threading.Lock()
示例#16
0
class InputThread(threading.Thread):
    """Daemon thread that feeds beanstalk jobs into a processing pool.

    Jobs are reserved from the input tube, deleted immediately and queued on
    ``process_pool``; results are emitted through ``__output_msg``.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections.

        Args:
            beanstalk_conf: mapping providing ``'host'``, ``'port'``,
                ``'input_tube'`` and ``'output_tube'``.
            log: logger; asserted to be not ``None``.
            process_pool: task pool; asserted to be not ``None``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None

        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop reserving jobs, wait for queued tasks, then close the input connection.

        Polls the pool every 5 seconds until it reports no pending tasks.
        Any exception is logged rather than propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)

            self.beanstalk.__del__()  # close the connection; no more input is accepted
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs and queue them on the pool until stopped.

        A job is deleted from beanstalk before it is processed. When the pool
        reports 50 or more tasks the loop pauses for 2 seconds. On a socket
        error both connections are reconnected after a 30 second pause.
        """
        job_num = 0
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()

                    self.process_pool.queue_task(self.__on_task_start, (body,), self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                # Report the failure right away, then wait before reconnecting.
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                time.sleep(30)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as reconnect_error:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(reconnect_error)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Run ``task`` through the thread-local processor, if there is one.

        Starts the thread-local profiler when present. Returns the value of
        ``processor.do_task(task)``, or ``None`` without a processor.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Emit the task result: a non-empty string, or each item of a list.

        Ends the thread-local profiler when present. The lock is held through
        a ``with`` block so it is released even if emitting a message raises;
        it used to stay locked in that case.
        """
        with self.t_lock:
            proccesor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if result and isinstance(result, basestring):
                self.__output_msg(result, proccesor)
            elif isinstance(result, list):
                for message in result:
                    self.__output_msg(message, proccesor)
示例#17
0
class CrawlSelector(threading.Thread):
    """Daemon thread that moves scheduler requests onto a beanstalk tube."""

    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Initialise the thread state and the outgoing beanstalk client.

        Args:
            log: logger used for all messages.
            selector_conf: mapping; ``run`` reads
                ``'select_seed_sleep_time'`` from it.
            beanstalk_conf: mapping providing ``'host'``, ``'port'`` and
                ``'output_tube'``.
            scheduler: optional object whose ``dispatch()`` yields requests.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log

        # Download statistics.
        self.site_static = {}
        self.scheduler = scheduler
        self.download_req_num = 0
        self.downloader_num = 0

        # Selector configuration.
        self.selector_conf = selector_conf

        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()

    def req_to_string(self, req):
        """Return ``req`` serialized to a string, or ``""`` on failure.

        A failure is logged with its traceback and never raised.
        """
        str_req = ""
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol(tMemory_b)
            req.write(tBinaryProtocol_b)
            str_req = tMemory_b.getvalue()
        except Exception:
            self.log.error('crawled_failt\terror:%s' %
                           (traceback.format_exc()))
        return str_req

    def run(self):
        """Dispatch requests to the output tube until ``running`` is cleared.

        Each round asks the scheduler for requests, puts every request that
        serialized successfully on the tube, then sleeps for
        ``selector_conf['select_seed_sleep_time']`` seconds. A socket error
        triggers a reconnect after 30 seconds.
        """
        self.running = True
        while self.running:
            url = None
            try:
                reqs = self.scheduler.dispatch() if self.scheduler else None
                for req in reqs or ():
                    # Remember the request in flight so the error log below
                    # names it; it used to print None unconditionally.
                    url = req.url
                    req_str = self.req_to_string(req)
                    if not req_str:
                        # Serialization failed and was logged already; do
                        # not queue an empty job or report a started crawl.
                        continue
                    self.out_beanstalk.put(self.output_tube, req_str)
                    self.log.info(
                        'start_crawl\turl:%s\tdownload_type:%s\tsession:%s'
                        % (req.url, req.download_type, req.session_commit))
                time.sleep(self.selector_conf['select_seed_sleep_time'])
            except SocketError:
                # Report the failure right away, then wait before reconnecting.
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                time.sleep(30)
                try:
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error('crawled_failt\turl:%s\terror:%s' %
                               (url, traceback.format_exc()))
示例#18
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info' tube, decode it
    # as a PageParseInfo thrift structure and print it.
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        if job is None:
            # presumably reserve() may return None when no job is available
            # -- avoid an AttributeError on `.body` in that case.
            print('no job available in tube extract_info')
        else:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            print(extractor_info)
    except EOFError as e:
        print(e)
示例#19
0
class SelectProcessor(object):
    """Read stored web pages from MongoDB and re-publish them as DownLoadRsp."""

    def __init__(self, conf):
        """Store the configuration and connect to MongoDB and beanstalk.

        Args:
            conf: mapping providing ``'log'``, ``'beanstalk_conf'`` (with
                ``'host'``, ``'port'``; ``'output_tube'`` is read later) and
                ``'webpage_db'`` (``'host'``, ``'port'``, ``'db'``,
                ``'username'``, ``'password'``).

        A failure while connecting is logged, not raised; the affected client
        attribute then stays ``None``.
        """
        self.log = conf['log']
        self.conf = conf
        self.beanstalk_conf = conf['beanstalk_conf']
        # Define both attributes up front so a failed connection leaves them
        # as None instead of missing.
        self.mongo_client_web = None
        self.beanstalk_client = None
        try:
            webpage_db = self.conf['webpage_db']
            self.mongo_client_web = PyMongo(
                webpage_db['host'], webpage_db['port'], webpage_db['db'],
                webpage_db['username'], webpage_db['password'])
            self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                                self.beanstalk_conf['port'])
        except Exception:
            self.log.error(traceback.format_exc())

    def get_download_rsp(self, result):
        """Build a successful ``DownLoadRsp`` from a stored page document.

        Args:
            result: mapping with ``'url'`` and ``'content'`` and optionally
                ``'content_type'`` (default ``'text/html'``).
        """
        url = result['url']
        content = result['content'].encode('utf-8')
        content_type = result.get('content_type', 'text/html')
        page_size = len(content)
        return DownLoadRsp(url=url,
                           download_time=int(time.time()),
                           status=0,
                           content_type=content_type,
                           page_size=page_size,
                           elapsed=100,
                           content=content,
                           redirect_url=url,
                           src_type='webpage',
                           http_code=200)

    # Batch query by url_format; the caller sends the rows to the queue.
    def select_webpage(self, site, url_format, limit, start, extra_filter):
        """Return the query result for ``site``, or ``None``.

        The collection is the first collection name that ``site`` equals or
        ends with as a dot-separated suffix. ``None`` is returned when no
        collection matches or the query fails (the failure is logged).
        """
        try:
            collection_names = self.mongo_client_web.get_collection_names()
            # TODO: i_util should provide a function computing the main domain.
            domain = ""
            for collection_name in collection_names:
                prefix_domain = "." + collection_name
                if site.endswith(collection_name) or site.endswith(
                        prefix_domain):
                    domain = collection_name
                    break
            if domain:
                item_cursor = self.mongo_client_web.select_by_url_format(
                    domain, site, url_format, limit, start, extra_filter)
                return item_cursor
        except Exception:
            self.log.error(
                "select_webpage\tsite:{0}\turl_format\t{1}\terror:{2}".format(
                    site, url_format, traceback.format_exc()))
        self.log.info(
            "select_webpage\tfinish\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        return None

    def select_webpage_to_mq(self, condition):
        """Query pages described by ``condition`` and put them on the output tube.

        Args:
            condition: mapping with optional ``'url_format'``, ``'site'``,
                ``'limit'`` (default -1), ``'start'`` (default 0) and
                ``'extra_filter'`` (default ``'{}'``).

        Only documents with content are sent. A document that cannot be
        serialized is skipped and not counted.
        """
        url_format = condition.get('url_format', "")
        site = condition.get('site', "")
        limit = int(condition.get('limit', -1))
        start = int(condition.get('start', 0))
        extra_filter = condition.get('extra_filter', '{}')
        self.log.info(
            "select_webpage_mq\tstart\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        req_num = 0
        all_num = start
        if site:
            item_cursor = self.select_webpage(site, url_format, limit, start,
                                              extra_filter)
            if item_cursor:
                download_time = ""
                for item in item_cursor:
                    download_time = item.get("download_time", "")
                    all_num += 1
                    if item.get('content'):
                        download_rsp = self.get_download_rsp(item)
                        download_str = self.to_string(download_rsp)
                        if download_str is not None:
                            req_num += 1
                            self.beanstalk_client.put(
                                self.beanstalk_conf['output_tube'],
                                download_str)
                    # Progress line once per 100 documents.
                    if all_num % 100 == 1:
                        self.log.info(
                            "select_webpage_mq\trunning\tsite:{0}\turl_format:{1}\tall_num:{2}\treq_num:{3}\tdownload_time:{4}"
                            .format(site, url_format, all_num, req_num,
                                    download_time))
        self.log.info(
            "select_webpage_mq\tfinish\tsite:{0}\turl_format:{1}\treq_num:{2}".
            format(site, url_format, req_num))

    def select_webpage_to_list(self, condition):
        """Placeholder; always returns ``None``."""
        return None

    # Query a single document by url; the result is returned to the caller.
    def select_webpage_by_url(self, url):
        """Return a ``DownLoadRsp`` for ``url``.

        When the stored document is missing, has no content, or the lookup
        fails, a response with ``status=1``, ``http_code=0`` and no content
        is returned instead.
        """
        self.log.info("select_webpage_by_url start\turl:{}".format(url))
        url = url_encode(url)
        download_result = DownLoadRsp(url=url,
                                      download_time=int(time.time()),
                                      status=1,
                                      content_type='text/html',
                                      page_size=0,
                                      elapsed=100,
                                      content=None,
                                      redirect_url=url,
                                      src_type='webpage',
                                      http_code=0)
        try:
            query_item = {'url': url}
            domain = get_url_info(url).get('domain')
            result = self.mongo_client_web.find_first(domain, query_item)
            if result and (result.get('content')):
                download_result = self.get_download_rsp(result)
        except Exception:
            self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
                url, traceback.format_exc()))
        self.log.info("select_webpage_by_url finish\turl:{}".format(url))
        return download_result

    def to_string(self, link_info):
        """Return ``link_info`` serialized to a string, or ``None`` on failure.

        Any serialization error is logged and reported as ``None``; only
        ``EOFError`` was handled before.
        """
        str_entity = None
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol.TBinaryProtocol(tMemory_b)
            link_info.write(tBinaryProtocol_b)
            str_entity = tMemory_b.getvalue()
        except Exception:
            self.log.warning("can't write LinkAttr to string")
        return str_entity
示例#20
0
class InputThread(threading.Thread):
    """Daemon thread that feeds beanstalk jobs to a processor via a thread pool.

    Jobs are reserved from the input tube, deleted immediately and queued on
    the pool; results are emitted through ``_output_msg``.
    """

    def __init__(self, conf, processor, proc_name=None):
        """Open the beanstalk connections and create the worker pool.

        Args:
            conf: mapping providing ``'beanstalk_conf'`` (``'host'``,
                ``'port'``, ``'input_tube'``, ``'output_tube'``), ``'log'``
                and ``'server'`` (optional ``'process_thread_num'``,
                default 1).
            processor: object whose ``do_task`` handles each job body; must
                not be ``None``.
            proc_name: accepted but not used here.

        Raises:
            Exception: if ``processor`` is ``None``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.output_tube = conf['beanstalk_conf']['output_tube']

        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        self.processor_pool = ThreadPool(conf['server'].get("process_thread_num", 1),
                                         {},
                                         int(conf['server'].get("process_thread_num", 1)))
        self.wlock = threading.Lock()

    def stop(self):
        """Clear the running flag and wait for the pool's tasks to finish."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs and queue them on the pool until stopped.

        A job is deleted from beanstalk before it is processed. On a socket
        error both connections are reconnected after a 30 second pause.
        """
        job_num = 0
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,), self._on_task_finished)

            except SocketError:
                # Report the failure right away, then wait before reconnecting.
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                time.sleep(30)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # Not a bare `except:` -- KeyboardInterrupt/SystemExit must pass.
                self.log.error("not msg from:%s\tresult:%s" % (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Return ``self.processor.do_task(task)``, or ``None`` if it raised.

        A failure is logged with its traceback; ``e.message`` (deprecated,
        often empty) was logged before.
        """
        result = None
        try:
            result = self.processor.do_task(task)
        except Exception:
            self.log.error(traceback.format_exc())
        return result

    def _on_task_finished(self, task, **thread_locals):
        """Emit the task result: a non-empty string, or each item of a list.

        The lock is held through a ``with`` block so it is released even if
        emitting a message raises; it used to stay locked in that case.
        """
        with self.wlock:
            if task and isinstance(task, basestring):
                self._output_msg(task)
            elif isinstance(task, list):
                for message in task:
                    self._output_msg(message)
示例#21
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

# Debug helper: repeatedly reserves a job from the 'online_extract_info'
# tube and deletes it. The decode-and-print section is wrapped in a string
# literal and therefore never executes.
if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('10.25.114.50')
    try:
        #link_info = DownLoadReq();
        info = PageParseInfo()
        # Runs until an exception is raised; the loop has no exit condition
        # (the `break` at the bottom is commented out).
        while True:
            #print pybeanstalk.stats_tube('download_req')
            job = pybeanstalk.reserve('online_extract_info')
            # Disabled: decode the job body into `info` and print its fields.
            """
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            info.read(tBinaryProtocol_o)
            d = vars(info)
            for k,v in d.items():
                print k,v
            """
            # The job is removed from the tube without being processed.
            job.delete()
            #break;
示例#22
0
class InputThreadNew(threading.Thread):
    """Thread that reserves jobs from a beanstalk tube and runs a processor.

    Every reserved job is deleted from the queue, its body is passed to
    ``processor.do_task`` and any non-``None`` result is handed to
    ``self.output_msg`` together with an optional topic id.

    NOTE(review): ``output_msg`` is called by ``run`` but is not defined in
    the visible part of this class - confirm where it comes from.
    """

    # Processor class names whose ``do_task`` returns ``(resp, topic_id)``
    # rather than a bare ``resp``.
    _TOPIC_AWARE_PROCESSORS = ('ExtractorProccessor',
                               'SingleSrcMergerProccessor')

    def __init__(self, conf, processor=None, proc_name=None):
        """Validate the processor, open the connections and parse the tubes.

        :param conf: dict with a ``'beanstalk_conf'`` entry providing
            ``host``, ``port``, ``input_tube`` and ``output_tube``.
        :param processor: object exposing ``do_task(body)``; required.
        :param proc_name: label kept on the instance, only for logging.
        :raises Exception: when ``processor`` is ``None``.
        """
        threading.Thread.__init__(self)
        self.log = log
        # Fail fast: validate before opening any beanstalk connection so a
        # misconfigured caller does not leave two connections behind.
        if processor is None:
            log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor = processor

        self.running = True
        self.proc_name = proc_name  # Only for logging

        beanstalk_conf = conf['beanstalk_conf']
        self.input_tube = beanstalk_conf['input_tube']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.topic_output_tubes = self._parse_output_tubes(self.output_tube)

    @staticmethod
    def _parse_output_tubes(output_tube):
        """Build the topic -> tubes routing table from the configured value.

        ``output_tube`` is either a single tube (stored under ``'default'``)
        or a list of definitions of the form ``name[:ids[:exclusive]]``,
        e.g.::

            ["default_out", "only_special_out:1,2,3:exclusive",
             "special_out:4", ":5:exclusive"]

        Definitions without ``:`` are appended to ``'default'`` as plain
        names. The others are stored per integer topic id as
        ``(tube_name, exclusive)`` tuples.

        According to the original author's note (the routing itself is done
        by the consumer of this table, not here): topics 1, 2 and 3 only use
        ``only_special_out``; topic 4 goes to both ``special_out`` and
        ``default_out``; topic 5 is not queued at all; every other topic
        uses ``default_out``.

        :raises ValueError: when a topic id is not an integer.
        """
        routes = {'default': []}
        if not isinstance(output_tube, list):
            routes['default'].append(output_tube)
            return routes

        for tube_def in output_tube:
            elements = [part.strip() for part in tube_def.strip().split(':')]
            if len(elements) < 2:
                routes['default'].append(elements[0])
                continue
            tube_name = elements[0]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for raw_id in elements[1].split(','):
                topic_id = int(raw_id.strip())
                routes.setdefault(topic_id, []).append((tube_name, exclusive))
        return routes

    def stop(self):
        """Ask ``run`` to leave its loop after the current iteration."""
        self.log.warning("stop input thread")
        self.running = False

    def _process(self, body):
        """Run the processor on one job body and return ``(resp, topic_id)``.

        Any exception from the processor is logged; ``resp`` is then ``None``.
        """
        resp = None
        topic_id = None
        try:
            if type(self.processor).__name__ in self._TOPIC_AWARE_PROCESSORS:
                resp, topic_id = self.processor.do_task(body)
            else:
                resp = self.processor.do_task(body)
        except Exception:
            self.log.error("Process failed. " + traceback.format_exc())
        return resp, topic_id

    def run(self):
        """Consume the input tube until ``stop`` is called.

        On ``SocketError`` the thread sleeps 30 seconds and then tries to
        reconnect both beanstalk clients; other exceptions raised outside
        the processor call are not handled here.
        """
        self.log.debug("starting input thread")
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if not job:
                    self.log.debug(current_process().name +
                                   " : no msg from : %s" % (self.input_tube))
                    continue
                body = job.body
                # The job is deleted before it is processed, so a task that
                # fails is not redelivered.
                job.delete()
                if self.processor is None:
                    continue
                resp, topic_id = self._process(body)
                if resp is not None:
                    self.output_msg(resp, topic_id)
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
示例#23
0
import os
import json
import traceback
import time
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from i_util.pybeanstalk import PyBeanstalk
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from i_util.logs import LogHandler
import config

# Module-level logger for this script, created with console_out=True
# (presumably mirrors log records to the console - confirm in i_util.logs).
log = LogHandler('re_crawler', console_out=True)
# Beanstalk connection settings are read from the local `config` module.
beanstalk_conf = config.beanstalk_conf
# Shared client built at import time from the configured host and port.
beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])


def req_to_string(req):
    """Serialize a thrift struct with the binary protocol.

    :param req: object exposing ``write(protocol)`` (presumably a
        ``DownLoadReq`` - confirm against callers).
    :return: the serialized string, or ``""`` when serialization fails;
        the failure is logged with its traceback.
    """
    try:
        buf = TMemoryBuffer()
        req.write(TBinaryProtocol(buf))
        return buf.getvalue()
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
        return ""


def create_download_req(url,
class DownloaderProccessor(NormalProccessor):
    """Processor that resolves a company query through the smart proxy.

    ``do_task`` turns a JSON request into an HTTP call against the
    province-specific search site, packs the answer into a serialized
    ``DownLoadRsp`` and notifies the scheduler tube about the crawl status.
    """

    def __init__(self, log, conf):
        """Store the configuration and open the scheduler output connection.

        :param log: logger; must not be ``None``.
        :param conf: dict providing ``type_extractor_map``,
            ``smart_proxy_url`` and ``beanstalk_conf`` (``host``, ``port``,
            ``output_tube_scheduler``).
        """
        self.log = log
        self.conf = conf

        assert log is not None
        assert isinstance(conf, dict)

        self.type_extractor_map = self.conf['type_extractor_map']
        self.smart_proxy_url = self.conf['smart_proxy_url']

        beanstalk_conf = self.conf['beanstalk_conf']
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.output_tube_scheduler = beanstalk_conf['output_tube_scheduler']

    def to_string(self, download_rsp):
        """Serialize ``download_rsp`` with the thrift binary protocol.

        :return: the serialized string, or ``None`` when an ``EOFError`` is
            raised while writing (a warning is logged in that case).
        """
        str_rsq = None
        try:
            t_memory_b = TMemoryBuffer()
            t_binary_protocol_b = TBinaryProtocol(t_memory_b)
            download_rsp.write(t_binary_protocol_b)
            str_rsq = t_memory_b.getvalue()
        except EOFError:
            self.log.warning("cann't write PageParseInfo to string")
        return str_rsq

    def _failure_response(self):
        """Return a serialized ``DownLoadRsp`` carrying ``CRAWL_FAILT``."""
        return self.to_string(DownLoadRsp(status=CrawlStatus.CRAWL_FAILT))

    def do_task(self, body):
        """Fetch one company page through the proxy and build the response.

        :param body: JSON text with at least ``_type``, ``name`` and
            ``province``.
        :return: serialized ``DownLoadRsp``; its status is ``CRAWL_SUCCESS``
            on success and ``CRAWL_FAILT`` when the HTTP status is not 200
            or any step raises.
        """
        try:
            download_req = json.loads(body)
            self.log.info("request_msg\t%s" % download_req)

            target_extractor_id = self.type_extractor_map[download_req['_type']]

            name = download_req['name'].encode('utf-8')
            query = urllib.urlencode({
                'name': name,
                'original_query': json.dumps(download_req)
            })
            target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
                'site': prov_site_map[download_req['province']],
                'query': query,
            }
            self.log.info('请求代理企业名称: name = {name}'.format(name=name))

            # NOTE(review): no timeout is passed, so a stalled proxy blocks
            # this worker indefinitely - consider adding one.
            response = requests.get(target_url, proxies={'http': self.smart_proxy_url})
            if response.status_code != 200:
                return self._failure_response()

            self.log.debug(response.text)

            resp_json = response.json()

            # Assemble the DownLoadRsp fields.
            content = str_obj(resp_json['html'])
            if content is None:
                content = '<html></html>'
            resp = {
                'url': str_obj(resp_json['url']),
                'download_time': resp_json.get('entitySrcDownloadTime', 0),
                'pages': [],
                'content': content,
                'data_extends': str_obj(json.dumps(resp_json['entity'])),
                'parse_extends': str_obj(json.dumps({"parser_id": target_extractor_id})),
                'page_size': len(content),
                'content_type': 'text/html',
                'src_type': 'webpage',
                'http_code': response.status_code,
                # ``timedelta.microseconds`` is only the sub-second component,
                # so requests slower than one second were under-reported;
                # ``total_seconds()`` covers the whole duration.
                'elapsed': int(response.elapsed.total_seconds() * 1000),
                'status': CrawlStatus.CRAWL_SUCCESS,
            }
            download_rsp = DownLoadRsp(**resp)

            self.log.info('发送到解析器的 name = {name} url = {url}'.format(name=name, url=resp['url']))

            # Report the crawl status to the scheduler tube.
            company_name = resp_json['entity'].get('company')
            self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
                'company': company_name,
                'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
                'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
                'query': resp_json['crawlSeed'],
            }))
            # ``company`` may be missing; encoding ``None`` used to raise here
            # and turned an already-forwarded success into CRAWL_FAILT.
            self.log.info('发送企业名称到工商调度消息队列: comapny = {company}'.format(
                company=(company_name or u'').encode('utf-8')))
            return self.to_string(download_rsp)
        except Exception as err:
            self.log.error("process failed, err[%s]" % (repr(err)))
            self.log.exception(err)
            return self._failure_response()

    def do_output(self, body):
        """Always return ``True``; ``body`` is not used."""
        return True
示例#25
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info_ws' tube, decode
    # its body as a thrift PageParseInfo, print the fields and delete the job.
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            rsp_info.read(tBinaryProtocol_o)
            fields = vars(rsp_info)
            # Single-argument print(...) and ``except ... as`` behave the
            # same on Python 2 and are also valid on Python 3.
            print(fields)
            for k, v in fields.items():
                print("%s %s" % (k, v))
            # Only reached when decoding succeeded; on EOFError the job is
            # left undeleted.
            job.delete()
    except EOFError as e:
        print(e)