Exemplo n.º 1
0
    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Initialise the thread state and open two beanstalk connections.

        Args:
            beanstalk_conf: mapping read for the keys 'host', 'port',
                'input_tube' and 'output_tube'.
            log: logger to use; when falsy, a
                ``LogHandler("i_input_thread")`` is created instead.
            process_pool: stored unchanged on ``self.process_pool``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        host, port = beanstalk_conf['host'], beanstalk_conf['port']
        # Two independent connections to the same server; judging by the
        # names, ``out_beanstalk`` is presumably the publishing side.
        self.beanstalk = PyBeanstalk(host, port)
        self.out_beanstalk = PyBeanstalk(host, port)
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        # Fall back to a default logger when none (or a falsy one) is given.
        self.log = log if log else LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.wlock = threading.Lock()
Exemplo n.º 2
0
    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Initialise the (not yet running) thread and its output queue.

        Args:
            log: logger stored on ``self.log``.
            selector_conf: selector configuration, stored unchanged.
            beanstalk_conf: mapping read for 'host', 'port' and
                'output_tube'; also stored unchanged.
            scheduler: optional scheduler object, stored unchanged.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log
        self.scheduler = scheduler

        # Download statistics.
        self.site_static = {}
        self.download_req_num = 0

        # Downloader bookkeeping.  Building a list of ThriftDownloader
        # clients from a ``downloader_conf`` used to happen here; that code
        # is disabled, so only the counter remains.
        self.downloader_num = 0

        # Selector configuration.
        self.selector_conf = selector_conf

        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        host, port = beanstalk_conf['host'], beanstalk_conf['port']
        self.out_beanstalk = PyBeanstalk(host, port)
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()
Exemplo n.º 3
0
 def __init__(self, conf, convert, select_handler):
     """Store the collaborators and connect to beanstalk.

     Args:
         conf: mapping providing 'log' (required key) and
             'beanstalk_conf', itself a mapping with 'host' and 'port'.
         convert: stored unchanged on ``self.convert``.
         select_handler: stored unchanged on ``self.select_handler``.
     """
     self.conf = conf
     self.log = conf['log']
     self.convert = convert
     self.select_handler = select_handler

     # Missing 'host'/'port' yield None rather than raising, because
     # ``.get`` is used for both lookups.
     host = conf.get('beanstalk_conf').get('host')
     port = conf.get('beanstalk_conf').get('port')
     self.beanstalk = PyBeanstalk(host, port)
Exemplo n.º 4
0
    def __init__(self, conf, processor=None, proc_name=None):
        """Create the worker thread and build its topic -> output-tube table.

        Args:
            conf: mapping whose 'beanstalk_conf' entry provides 'host',
                'port', 'input_tube' and 'output_tube'.  'output_tube' is
                either a single tube name or a list of tube definitions.
            processor: object stored on ``self.processor``; required.
            proc_name: only used for logging.

        Raises:
            Exception: if ``processor`` is None (raised before any
                connection is opened).
            ValueError: if a topic id in a tube definition is not an integer.

        Tube definitions have the form ``name[:id,id,...[:exclusive]]``.
        Example (translated from the original author's note)::

            ["default_out", "only_special_out:1,2,3:exclusive",
             "special_out:4", ":5:exclusive"]

            topic ids 1, 2, 3 -> only ``only_special_out``
            topic id 4        -> ``special_out`` and ``default_out``
            topic id 5        -> no queue at all
            any other id      -> ``default_out``

        This method only fills ``self.topic_output_tubes``: plain names go
        under the 'default' key, and every listed topic id maps to a list of
        ``(tube_name, exclusive)`` tuples.  The routing described above is
        applied by code outside this method.
        """
        threading.Thread.__init__(self)
        self.running = True
        self.proc_name = proc_name  # Only for logging

        # NOTE(review): ``log`` is not a parameter of this method; it is
        # resolved from an enclosing (presumably module-level) scope.
        self.log = log
        # Validate first so a rejected call does not leave two open
        # beanstalk connections behind.
        if processor is None:
            log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor = processor

        beanstalk_conf = conf['beanstalk_conf']
        self.input_tube = beanstalk_conf['input_tube']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']

        self.topic_output_tubes = {'default': []}
        if not isinstance(self.output_tube, list):
            # A single tube name: everything goes to it by default.
            self.topic_output_tubes['default'].append(self.output_tube)
            return

        for tube_def in self.output_tube:
            elements = [part.strip() for part in tube_def.strip().split(':')]
            if len(elements) < 2:
                # No topic list: this is a default tube.
                self.topic_output_tubes['default'].append(elements[0])
                continue

            tube_name = elements[0]
            # Parse every id before touching the table so a malformed
            # definition contributes nothing.
            topic_ids = [int(raw_id.strip())
                         for raw_id in elements[1].split(',')]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for topic_id in topic_ids:
                self.topic_output_tubes.setdefault(topic_id, []).append(
                    (tube_name, exclusive))
Exemplo n.º 5
0
 def __init__(self, beanstalk_conf, log):
     """Create the internal queue, connect to beanstalk and set up the thread.

     Args:
         beanstalk_conf: mapping read for 'host' and 'port'; also kept on
             ``self.beanstalk_conf``.
         log: logger stored on ``self._log``.
     """
     self._queue = Queue()
     self._log = log

     host, port = beanstalk_conf['host'], beanstalk_conf['port']
     self.beanstalk = PyBeanstalk(host, port)
     self.beanstalk_conf = beanstalk_conf

     # ``daemon`` may only be set once the Thread base is initialised.
     threading.Thread.__init__(self)
     self.daemon = True
     self.running = True
Exemplo n.º 6
0
def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured input tube.

    A new beanstalk connection is opened on every call.  A failing put is
    logged (with the exception) and not re-raised.

    Args:
        beanstalk_conf: mapping providing 'host', 'port' and 'input_tube'.
        log: logger; ``info`` is used on success, ``error`` on failure.
        rsp: object exposing ``url``; it is serialized with ``to_string``.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        # %-formatting (not '+') so a non-string url/tube cannot raise a
        # TypeError from inside the error handler, and the cause is kept.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%s'
                  % (rsp.url, tube, e))
Exemplo n.º 7
0
 def __init__(self, site, url_pattern=".*", test=True, parser_id=None):
     """Remember the target site settings and open the backing services.

     Args:
         site: stored unchanged on ``self.site``.
         url_pattern: stored unchanged; defaults to ".*".
         test: stored unchanged; defaults to True.
         parser_id: stored unchanged; defaults to None.
     """
     global logger
     self.site = site
     self.url_pattern = url_pattern
     self.test = test
     self.parser_id = parser_id

     # Output side: a bounded in-process queue plus the beanstalk tube name.
     self.output_tube = 'download_rsp'
     self.output_queue = queue.Queue(maxsize=1000)

     # Service endpoints are hard-coded.
     self.beanstalk = PyBeanstalk("Crawler-Downloader1:Crawler-Downloader2", 11300)
     self.mongo_client = pymongo.MongoClient('Crawler-DataServer1', 40042)
Exemplo n.º 8
0
def deliver_req(host='172.18.180.223', port=11300,
                tube='online_download_req', idle_sleep=6):
    """Forward items from ``index_queue`` to a beanstalk tube, forever.

    Each queue item is a ``(priority, reqs)`` pair; ``reqs`` is serialized
    with ``req_to_string`` and put on ``tube``.  When the queue is empty the
    loop sleeps for ``idle_sleep`` seconds before polling again.  (The
    original placed the sleep after ``continue``, so it never ran and the
    loop spun at full speed on an empty queue.)

    Args:
        host: beanstalk host.
        port: beanstalk port.
        tube: destination tube name.
        idle_sleep: seconds to wait after finding the queue empty.

    This function does not return.
    """
    out_beanstalk = PyBeanstalk(host, port)
    while True:
        try:
            _priority, reqs = index_queue.get_nowait()
        except Empty:
            time.sleep(idle_sleep)
            continue
        out_beanstalk.put(tube, req_to_string(reqs))
Exemplo n.º 9
0
    def __init__(self, log, conf):
        """Store the logger/config and connect the outgoing beanstalk queue.

        Args:
            log: logger; must not be None.
            conf: dict providing 'type_extractor_map', 'smart_proxy_url' and
                'beanstalk_conf' (with 'host', 'port' and
                'output_tube_scheduler').

        Raises:
            AssertionError: if ``log`` is None or ``conf`` is not a dict
                (only when assertions are enabled).
        """
        self.log = log
        self.conf = conf

        assert log is not None
        assert isinstance(conf, dict)

        self.type_extractor_map = conf['type_extractor_map']
        self.smart_proxy_url = conf['smart_proxy_url']

        queue_conf = conf['beanstalk_conf']
        self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
        self.output_tube_scheduler = queue_conf['output_tube_scheduler']
Exemplo n.º 10
0
 def put_beanstalkd(self, tube_name, obj):
     """Serialize ``obj`` and put it on ``tube_name``.

     Errors raised by the put are logged instead of propagated.  On a
     ``SocketError`` the beanstalk connection is re-created from
     ``self.beanstalk_conf``.

     Args:
         tube_name: destination tube.
         obj: object passed to ``self.to_string`` for serialization.
     """
     str_page_info = self.to_string(obj)
     try:
         self.beanstalk.put(tube_name, str_page_info)
         self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
     except SocketError as e:
         # Format the exception itself: ``e.message`` is deprecated and
         # does not exist on Python 3.
         self._log.error('beanstalk connect failed, {}'.format(e))
         # NOTE(review): the payload that hit the socket error is not
         # re-sent after reconnecting.
         self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                      self.beanstalk_conf['port'])
     except Exception:
         self._log.error('beanstalk put tube {} error {}'.format(
             tube_name, traceback.format_exc()))
Exemplo n.º 11
0
    def __init__(self, conf, processor, proc_name=None):
        """Set up the thread, its beanstalk connections and a processor pool.

        Args:
            conf: mapping providing 'beanstalk_conf' ('host', 'port',
                'input_tube', 'output_tube'), 'log' and 'server'
                (optional key 'process_thread_num', default 1).
            processor: object stored on ``self.processor``; required.
            proc_name: accepted but not used in this method.

        Raises:
            Exception: if ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        queue_conf = conf['beanstalk_conf']
        self.beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
        self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
        self.input_tube = queue_conf['input_tube']
        self.output_tube = queue_conf['output_tube']

        # Use the configured logger, or a default one when it is falsy.
        self.log = conf['log'] or LogHandler("i_input_thread")

        self.processor = processor
        if processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        # NOTE(review): the first argument is passed as configured while the
        # third is int()-cast; confirm against ThreadPool's signature.
        thread_num = conf['server'].get("process_thread_num", 1)
        self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
        self.wlock = threading.Lock()
Exemplo n.º 12
0
 def __init__(self, conf):
     """Store the configuration and try to open the Mongo/beanstalk clients.

     Connection failures are logged, not raised, so after a failure
     ``self.mongo_client_web`` and/or ``self.beanstalk_client`` may not be
     set on the instance.

     Args:
         conf: mapping providing 'log', 'beanstalk_conf' ('host', 'port')
             and 'webpage_db' ('host', 'port', 'db', 'username',
             'password').
     """
     self.log = conf['log']
     self.conf = conf
     self.beanstalk_conf = conf['beanstalk_conf']
     try:
         webpage_db = self.conf['webpage_db']
         self.mongo_client_web = PyMongo(
             webpage_db['host'], webpage_db['port'], webpage_db['db'],
             webpage_db['username'], webpage_db['password'])
         self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                             self.beanstalk_conf['port'])
     except Exception:
         # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
         # and SystemExit still propagate.
         self.log.error(traceback.format_exc())
Exemplo n.º 13
0
def thrput_task():
    """Push generated download requests into beanstalk for a throughput run.

    Performs 100 rounds; in each round one request is built with ``getreq``
    for every key of ``pro``, serialized with ``to_string`` and put on the
    'download_req' tube.  The elapsed time since start is printed after
    every round.  Per-request errors are printed and skipped.  The transport
    returned by ``getclient`` is closed even if the run is interrupted.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    # Only the transport is needed here (for closing); the client is unused.
    _client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for i in pro.keys():
                try:
                    req = getreq(proa=i)
                    beanstalk.put(input_tube, to_string(req))
                except Exception as e:
                    print(e)
            print('usetime:{}'.format(time.time() - start))
    finally:
        closetransport(transport)
Exemplo n.º 14
0
import os
import json
import traceback
import time
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from i_util.pybeanstalk import PyBeanstalk
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from i_util.logs import LogHandler
import config

# Module-level logger for this script (console_out=True presumably mirrors
# the output to the console -- confirm against LogHandler).
log = LogHandler('re_crawler', console_out=True)
# Beanstalk settings are taken from the local ``config`` module.
beanstalk_conf = config.beanstalk_conf
# Shared client, created once when this module is loaded.
beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])


def req_to_string(req):
    """Serialize ``req`` with the thrift binary protocol.

    Args:
        req: object exposing ``write(protocol)``.

    Returns:
        The serialized bytes, or "" if serialization fails (the traceback
        is logged in that case).
    """
    try:
        buf = TMemoryBuffer()
        req.write(TBinaryProtocol(buf))
        return buf.getvalue()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not swallowed.
        log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
        return ""


def create_download_req(url,
Exemplo n.º 15
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info' tube, decode it
    # as a PageParseInfo and print it.  The job is not deleted.
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        # Guard against an empty reserve result instead of failing on
        # ``.body`` (the sibling ws script checks the job the same way).
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            print(extractor_info)
    except EOFError as e:
        print(e)
Exemplo n.º 16
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from 'extract_info_ws', decode it as a
    # PageParseInfo, dump its attributes and delete the job.
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            rsp_info.read(tBinaryProtocol_o)
            d = vars(rsp_info)
            print(d)
            for k, v in d.items():
                # Explicit formatting prints "key value" identically on
                # Python 2 and 3.
                print('%s %s' % (k, v))
            job.delete()
    except EOFError as e:
        print(e)
Exemplo n.º 17
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('10.25.114.50')
    try:
        #link_info = DownLoadReq();
        info = PageParseInfo()
        while True:
            #print pybeanstalk.stats_tube('download_req')
            job = pybeanstalk.reserve('online_extract_info')
            """
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            info.read(tBinaryProtocol_o)
            d = vars(info)
            for k,v in d.items():
                print k,v
            """
            job.delete()
            #break;