def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured beanstalk input tube.

    Args:
        beanstalk_conf: Mapping that provides ``'host'``, ``'port'`` and
            ``'input_tube'``.
        log: Logger-like object (``info``/``error``); it is also forwarded to
            ``to_string``.
        rsp: Object to serialize; its ``url`` attribute is used in the log
            messages.

    A failure of the ``put`` itself is logged and swallowed (best effort).
    Errors raised while connecting or serializing are not handled here and
    propagate to the caller.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        # %-formatting (instead of ``+`` concatenation) keeps this handler from
        # raising TypeError itself when ``rsp.url`` is not a string, and the
        # caught error is reported instead of being thrown away.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%s'
                  % (rsp.url, tube, e))
def deliver_req():
    """Forward requests from ``index_queue`` to beanstalk, forever.

    Each queue item is unpacked as ``(priority, reqs)``; ``reqs`` is
    serialized with ``req_to_string`` and put on the ``online_download_req``
    tube, after which the loop pauses for 6 seconds.  This function never
    returns.
    """
    out_beanstalk = PyBeanstalk('172.18.180.223', 11300)
    while True:
        try:
            # Wait for an item with a short timeout instead of polling with
            # get_nowait(): the old loop spun at full CPU while the queue was
            # empty.
            # NOTE(review): assumes index_queue follows the stdlib Queue API
            # (get(timeout=...) raising Empty) -- confirm against its definition.
            _priority, reqs = index_queue.get(timeout=1)
        except Empty:
            continue
        req_str = req_to_string(reqs)
        out_beanstalk.put('online_download_req', req_str)
        time.sleep(6)
class PutBeanstaldServer(threading.Thread):
    """Background thread that serializes queued objects onto beanstalk tubes.

    Producers call :meth:`save_record` with a dict holding ``'tube_name'`` and
    ``'obj'``; the thread pops records from an internal queue, serializes
    ``obj`` and puts the result on the named tube.
    """

    def __init__(self, beanstalk_conf, log):
        """Create the worker (not started).

        Args:
            beanstalk_conf: Mapping with at least ``'host'`` and ``'port'``.
            log: Logger-like object (``info``/``warning``/``error``).
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self._queue = Queue()
        self._log = log
        self.beanstalk_conf = beanstalk_conf
        self.beanstalk = self._connect()

    def _connect(self):
        """Open a new beanstalk client from the stored configuration."""
        return PyBeanstalk(self.beanstalk_conf['host'],
                           self.beanstalk_conf['port'])

    def to_string(self, page_info):
        """Serialize ``page_info`` through its ``write`` method.

        Returns:
            The serialized value, or ``None`` when serialization fails (the
            failure is logged).
        """
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryServerProtocol(tMemory_b)
            page_info.write(tBinaryProtocol_b)
            return tMemory_b.getvalue()
        except Exception:
            # Any serialization error (not only EOFError) must be contained:
            # put_beanstalkd calls this outside its own try block, so an
            # escaping exception would terminate the worker thread.
            self._log.warning("cann't write data to string\t{}".format(
                traceback.format_exc()))
            return None

    def put_beanstalkd(self, tube_name, obj):
        """Serialize ``obj`` and put it on ``tube_name``.

        On a ``SocketError`` the client is re-created and the put is retried
        once.  All failures are logged; nothing is raised to the caller.
        """
        str_page_info = self.to_string(obj)
        if str_page_info is None:
            # Serialization failed and was already logged; do not enqueue None.
            return
        try:
            self.beanstalk.put(tube_name, str_page_info)
            self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
        except SocketError as e:
            self._log.error('beanstalk connect failed, {}'.format(e))
            self._reconnect_and_retry(tube_name, str_page_info)
        except Exception:
            self._log.info('beanstalk put tube{} error {}'.format(
                tube_name, str(traceback.format_exc())))

    def _reconnect_and_retry(self, tube_name, payload):
        """Re-create the client and retry one put so the message is not lost."""
        try:
            self.beanstalk = self._connect()
            self.beanstalk.put(tube_name, payload)
            self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
        except Exception:
            # A failed reconnect must not propagate out of run().
            self._log.error('beanstalk put tube{} error {}'.format(
                tube_name, str(traceback.format_exc())))

    def run(self):
        """Process queued records until ``self.running`` is cleared.

        ``get()`` blocks, so clearing the flag only takes effect after the
        next record has been handled.
        """
        while self.running:
            record = self._queue.get()
            self._build_record_and_put(record)

    def get_tube_by_name(self, tube_name):
        """Return ``beanstalk_conf[tube_name]``, or ``None`` when absent."""
        return self.beanstalk_conf.get(tube_name, None)

    def _build_record_and_put(self, data):
        """Put ``data['obj']`` on ``data['tube_name']``; skip if either is falsy."""
        tube_name = data.get('tube_name', None)
        if not tube_name:
            return
        obj = data.get('obj', None)
        if not obj:
            return
        self.put_beanstalkd(tube_name, obj)

    def save_record(self, data):
        """Queue ``data`` for asynchronous delivery by the worker thread."""
        self._queue.put(data)
def thrput_task():
    """Push a batch of generated requests to beanstalk and print the elapsed time.

    Runs 100 rounds; in each round one request is built per key of ``pro``
    (via ``getreq``), serialized with ``to_string`` and put on the
    ``download_req`` tube.  A failure for a single request is printed and
    skipped.  The transport returned by ``getclient`` is always closed.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    _client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for proa in pro.keys():
                try:
                    req = getreq(proa=proa)
                    str_page_info = to_string(req)
                    beanstalk.put(input_tube, str_page_info)
                except Exception as e:
                    # ``message`` only exists on Python 2 exceptions; fall
                    # back to the exception itself elsewhere.
                    print(getattr(e, 'message', e))
        print('usetime:{}'.format(time.time() - start))
    finally:
        # Close the transport even when the run is interrupted
        # (e.g. KeyboardInterrupt is not caught by the handler above).
        closetransport(transport)
class CrawlSelector(threading.Thread):
    """Daemon thread that moves scheduled requests onto a beanstalk tube.

    While running, it repeatedly asks the scheduler for requests, serializes
    each one and puts it on ``beanstalk_conf['output_tube']``.
    """

    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Create the selector (not started).

        Args:
            log: Logger-like object (``info``/``error``).
            selector_conf: Mapping; ``'select_seed_sleep_time'`` is the pause
                between dispatch rounds, passed to ``time.sleep``.
            beanstalk_conf: Mapping with ``'host'``, ``'port'`` and
                ``'output_tube'``.
            scheduler: Optional object whose ``dispatch()`` returns an
                iterable of requests.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log
        # Download statistics.
        self.site_static = {}
        self.scheduler = scheduler
        self.download_req_num = 0
        # Downloader bookkeeping (no downloaders are configured here).
        self.downloader_num = 0
        # Selector configuration.
        self.selector_conf = selector_conf
        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()

    def req_to_string(self, req):
        """Serialize ``req`` through its ``write`` method.

        Returns:
            The serialized value, or ``""`` when serialization fails (the
            traceback is logged).
        """
        str_req = ""
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol(tMemory_b)
            req.write(tBinaryProtocol_b)
            str_req = tMemory_b.getvalue()
        except Exception:
            self.log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
        return str_req

    def run(self):
        """Dispatch requests to beanstalk until ``self.running`` is cleared.

        A ``SocketError`` triggers a 30 second pause followed by a reconnect
        attempt; any other error is logged and the loop continues.
        """
        self.running = True
        while self.running:
            # URL of the request being handled, for the error log below.
            url = None
            try:
                reqs = self.scheduler.dispatch() if self.scheduler else None
                for req in reqs or ():
                    url = req.url
                    req_str = self.req_to_string(req)
                    if not req_str:
                        # Serialization failed (already logged); do not put
                        # an empty job on the tube.
                        continue
                    self.out_beanstalk.put(self.output_tube, req_str)
                    self.log.info(
                        'start_crawl\turl:%s\tdownload_type:%s\tsession:%s'
                        % (req.url, req.download_type, req.session_commit))
                url = None
                time.sleep(self.selector_conf['select_seed_sleep_time'])
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error('crawled_failt\turl:%s\terror:%s'
                               % (url, traceback.format_exc()))
class SelectProcessor(object):
    """Look up stored web pages in MongoDB and republish them as responses."""

    def __init__(self, conf):
        """Create the MongoDB and beanstalk clients from ``conf``.

        Args:
            conf: Mapping with ``'log'``, ``'beanstalk_conf'`` (``'host'``,
                ``'port'``, ``'output_tube'``) and ``'webpage_db'``
                (``'host'``, ``'port'``, ``'db'``, ``'username'``,
                ``'password'``).

        Client construction is best effort: a failure is logged and the
        affected client attribute stays ``None``.
        """
        self.log = conf['log']
        self.conf = conf
        self.beanstalk_conf = conf['beanstalk_conf']
        self.mongo_client_web = None
        self.beanstalk_client = None
        try:
            db_conf = self.conf['webpage_db']
            self.mongo_client_web = PyMongo(
                db_conf['host'], db_conf['port'], db_conf['db'],
                db_conf['username'], db_conf['password'])
            self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                                self.beanstalk_conf['port'])
        except Exception:
            self.log.error(traceback.format_exc())

    def get_download_rsp(self, result):
        """Build a successful ``DownLoadRsp`` from a stored page document.

        ``result`` must provide ``'url'`` and ``'content'``;
        ``'content_type'`` defaults to ``'text/html'``.
        """
        url = result['url']
        content = result['content'].encode('utf-8')
        content_type = result.get('content_type', 'text/html')
        page_size = len(content)
        return DownLoadRsp(url=url,
                           download_time=int(time.time()),
                           status=0,
                           content_type=content_type,
                           page_size=page_size,
                           elapsed=100,
                           content=content,
                           redirect_url=url,
                           src_type='webpage',
                           http_code=200)

    @staticmethod
    def _match_domain(site, collection_names):
        """Return the first collection name that is ``site`` or a parent domain of it.

        Matching requires equality or a ``"." + name`` suffix, so
        ``badexample.com`` does not match a collection named ``example.com``.
        Returns ``""`` when nothing matches.
        """
        for collection_name in collection_names:
            if site == collection_name or site.endswith("." + collection_name):
                return collection_name
        return ""

    # Batch query by url_format; the results are sent to the queue by
    # select_webpage_to_mq.
    def select_webpage(self, site, url_format, limit, start, extra_filter):
        """Query the pages of ``site`` that match ``url_format``.

        Returns:
            Whatever ``select_by_url_format`` returns for the matching
            collection, or ``None`` when no collection matches ``site`` or
            the query fails (the failure is logged).
        """
        try:
            collection_names = self.mongo_client_web.get_collection_names()
            # TODO: i_util should provide a function that computes the
            # primary domain.
            domain = self._match_domain(site, collection_names)
            if domain:
                item_cursor = self.mongo_client_web.select_by_url_format(
                    domain, site, url_format, limit, start, extra_filter)
                return item_cursor
        except Exception:
            self.log.error(
                "select_webpage\tsite:{0}\turl_format\t{1}\terror:{2}".format(
                    site, url_format, traceback.format_exc()))
        self.log.info(
            "select_webpage\tfinish\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        return None

    def select_webpage_to_mq(self, condition):
        """Query pages described by ``condition`` and put them on the output tube.

        ``condition`` keys (all optional): ``'url_format'``, ``'site'``,
        ``'limit'`` (default -1), ``'start'`` (default 0) and
        ``'extra_filter'`` (default ``'{}'``).  Items without content, or
        whose serialization fails, are skipped.
        """
        url_format = condition.get('url_format', "")
        site = condition.get('site', "")
        limit = int(condition.get('limit', -1))
        start = int(condition.get('start', 0))
        extra_filter = condition.get('extra_filter', '{}')
        self.log.info(
            "select_webpage_mq\tstart\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        req_num = 0
        all_num = start
        if site:
            item_cursor = self.select_webpage(site, url_format, limit, start,
                                              extra_filter)
            if item_cursor:
                download_time = ""
                for item in item_cursor:
                    download_time = item.get("download_time", "")
                    all_num += 1
                    if item.get('content'):
                        download_rsp = self.get_download_rsp(item)
                        download_str = self.to_string(download_rsp)
                        # to_string returns None on failure; never enqueue it.
                        if download_str is not None:
                            req_num += 1
                            self.beanstalk_client.put(
                                self.beanstalk_conf['output_tube'],
                                download_str)
                    # Progress line once every 100 scanned items.
                    if all_num % 100 == 1:
                        self.log.info(
                            "select_webpage_mq\trunning\tsite:{0}\turl_format:{1}\tall_num:{2}\treq_num:{3}\tdownload_time:{4}"
                            .format(site, url_format, all_num, req_num,
                                    download_time))
        self.log.info(
            "select_webpage_mq\tfinish\tsite:{0}\turl_format:{1}\treq_num:{2}".
            format(site, url_format, req_num))

    def select_webpage_to_list(self, condition):
        """Not implemented; always returns ``None``."""
        return None

    # Query a single document by url and return it as a response.
    def select_webpage_by_url(self, url):
        """Return a ``DownLoadRsp`` for ``url`` from the stored pages.

        When nothing with content is found, or the lookup fails, the returned
        response has ``status=1``, ``http_code=0`` and no content.
        """
        self.log.info("select_webpage_by_url start\turl:{}".format(url))
        url = url_encode(url)
        download_result = DownLoadRsp(url=url,
                                      download_time=int(time.time()),
                                      status=1,
                                      content_type='text/html',
                                      page_size=0,
                                      elapsed=100,
                                      content=None,
                                      redirect_url=url,
                                      src_type='webpage',
                                      http_code=0)
        try:
            query_item = {'url': url}
            domain = get_url_info(url).get('domain')
            result = self.mongo_client_web.find_first(domain, query_item)
            if result and (result.get('content')):
                download_result = self.get_download_rsp(result)
        except Exception:
            self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
                url, traceback.format_exc()))
        self.log.info("select_webpage_by_url finish\turl:{}".format(url))
        return download_result

    def to_string(self, link_info):
        """Serialize ``link_info`` through its ``write`` method.

        Returns:
            The serialized value, or ``None`` if an ``EOFError`` occurs.
        """
        str_entity = None
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol.TBinaryProtocol(tMemory_b)
            link_info.write(tBinaryProtocol_b)
            str_entity = tMemory_b.getvalue()
        except EOFError:
            self.log.warning("can't write LinkAttr to string")
        return str_entity
class DownloaderProccessor(NormalProccessor):
    """Fetch a company page through the smart proxy and wrap it as a DownLoadRsp."""

    def __init__(self, log, conf):
        """Store the configuration and open the beanstalk client.

        Args:
            log: Logger-like object; must not be ``None``.
            conf: Dict with ``'type_extractor_map'``, ``'smart_proxy_url'``
                and ``'beanstalk_conf'`` (``'host'``, ``'port'``,
                ``'output_tube_scheduler'``).
        """
        assert log is not None
        assert isinstance(conf, dict)
        self.log = log
        self.conf = conf
        self.type_extractor_map = self.conf['type_extractor_map']
        self.smart_proxy_url = self.conf['smart_proxy_url']
        beanstalk_conf = self.conf['beanstalk_conf']
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube_scheduler = beanstalk_conf['output_tube_scheduler']

    def to_string(self, download_rsp):
        """Serialize ``download_rsp`` through its ``write`` method.

        Returns:
            The serialized value, or ``None`` if an ``EOFError`` occurs.
        """
        str_rsq = None
        try:
            t_memory_b = TMemoryBuffer()
            t_binary_protocol_b = TBinaryProtocol(t_memory_b)
            download_rsp.write(t_binary_protocol_b)
            str_rsq = t_memory_b.getvalue()
        except EOFError:
            self.log.warning("cann't write PageParseInfo to string")
        return str_rsq

    def _failed_rsp(self):
        """Return the serialized response that signals a failed crawl."""
        return self.to_string(DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, ))

    def do_task(self, body):
        """Handle one download request and return a serialized ``DownLoadRsp``.

        ``body`` is a JSON document with at least ``'_type'``, ``'name'`` and
        ``'province'``.  The company is looked up through the smart proxy;
        on success the crawl status is also put on the scheduler tube.  Any
        error, or a non-200 proxy answer, produces a ``CRAWL_FAILT`` response
        instead of raising.
        """
        try:
            download_req = json.loads(body)
            self.log.info("request_msg\t%s" % download_req)
            target_extractor_id = self.type_extractor_map[download_req['_type']]
            name = download_req['name'].encode('utf-8')
            target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
                'site': prov_site_map[download_req['province']],
                'query': urllib.urlencode({
                    'name': name,
                    'original_query': json.dumps(download_req)
                })
            }
            self.log.info('请求代理企业名称: name = {name}'.format(name=name))
            # NOTE(review): no timeout is passed, so this call can block
            # indefinitely if the proxy never answers -- consider adding one.
            response = requests.get(target_url,
                                    proxies={'http': self.smart_proxy_url})
            if response.status_code != 200:
                return self._failed_rsp()
            self.log.debug(response.text)
            resp_json = response.json()

            # Assemble the DownLoadRsp fields.
            content = str_obj(resp_json['html'])
            if content is None:
                content = '<html></html>'
            resp = {
                'url': str_obj(resp_json['url']),
                'download_time': resp_json.get('entitySrcDownloadTime', 0),
                'pages': [],
                'content': content,
                'data_extends': str_obj(json.dumps(resp_json['entity'])),
                'parse_extends': str_obj(
                    json.dumps({"parser_id": target_extractor_id})),
                'page_size': len(content),
                'content_type': 'text/html',
                'src_type': 'webpage',
                'http_code': response.status_code,
                # total_seconds() covers the whole duration; the
                # ``microseconds`` attribute is only the sub-second component
                # and silently dropped every full second of a slow request.
                'elapsed': int(response.elapsed.total_seconds() * 1000),
                'status': CrawlStatus.CRAWL_SUCCESS,
            }
            download_rsp = DownLoadRsp(**resp)
            self.log.info('发送到解析器的 name = {name} url = {url}'.format(
                name=name, url=resp['url']))

            # Report the crawl status to the business-registry scheduler.
            company_name = resp_json['entity'].get('company')
            self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
                'company': company_name,
                'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
                'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
                'query': resp_json['crawlSeed'],
            }))
            # The entity may lack 'company'; logging must not turn an already
            # delivered result into a failure response.
            company_for_log = (company_name.encode('utf-8')
                               if company_name is not None else None)
            self.log.info('发送企业名称到工商调度消息队列: comapny = {company}'.format(
                company=company_for_log))
            return self.to_string(download_rsp)
        except Exception as err:
            self.log.error("process failed, err[%s]" % (repr(err)))
            self.log.exception(err)
            return self._failed_rsp()

    def do_output(self, body):
        """No output step is needed; always returns ``True``."""
        return True