def crawling_search(self):
    """Crawl the search-result listing and forward every article found.

    The listing is fetched with ``self.get_search_result()`` and parsed with
    ``self.analyze_search_result()``. For each parsed entry the article is
    loaded through ``self.auto_news_main_content()`` (passing the keyword
    from ``self.req_params``), stored in ``self.result['result']['item']``
    and handed to ``send``. The loop stops after 100 successful sends.

    ``RequestError`` / ``AnalyzeError`` while building the listing abort the
    run; a ``RequestError`` for a single article or any failure in ``send``
    only skips that entry. All of these are reported on stdout together with
    the caught exception instance and are never re-raised.
    """
    max_items = 100  # cap on successfully sent articles per run

    try:
        listing_text = self.get_search_result()
        entries = self.analyze_search_result(listing_text)
    except (RequestError, AnalyzeError) as exc:
        # Print the caught instance (not the class) so the cause is visible.
        print('list', exc)
        return

    sent = 0
    for entry in entries:
        try:
            article = self.auto_news_main_content(
                entry['url'],
                keyword=self.req_params['keyword'],
            )
        except RequestError as exc:
            print('cnt', exc)
            continue

        self.result['result']['item'] = article
        try:
            send(self.result)
        except Exception as exc:
            # Best-effort delivery: report the reason and move on.
            print('send errors', exc)
            continue

        # Short pause after each successful send (presumably throttling).
        time.sleep(0.1)
        sent += 1
        if sent >= max_items:
            break
def crawling_news(self):
    """Crawl the latest-news listing and forward every article found.

    The listing is fetched with ``self.get_news_last_list()`` and parsed with
    ``self.analyze_news_last_list()``. For each parsed entry the article is
    loaded through ``self.auto_news_main_content()``, stored in
    ``self.result['result']['item']`` and handed to ``send``. The loop stops
    after 20 successful sends.

    ``RequestError`` / ``AnalyzeError`` while building the listing abort the
    run; a ``RequestError`` for a single article or any failure in ``send``
    only skips that entry. All of these are reported on stdout together with
    the caught exception instance and are never re-raised.
    """
    max_items = 20  # cap on successfully sent articles per run

    try:
        listing_text = self.get_news_last_list()
        entries = self.analyze_news_last_list(listing_text)
    except (RequestError, AnalyzeError) as exc:
        # Print the caught instance (not the class) so the cause is visible.
        print('list', exc)
        return

    sent = 0
    for entry in entries:
        try:
            article = self.auto_news_main_content(entry['url'])
        except RequestError as exc:
            print('cnt', exc)
            continue

        self.result['result']['item'] = article
        try:
            send(self.result)
        except Exception as exc:
            # Best-effort delivery: report the reason and move on.
            print('send errors', exc)
            continue

        # Short pause after each successful send (presumably throttling).
        time.sleep(0.1)
        sent += 1
        if sent >= max_items:
            break
def execute_by_message(message):
    """Run the task described by ``message`` and send back its result.

    The callable and its parameters come from
    ``method_and_params_by_message``. The ``proxy`` parameter is swapped for
    the value returned by ``proxy_by_message`` while the task runs, then the
    original value is passed to ``release_proxy`` and written back into the
    parameters. ``'params'`` is removed from the message before it is passed
    on as ``task_msg``.

    Any ``Exception`` raised by the task is logged and converted into an
    error result (code 1899999) instead of propagating. The outcome is
    always sent as a ``'method_result'`` message.
    """
    msg = message
    method, params = method_and_params_by_message(msg)

    # Keep the proxy value supplied with the task so it can be released and
    # restored once the call has finished.
    raw_proxy = params.get('proxy')
    params['proxy'] = proxy_by_message(msg)
    msg.pop('params', None)

    call_kwargs = {'task_msg': msg}
    call_kwargs.update(params)

    try:
        result = method(**call_kwargs)
    except Exception as e:
        s = traceback.format_exc()
        log(f'execute task error: {msg}\n{e}\n{s}')
        result = {
            'code': 1899999,
            'data': {
                'error': str(e),
                'detail': s,
            },
            # Runtime text, roughly "unknown crawler error".
            'msg': '爬虫未知错误',
        }

    release_proxy(raw_proxy)
    params['proxy'] = raw_proxy

    send({
        'msg_type': 'method_result',
        'result': result,
        'task_msg': msg,
    })
def crawling_news(self):
    """Crawl four pages of the latest-news listing and forward the articles.

    Pages 1 to 4 are fetched with ``self.get_news_last_list(page)`` and
    parsed with ``self.analyze_news_last_list()``; the parsed entries are
    concatenated. For each entry the article is loaded through
    ``self.get_news_result_cnt()``, stored in
    ``self.result['result']['item']`` and handed to ``send``. The loop stops
    after 100 successful sends.

    A ``RequestError`` / ``AnalyzeError`` on any listing page aborts the
    whole run (entries from earlier pages are discarded); a ``RequestError``
    for a single article or any failure in ``send`` only skips that entry.
    All of these are reported on stdout together with the caught exception
    instance and are never re-raised.
    """
    max_items = 100  # cap on successfully sent articles per run

    entries = []
    try:
        for page in range(1, 5):
            listing_text = self.get_news_last_list(page)
            entries.extend(self.analyze_news_last_list(listing_text))
    except (RequestError, AnalyzeError) as exc:
        # Print the caught instance (not the class) so the cause is visible.
        print('list', exc)
        return

    sent = 0
    for entry in entries:
        try:
            article = self.get_news_result_cnt(entry['url'])
        except RequestError as exc:
            print('cnt', exc)
            continue

        self.result['result']['item'] = article
        try:
            send(self.result)
        except Exception as exc:
            # Best-effort delivery: report the reason and move on.
            print('send errors', exc)
            continue

        # Short pause after each successful send (presumably throttling).
        time.sleep(0.1)
        sent += 1
        if sent >= max_items:
            break
def pick(**kw):
    """Send five ``'scraped_data'`` messages for the given task.

    Each message carries ``result={'a': i}`` for ``i`` in 0..4 and echoes
    the ``task_msg`` keyword argument (``None`` when it was not supplied).
    """
    task_msg = kw.get('task_msg')
    for index in range(5):
        send({
            'msg_type': 'scraped_data',
            'result': {'a': index},
            'task_msg': task_msg,
        })
def async_request(channel, method, callback, **params):
    """Send a ``'request'`` message and register ``callback`` on ``channel``.

    The message contains the channel, the method name and all extra keyword
    arguments under ``'params'``. It is sent first; the subscription is made
    afterwards.
    """
    # NOTE(review): subscribing after sending could miss a very fast reply,
    # depending on how `subscribe` works - confirm before reordering.
    request = {
        'msg_type': 'request',
        'channel': channel,
        'method': method,
        'params': params,
    }
    send(request)
    subscribe(channel, callback)
def deal_message(msg):
    """Log and execute one incoming message, reporting failures via ``send``.

    Any ``Exception`` raised while logging or executing the message is
    logged with its traceback and reported as a ``'deal_msg_error'``
    message; it is not re-raised.
    """
    try:
        log(f'dealing msg {msg}')
        execute_by_message(msg)
    except Exception as e:
        s = traceback.format_exc()
        log(f'deal msg error: {msg}\n{e}\n{s}')
        send({
            'msg_type': 'deal_msg_error',
            'error_detail': s,
            'task_msg': msg,
        })
def execute_by_message(message):
    """Run the task described by ``message`` and send back its result.

    The callable and its parameters come from
    ``method_and_params_by_message``. The ``proxy`` parameter is swapped for
    the value returned by ``proxy_by_message`` while the task runs.

    The original proxy value is passed to ``release_proxy`` and written back
    into the parameters whether or not the task succeeds, so a failing task
    no longer skips the release. Exceptions raised by the task still
    propagate to the caller; on success a ``'method_result'`` message is
    sent.
    """
    msg = message
    method, params = method_and_params_by_message(msg)

    raw_proxy = params.get('proxy')
    params['proxy'] = proxy_by_message(msg)

    call_kwargs = {'task_msg': msg}
    call_kwargs.update(params)

    try:
        result = method(**call_kwargs)
    finally:
        # Runs on success and on failure, before the exception continues up.
        release_proxy(raw_proxy)
        params['proxy'] = raw_proxy

    send({
        'msg_type': 'method_result',
        'result': result,
        'task_msg': msg,
    })
def watch(task_manager=None):
    """Announce the worker, then poll ``send.url`` for messages forever.

    Every response is decoded as JSON. Responses whose ``code`` is 204 are
    ignored (presumably "no pending message" - confirm against the server);
    anything else is handled by ``deal_message``, either inline or through
    ``task_manager.submit`` when a task manager is given.

    Any ``Exception`` inside one polling iteration is logged with its
    traceback and followed by a 5-second pause. This function never returns.
    """
    log('worker start watch')
    send({
        'msg_type': 'worker_start',
        'time': time.time(),
    })

    while True:
        try:
            response = requests.get(send.url)
            msg = response.json()
            if msg.get('code') == 204:
                continue
            if task_manager is None:
                deal_message(msg)
            else:
                task_manager.submit(deal_message, msg)
        except Exception as e:
            s = traceback.format_exc()
            log(f'Unexpected error: {e}\n{s}')
            time.sleep(5)
def __watch(thread_pool_executor=None):
    """Announce the worker, then poll channel ``test_01`` forever.

    The polled URL is ``send.url`` with ``?channel=test_01`` appended. Every
    response is decoded as JSON. Responses whose ``code`` is 204 are ignored
    (presumably "no pending message" - confirm against the server); anything
    else is handled by ``deal_message``, either inline or through
    ``thread_pool_executor.submit`` when an executor is given.

    Any ``Exception`` inside one polling iteration is logged with its
    traceback and followed by a 5-second pause. This function never returns.
    """
    log('Worker start watch channel test_01')
    send({
        'msg_type': 'worker_start',
        'time': time.time(),
    })

    while True:
        try:
            response = requests.get(send.url + '?channel=test_01')
            msg = response.json()
            if msg.get('code') == 204:
                continue
            if thread_pool_executor is None:
                deal_message(msg)
            else:
                thread_pool_executor.submit(deal_message, msg)
        except Exception as e:
            s = traceback.format_exc()
            log(f'Unexpected error: {e}\n{s}')
            time.sleep(5)
def send_msg(msg):
    """Wait two seconds, then pass ``msg`` to ``send``."""
    delay_seconds = 2
    time.sleep(delay_seconds)
    send(msg)