def unmarshal(self, context, response):
    """Parse a Taobao detail-page JSONP response body into a dict.

    Raises InterruptException on a hard risk-control page, RetryException
    when the payload is missing the expected 'apiStack' section, and
    re-raises any JSON decoding failure after logging it.
    """
    result = response.text
    # Anti-crawler marker: the page asks the client to "retry later"
    # (Chinese banned-page text) -> hard risk-control interrupt.
    if result.find('请稍后重试') != -1:
        raise InterruptException(u'happen to risk,value:%s' % (result))
    # A valid detail payload always carries an 'apiStack' section; its
    # absence suggests a soft block that is worth retrying.
    if result.find('apiStack') == -1:
        raise RetryException(u'may happen to risk,value:%s' % (result))
    # Strip the JSONP callback wrapper.
    result = result.replace('mtopjsonp3(', '')
    # NOTE(review): replace(')', '') removes EVERY ')' in the payload, not
    # just the trailing callback paren — any value containing ')' would be
    # corrupted. Also ast.Str wraps the text in an AST string-literal node;
    # presumably self.parse_js expects and unwraps that node — confirm.
    result = ast.Str(result.replace(')', ''))
    result = self.parse_js(result)
    result = json.loads(result)
    try:
        # The nested apiStack value is itself a JSON string; decode it
        # in place so callers get a fully materialised dict.
        value_str = result['data']['apiStack'][0]['value']
        result['data']['apiStack'][0]['value'] = json.loads(value_str)
    except Exception as e:
        logger.error('taobao detail page unmarshal error,result:%s', result)
        raise e
    return result
def __get_list_api_param(self, html, soup, param_name):
    """Extract a single quoted parameter value from an inline <script> tag.

    Looks for the pattern ``param_name:'value'`` inside the first matching
    script element and returns ``value``. Logs and re-raises on any failure.
    """
    try:
        matcher = re.compile(param_name + ":'(.*?)'", re.MULTILINE | re.DOTALL)
        tag = soup.find("script", text=matcher)
        return matcher.search(tag.text).group(1)
    except Exception as exc:
        logger.error('list api params error,param_name%s,origin response:%s,exp:%s', param_name, html, exc)
        raise exc
def mongo_collection_scope(**kwargs):
    """Generator body for a Mongo collection scope.

    Yields the collection named by ``collection_name``; any failure inside
    the scope is logged with its traceback and re-raised unchanged.
    """
    name = kwargs.get('collection_name')
    try:
        yield db.get_collection(name=name)
    except BaseException:  # equivalent to a bare except: log, then propagate
        logger.error("failed to finish the mongo commit: %s", traceback.format_exc())
        raise
def unmarshal(self, context, response):
    """Extract the ``legalityToken`` value embedded in the response HTML.

    Returns the token string; wraps any lookup failure (no match, etc.)
    in CookieExpiredException after logging the raw response.
    """
    page = response.text
    try:
        return re.search("legalityToken=(.*?);", page).group(1)
    except Exception as exc:
        logger.error('legalityToken fetch error,origin response:%s,exp:%s', page, exc)
        raise CookieExpiredException(exc)
def _session_scope(engine, **kwargs):
    """Generator body for a transactional SQLAlchemy session scope.

    Yields a session bound to ``engine``; commits after the scope runs,
    rolls back (and re-raises) on any error, and always closes the session.
    """
    sess = _Session(bind=engine, **kwargs)
    try:
        yield sess
        sess.commit()
    except BaseException:  # equivalent to a bare except: log, rollback, propagate
        logger.error("failed to finish the commit: %s", traceback.format_exc())
        sess.rollback()
        raise
    finally:
        sess.close()
def get_sign_js():
    """Read the bundled ``sign.js`` source and return it as text.

    Returns None when the file does not exist; re-raises any read error
    after logging it.
    """
    data_file = Path(__file__).parent.joinpath('../../../js/' + 'sign.js')
    if not data_file.exists():
        return None
    try:
        text = data_file.read_text()
        # Fix: this success log sat AFTER the return in the original and
        # was unreachable dead code.
        logger.info('load sign.js success')
        return text
    except Exception as e:
        logger.error('load sign.js error,exp:%s', e)
        # bare raise preserves the original traceback
        raise
def on_error(self, context, exp):
    """Log a structured error for the current task and good on action failure."""
    task = context.get(Context.KEY_CURRENT_TASK, '')
    good = context.get(Context.KEY_GOOD_DICT, dict())
    if task:
        task_id, data = task.id, task.raw_data
    else:
        task_id, data = None, None
    logger.error(
        u'context key:[%s],action:[%s],task_id:[%s],good:[%s],execute error,data:%s,exception:%s',
        context.context_key, self.__class__.__name__, task_id, good, data, exp)
def on_error(self, context, exp):
    """Log a structured error including the raw hot-rank result on action failure."""
    task = context.get(Context.KEY_CURRENT_TASK, '')
    good = context.get(Context.KEY_GOOD_DICT, dict())
    hot_rank_result = context.get(
        Context.KEY_SYCM_PRODUCT_PROD_HOT_RANK_RESULT)
    if task:
        task_id, data = task.id, task.raw_data
    else:
        task_id, data = None, None
    logger.error(
        u'context key:[%s],action:[%s],task_id:[%s],good:[%s],execute error,data:%s,origin data:%s,exception:%s',
        context.context_key, self.__class__.__name__, task_id, good, data,
        hot_rank_result, exp)
def _relogin(self, account, cate_id, cate_name):
    """Attempt a SYCM re-login up to 3 times.

    On success: dumps the fresh cookies via the cookie service and returns.
    After the 3rd failure (either cookie-expired or a generic error) an
    ExitException is raised to abort the job.
    """
    for i in range(0, 3):
        try:
            cookies, origin_cookies = spider_qt5_bootstrap(url=SpiderUrls.get_sycm_login_url(), account=account)
            self._execute_legality_token_actions(cookies=cookies, account=account)
            self._cookie_service.dump(cookies, account)
            return
        except CookieExpiredException as e:
            logger.warning('relogin cookie is expired,cate_id:%s,cate_name:%s,account:%s', cate_id, cate_name, account['username'])
            # i == 2 is the last of the 3 attempts
            if i == 2:
                raise ExitException('exit when try login util max 3 times retry')
        except Exception as e:
            logger.error('retry error,retry times,%s,%s', i, e)
            if i == 2:
                raise ExitException('exit when try login util max 3 times retry')
            # NOTE(review): back-off sleep reconstructed as part of the
            # generic-error branch (source was collapsed) — confirm it is
            # not meant to run at loop level for the expired-cookie path too.
            time.sleep(5)
def execute(self, num):
    """Submit up to ``num`` detail-page tasks to the thread pool.

    Each submitted task gets its own proxy; completed futures return the
    proxy dict they used (or a falsy value), and any returned proxy is
    removed from the pool. Worker exceptions are logged, not propagated.
    """
    tasks = self.execut_taobao_detail_actions()
    future_tasks = {}
    submitted = 0
    for task in tasks:
        # Fix: the original fetched a proxy for EVERY task, even those past
        # the quota that were never submitted, leaking/consuming proxies.
        if submitted >= num:
            break
        proxy = self._proxy.get_proxy()
        future_tasks[self._executor.submit(
            self._execut_taobao_detail_actions, task, proxy)] = task
        submitted += 1
    for future in as_completed(future_tasks):
        try:
            proxy = future.result()
            if proxy:
                self._proxy.remove_proxy(url=proxy['https'])
        except Exception as e:
            logger.error(e)
def load(self, account, path=None, type_=PickleFileType.cookie):
    """Load a pickled artifact for *account* from disk.

    Depending on ``type_`` this returns session cookies (also cached in
    ``self.cookies_dict``), origin cookies, or the legality token.

    Returns None when the backing file does not exist. Read/unpickle
    failures re-raise for the two cookie types but fall back to '' for the
    legality token (preserving the original best-effort behaviour).
    """
    if not path:
        path = '../../data/'
    account = account['username']
    key = self.gen_by_account(account=account, type_=type_)
    # The three branches previously rebuilt this path independently;
    # they all used the same key, so compute it once.
    data_file = Path(__file__).parent.joinpath(path + key)
    if not data_file.exists():
        return None
    if type_ == PickleFileType.cookie:
        try:
            cookies = pickle.loads(data_file.read_bytes())
            logger.info('load cookies success,account:[%s]', account)
            # Keep the in-memory cache warm for subsequent loads.
            self.cookies_dict[key] = cookies
            return cookies
        except Exception as e:
            logger.error('load cookies error,account:[%s],exp:%s', account, e)
            raise e
    elif type_ == PickleFileType.origin_cookie:
        try:
            cookies = pickle.loads(data_file.read_bytes())
            logger.info('load origin cookies success,account:[%s]', account)
            return cookies
        except Exception as e:
            # (typo 'lorigin' in the original message fixed)
            logger.error('load origin cookies error,account:[%s],exp:%s', account, e)
            raise e
    else:
        try:
            legality_token = pickle.loads(data_file.read_bytes())
            logger.info('load legality_token success,account:[%s]', account)
            return legality_token
        except Exception as e:
            logger.error('load legality_token error,account:[%s],exp:%s', account, e)
            # Token load is best-effort: swallow the error, return empty.
            return ''
def execute(self, num):
    """Fan out up to ``num`` integrate-list tasks, skipping risk-flagged accounts.

    Workers return ``(account, flag, force)``: ``flag`` truthy means the
    account hit risk control; ``force`` truthy means record it as risked
    immediately, otherwise a failure counter decides between recording the
    risk (after >2 failures) and re-bootstrapping fresh cookies via Qt5.
    On clean results the failure counter is reset and every second success
    also refreshes cookies.

    NOTE(review): source was collapsed onto single lines — indentation of
    the nested branches below is reconstructed; verify against VCS history.
    """
    cycle_login_num = 0
    tasks = self.execute_taobao_integrate_list_actions()
    i = 0
    future_tasks = {}
    # Snapshot the set of usernames already flagged by risk control so
    # their tasks can be skipped below.
    with read_session_scope() as session:
        _stream_risk_dao = get_stream_risk_dao(session=session)
        rsts = _stream_risk_dao.base_query.all()
        risk_usernames = set(item.raw_data for item in rsts)
    s_accounts = global_config.s_accounts
    for task in tasks:
        # Round-robin account selection via a monotonically increasing counter.
        account = s_accounts[self._counter % len(s_accounts)]
        proxy = self._proxy_service.get_static_proxy(account['username'])
        self._counter += 1
        i += 1
        if account['username'] in risk_usernames:
            continue
        if i < num:
            future_tasks[self._executor.submit(
                self._execute_taobao_integrate_list_actions, task, account, proxy)] = task
    for future in as_completed(future_tasks):
        try:
            account, flag, force = future.result()
            if flag:
                if force:
                    # Hard risk signal: persist the risk record right away.
                    with write_session_scope() as session:
                        _stream_risk_dao = get_stream_risk_dao(
                            session=session)
                        self._risk(stream_risk_dao=_stream_risk_dao, account=account)
                    cycle_login_num += 1
                else:
                    self._fail_account_counter[account['username']] += 1
                    if self._fail_account_counter[account['username']] > 2:
                        # Repeated soft failures: drop cookies and record risk.
                        self._cookie_service.remove(account=account)
                        with write_session_scope() as session:
                            _stream_risk_dao = get_stream_risk_dao(
                                session=session)
                            self._risk(stream_risk_dao=_stream_risk_dao, account=account)
                        cycle_login_num += 1
                    else:
                        # Soft failure below threshold: warm the account up
                        # again with a real browser search and refresh both
                        # cookie jars.
                        url = 'https://s.m.taobao.com/h5?q=Flyco%2BFR5218&search=%E6%8F%90%E4%BA%A4&tab=all'
                        proxy = self._proxy_service.get_origin_static_proxy(
                            account['username'])
                        cookies = self._cookie_service.load(
                            account=account, type_=PickleFileType.origin_cookie)
                        time.sleep(5)
                        cookies, origin_cookies = spider_qt5_bootstrap(
                            url=url, account=account, risk=False, proxy=proxy, cookies=cookies)
                        self._cookie_service.dump(cookies=cookies, account=account)
                        self._cookie_service.dump(
                            cookies=origin_cookies, account=account, type_=PickleFileType.origin_cookie)
                        self._account_counter[account['username']] = 0
            else:
                # Clean result: reset failure streak; refresh cookies on
                # every second consecutive success.
                self._fail_account_counter[account['username']] = 0
                self._account_counter[account['username']] += 1
                if self._account_counter[account['username']] >= 2:
                    url = 'https://s.m.taobao.com/h5?q=Flyco%2BFR5218&search=%E6%8F%90%E4%BA%A4&tab=all'
                    proxy = self._proxy_service.get_origin_static_proxy(
                        account['username'])
                    cookies = self._cookie_service.load(
                        account=account, type_=PickleFileType.origin_cookie)
                    time.sleep(5)
                    cookies, origin_cookies = spider_qt5_bootstrap(
                        url=url, account=account, risk=False, proxy=proxy, cookies=cookies)
                    self._cookie_service.dump(cookies=cookies, account=account)
                    self._cookie_service.dump(
                        cookies=origin_cookies, account=account, type_=PickleFileType.origin_cookie)
                    self._account_counter[account['username']] = 0
        except Exception as e:
            logger.error(e)
def __execut_taobao_detail_actions(self, task, proxy=None):
    """Match a good against sale/integrate search hits and run detail actions.

    Walks the sale-info list (index ``i``) and the integrate-info list
    (index ``j``, starting at 1) in lockstep. For each candidate:

    * exact match (title contains the model name AND category check passes)
      -> run the plain detail-action pipeline with the candidate's price
      and mark the good as success, then stop;
    * otherwise, for the first few candidates (i < 5 / j < 6), fetch the
      Taobao detail page over HTTP; RetryException backs off 5s and moves
      on, InterruptException backs off 10s and retries the SAME indices
      (``is_need_retry``).

    If nothing matched, the good is flagged not_found and the detail
    actions still run so downstream bookkeeping happens.

    NOTE(review): source was collapsed onto single lines — nesting below is
    reconstructed; verify against VCS history.
    """
    raw_data = task.raw_data
    good_result = Good(raw_data['goodResult'])
    model_name = good_result.get_model_name()
    cate_id = good_result.get_category_id()
    integrate_infos = raw_data['integrateInfos']
    sale_infos = raw_data['saleInfos']
    i = 0
    j = 1
    length = min(len(integrate_infos), len(sale_infos))
    is_success = False
    context = Context()
    context.attach(Context.KEY_GOOD_DICT, good_result)
    context.attach(Context.KEY_CURRENT_TASK, task)
    context.attach(Context.KEY_CURRENT_PROXY, proxy)
    for x in range(0, length):
        is_need_retry = False
        # ---- sale-info candidate at index i ----
        if i < len(sale_infos):
            sale_info = sale_infos[i]
            sale_item_id = sale_info['itemId']
            sale_title = sale_info['title']
            sale_cate_id = sale_info['category']
            sale_price = sale_info['price']
            if str(sale_title).upper().find(str(model_name).upper()) != -1 and Category.check_cate_id(cate_id, sale_cate_id):
                # Direct match: take the listed price (skuId -1 marks the
                # sale-list source) and run the normal pipeline.
                actions = self.get_taobao_detail_actions()
                is_success = True
                price_info = [{
                    'skuId': '-1',
                    'price': yuan_2_cent(sale_price)
                }]
                good_result.set_price_info(price_info=price_info)
                good_result.set_flag(str(int(GoodDataType.success)))
                for action in actions:
                    action.execute(context=context)
                break
            elif i < 5:
                # No direct match: probe the detail page over HTTP with a
                # fresh context/cookie jar for this request.
                actions = self.get_taobao_http_detail_actions()
                timestamps = int(datetime.now().timestamp() * 1000)
                sale_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', sale_item_id)
                context = Context()
                context.attach(Context.KEY_GOOD_DICT, good_result)
                context.attach(Context.KEY_CURRENT_TASK, task)
                context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                context.attach(Context.KEY_CURRENT_PROXY, proxy)
                detail_m_url = SpiderUrls.get_detail_m_url(sale_info['userType'], sale_item_id)
                detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)
                sale_http_request = HttpRequest(url=sale_detail_url, method=HttpMethod.GET)
                context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, sale_http_request)
                context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                try:
                    for action in actions:
                        action.execute(context=context)
                    is_success = True
                    break
                except RetryException as e:
                    logger.error(e)
                    time.sleep(5)
                except InterruptException as e:
                    # Risk control hit: longer back-off and retry the same
                    # candidate pair on the next loop iteration.
                    logger.exception(e)
                    time.sleep(10)
                    is_need_retry = True
        # ---- integrate-info candidate at index j ----
        if j < len(integrate_infos):
            integrate_info = integrate_infos[j]
            integrate_item_id = integrate_info['itemId']
            integrate_title = integrate_info['title']
            integrate_cate_id = integrate_info['category']
            integrate_price = integrate_info['price']
            if str(integrate_title).upper().find(
                    str(model_name).upper()) != -1 and Category.check_cate_id(cate_id, integrate_cate_id):
                # Direct match from the integrate list (skuId -2 marks it).
                actions = self.get_taobao_detail_actions()
                is_success = True
                price_info = [{
                    'skuId': '-2',
                    'price': yuan_2_cent(integrate_price)
                }]
                good_result.set_price_info(price_info=price_info)
                good_result.set_flag(str(int(GoodDataType.success)))
                for action in actions:
                    action.execute(context=context)
                break
            elif j < 6:
                actions = self.get_taobao_http_detail_actions()
                timestamps = int(datetime.now().timestamp() * 1000)
                integrate_detail_url = SpiderUrls.get_taobao_detail_url(timestamps, '', integrate_item_id)
                context = Context()
                context.attach(Context.KEY_GOOD_DICT, good_result)
                context.attach(Context.KEY_CURRENT_TASK, task)
                context.attach(Context.KEY_COOKIES, RequestsCookieJar())
                context.attach(Context.KEY_IS_UPDATE_COOKIES, True)
                context.attach(Context.KEY_CURRENT_PROXY, proxy)
                detail_m_url = SpiderUrls.get_detail_m_url(integrate_info['userType'], integrate_item_id)
                detail_m_http_request = HttpRequest(detail_m_url, method=HttpMethod.GET)
                context.attach(Context.KEY_DETAIL_M_HTTP_REQUEST, detail_m_http_request)
                integrate_http_request = HttpRequest(url=integrate_detail_url, method=HttpMethod.GET)
                context.attach(Context.KEY_TAOBAO_DETAIL_HTTP_REQUEST, integrate_http_request)
                context.attach(Context.KEY_HEADERS, SpiderHttp.get_taobao_headers(detail_m_url))
                try:
                    for action in actions:
                        action.execute(context=context)
                    is_success = True
                    break
                except RetryException as e:
                    logger.exception(e)
                    time.sleep(5)
                except InterruptException as e:
                    logger.exception(e)
                    time.sleep(10)
                    is_need_retry = True
        # Only advance to the next candidate pair when we were not asked
        # to retry the current one.
        if not is_need_retry:
            i += 1
            j += 1
    if not is_success:
        # Nothing matched: still run the detail pipeline so the good is
        # persisted with a not_found flag.
        actions = self.get_taobao_detail_actions()
        good_result.set_flag(str(int(GoodDataType.not_found)))
        for action in actions:
            action.execute(context=context)
        # NOTE(review): continuation of an `execute` method whose `def` line
        # is outside this view — indentation reconstructed; confirm against
        # the full file.
        if i < num:
            future_tasks[self._executor.submit(
                self._execut_taobao_detail_actions, task, proxy)] = task
            i += 1
        for future in as_completed(future_tasks):
            try:
                # Workers return the proxy they used (or falsy); burn it
                # from the pool once the task finishes.
                proxy = future.result()
                if proxy:
                    self._proxy.remove_proxy(url=proxy['https'])
            except Exception as e:
                logger.error(e)

    def init(self):
        super().init()

    def init_argparse(self, parser):
        super().init_argparse(parser)

    def process(self):
        # One scheduling tick: submit up to 5 tasks, then pause briefly.
        self.execute(5)
        time.sleep(3)


if __name__ == "__main__":
    s = TaobaoDetailPageJob(40)
    logger.info("start to execute taobao_detail_page job")
    s.run()
    logger.error("exit taobao_detail_page job")
# -*- coding: utf-8 -*-
from apscheduler.schedulers.blocking import BlockingScheduler

from config.config_loader import logger
from mall_spider.spiders.actions.action_service import ActionService


class SycmScheduleJob(ActionService, BlockingScheduler):
    """Blocking scheduler that triggers the SYCM category-init actions daily."""

    def __init__(self):
        super().__init__()

    def handle(self):
        """Register the cron trigger: every day of the week at 10:30:00."""
        self.add_job(self.execute_sycm_category_job_init_actions, 'cron',
                     day_of_week='0-6', hour=10, minute=30, second=0)

    def run(self):
        """Register the jobs, then block on the scheduler loop."""
        self.handle()
        self.start()


if __name__ == "__main__":
    job = SycmScheduleJob()
    logger.info("start to execute sycm_schedule job")
    job.run()
    logger.error("exit sycm_schedule job")
def execute_in_retry(self, context, http_request, data=None):
    """Execute an HTTP request with bounded retries.

    GET/POST according to ``http_request.method``; returns the response on
    a 200, raises StatusCodeException otherwise. Proxy failures are wrapped
    in ProxyException immediately; timeouts and generic errors are retried
    until either the retry budget or the 25s wall-clock budget is exhausted,
    then re-raised. When KEY_IS_UPDATE_COOKIES is set, response cookies are
    merged back into the context jar after every attempt.

    NOTE(review): source was collapsed — placement of the final sleep
    (after the try/finally, at loop level) is reconstructed; confirm.
    """
    method = http_request.method
    is_update_cookies = context.get(Context.KEY_IS_UPDATE_COOKIES, False)
    headers = context.get(Context.KEY_HEADERS, '')
    cookies = context.get(Context.KEY_COOKIES, RequestsCookieJar())
    start_time = time.time()
    retry = int(default_retry)
    retry_interval = float(default_retry_interval)
    # Overall wall-clock budget for all attempts combined.
    timeout = 25.0
    connect_time_out = int(default_connect_timeout)
    proxies = context.get(Context.KEY_CURRENT_PROXY, '')
    account = context.get(Context.KEY_CURRENT_TASK_ACCOUNT, {})
    while retry > 0:
        retry = retry - 1
        response = None
        try:
            if proxies:
                logger.info('context key:[%s],proxy inject,[%s]->[%s]', context.context_key, account, proxies)
            if method == HttpMethod.GET:
                response = get(url=http_request.url, params=None, headers=headers, cookies=cookies, proxies=proxies)
            elif method == HttpMethod.POST:
                response = post(url=http_request.url, data=data, headers=headers, cookies=cookies, proxies=proxies,
                                connect_timeout=connect_time_out, timeout=timeout)
            logger.debug(u'context key:[%s],action:[%s] execute result:%s', context.context_key,
                         self.__class__.__name__, response.text)
            if response.status_code != 200:
                raise StatusCodeException(response.status_code)
            return response
        except ProxyError as e:
            # Proxy problems are not retried here — surface them so the
            # caller can rotate the proxy.
            logger.error('proxy error,[%s]->[%s],exp:%s', account, proxies, e)
            raise ProxyException(e)
        except ConnectTimeoutError as e:
            logger.error('proxy error,[%s]->[%s],exp:%s', account, proxies, e)
            raise ProxyException(e)
        except ReadTimeout as e:
            import sys
            exc_info = sys.exc_info()
            # Give up once the wall-clock or retry budget is spent.
            if time.time() - start_time > timeout or retry == 0:
                raise e
            logger.error(
                u'context key:[%s],action:[%s] execute read time out,exception:%s',
                context.context_key, self.__class__.__name__, traceback.format_exc())
        except ConnectTimeout as e:
            import sys
            exc_info = sys.exc_info()
            if time.time() - start_time > timeout or retry == 0:
                raise e
            logger.error(
                u'context key:[%s],action:[%s] execute connect time out,exception:%s',
                context.context_key, self.__class__.__name__, traceback.format_exc())
        except Exception as e:
            import sys
            exc_info = sys.exc_info()
            if time.time() - start_time > timeout or retry == 0:
                raise e
            logger.error(
                u'context key:[%s],action:[%s] execute error,exception:%s',
                context.context_key, self.__class__.__name__, traceback.format_exc())
        finally:
            # Merge any cookies the server set, even on failed attempts.
            if is_update_cookies and response:
                cookies.update(response.cookies)
        time.sleep(retry_interval)
    # NOTE(review): these defs belong to a class whose header is outside
    # this view — indentation reconstructed.
    def _risk(self, stream_risk_dao, account):
        """Record the account as risk-flagged for taobao search (idempotent)."""
        entity = stream_risk_dao.query_one(_filter=[
            CmmSysStreamRisk.type == int(RiskType.taobao_search),
            CmmSysStreamRisk.raw_data == account['username']
        ])
        if not entity:
            # Only insert when no matching risk record exists yet.
            entity = CmmSysStreamRisk()
            entity.raw_data = account['username']
            entity.type = int(RiskType.taobao_search)
            stream_risk_dao.insert_entity(entity=entity)

    def init(self):
        super().init()

    def init_argparse(self, parser):
        super().init_argparse(parser)

    def process(self):
        # One scheduling tick: submit up to 2 tasks, then pause.
        self.execute(2)
        time.sleep(10)


if __name__ == "__main__":
    s = TaobaoListPageJob(10)
    logger.info("start to execute taobao_list_page job")
    s.run()
    logger.error("exit taobao_list_page job")
        # NOTE(review): tail of a constructor whose `def` line is outside
        # this view — indentation reconstructed.
        self._proxy_service = get_proxy_service()

    def execute(self):
        """Re-login every risk-flagged account and clear its risk record."""
        with write_session_scope() as session:
            _stream_risk_dao = get_stream_risk_dao(session=session)
            rsts = _stream_risk_dao.base_query.limit(self.account_num).all()
            if rsts:
                for item in rsts:
                    username = item.raw_data
                    account = global_config.s_accounts_dict[username]
                    proxy = self._proxy_service.get_origin_static_proxy(account['username'])
                    # Forced, risk-mode login through the account's own proxy.
                    self._login(account=account, force=True, risk=True, proxy=proxy)
                    _stream_risk_dao.delete(_filter=[CmmSysStreamRisk.id == item.id])
                    # NOTE(review): commit reconstructed as per-item —
                    # confirm it was not a single commit after the loop.
                    session.commit()

    def init(self):
        super().init()

    def init_argparse(self, parser):
        super().init_argparse(parser)

    def process(self):
        self.execute()


if __name__ == "__main__":
    s = LoginJob(1)
    logger.info("start to execute login job")
    s.process()
    logger.error("exit login job")
        # NOTE(review): interior of an `execute(self, num)` method whose
        # `def` line is outside this view — indentation reconstructed.
        tasks = self.execute_sycm_product_actions()
        logger.info("start to execute sycm tasks,tasks length:%s", len(tasks))
        i = 0
        for task in tasks:
            if i < num:
                # Tasks run sequentially (the executor submit is disabled),
                # throttled by a fixed delay between tasks.
                self._execute_sycm_product_actions(task)
                i += 1
                time.sleep(15)

    def init(self):
        super().init()

    def init_argparse(self, parser):
        super().init_argparse(parser)

    def process(self):
        # One scheduling tick: run up to 10 tasks, then pause.
        self.execute(10)
        time.sleep(10)


if __name__ == "__main__":
    s = SycmJob(1)
    logger.info("start to execute sycm job")
    s.run()
    logger.error("exit sycm job")