def release_proxy(group, task):
    """Handle proxy release"""
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    logger.info("release %s" % task_dct)
    if task_dct['proxy']:
        redis_execute(redis_client.sadd)(KEY_NAME, task_dct['proxy'])
def remove_proxy(group, task):
    """Handle proxy remove"""
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    logger.info("remove %s" % task_dct)
    if task_dct['proxy']:
        ret = redis_execute(redis_client.srem)(KEY_NAME, task_dct['proxy'])
        if ret:
            tp.set_to('output')
            return tp
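# `redis_execute` is defined elsewhere in this project; the handlers above only rely
# on it returning a callable that forwards to the wrapped redis command. The sketch
# below is a hypothetical stand-in (name, retry count, and backoff are assumptions,
# not the project's actual implementation) showing one plausible shape: retry the
# command a few times on transient connection errors.
import functools
import time

import redis


def _retrying_redis_call(redis_cmd, retries=3, delay=1):
    """Illustrative wrapper in the spirit of redis_execute (hypothetical)."""
    @functools.wraps(redis_cmd)
    def wrapper(*args, **kwargs):
        for attempt in range(retries):
            try:
                return redis_cmd(*args, **kwargs)
            except redis.ConnectionError:
                if attempt == retries - 1:
                    raise
                time.sleep(delay)
    return wrapper

# usage mirroring release_proxy:
#     _retrying_redis_call(redis_client.sadd)(KEY_NAME, '127.0.0.1:8000')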
async def handle_worker(group, task):
    """Handle statistic task"""
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    if 'extra' in task_dct and 'stats' in task_dct['extra']:
        tid = task_dct['extra']['stats'].get('tid')
        step = task_dct['extra']['stats'].get('step')
        if (tid, step) in TID_MAP:
            stats_name = TID_MAP[(tid, step)]
            data_dct.setdefault(stats_name, 0)
            data_dct[stats_name] += 1
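# TID_MAP and data_dct are module-level globals defined elsewhere. From the lookup
# above, TID_MAP maps a (tid, step) pair to the name of a counter, and data_dct
# accumulates per-counter totals. A hypothetical shape (keys and counter names are
# illustrative assumptions):
#
#     TID_MAP = {
#         ("amz_bsr", 0): "bsr_pages_fetched",
#         ("amz_keyword", 1): "keyword_pages_fetched",
#     }
#     data_dct = {}    # e.g. {"bsr_pages_fetched": 42} after 42 matching tasks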
async def handle_worker(group, task):
    """Handle amz_bsr_result task

    [input] task data format:
        JSON:
        {
            # product info
            "extra": {
                "bsr": {
                    "bs_cate": [item["cate"]],
                    "date": "xxxx-xx-xx"
                }
            }
        }
    [output] result data format:
        JSON:
        {
            # product info
            +"bs_cate": ["cate"],
            +"date": "2017-09-10",
            -"extra",
        }
    """
    tp = TaskProtocal(task)
    info = tp.get_data()
    popt_dct = popt_map.get(info['platform'], {})
    cat_name = info['detail_info']['cat_1_name'].strip().lower() if info['detail_info']['cat_1_name'] else ''
    cat_rank = info['detail_info']['cat_1_rank'] if info['detail_info']['cat_1_rank'] is not None else -1
    info['detail_info']['cat_1_sales'] = -1
    if cat_name and cat_rank != -1 and popt_dct:
        info['detail_info']['cat_1_sales'] = CURVE_FUNC(cat_rank, *popt_dct.get(cat_name, popt_dct['default']))
    if info.get('extra') and info['extra'].get('bsr'):
        info['bs_cate'] = info['extra']['bsr']['bs_cate']
        info['date'] = info['extra']['bsr']['date']
        del info['extra']
    else:
        cate = ''
        if info['detail_info']['cat_ls']:
            cate = ':'.join(info['detail_info']['cat_ls'][0]['name_ls'])
        info['bs_cate'] = [cate]
        info['date'] = time.strftime("%Y-%m-%d", time.localtime())
    res = pipeflow.Task(json.dumps(info).encode('utf-8'))
    res.set_to('output')
    return res
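# CURVE_FUNC and popt_map are imported from elsewhere; this handler only calls
# CURVE_FUNC(rank, *params) with per-category fitted parameters. The sketch below
# is a self-contained illustration of how such a rank-to-sales curve could be
# produced, assuming a power-law shape fitted with scipy (the curve form and the
# sample points are assumptions, not the project's actual fit).
import numpy as np
from scipy.optimize import curve_fit


def _example_curve(rank, a, b):
    # hypothetical power law: estimated sales ~ a * rank ** (-b)
    return a * np.power(rank, -b)


def _example_fit_popt_map():
    ranks = np.array([1.0, 10.0, 100.0, 1000.0])
    sales = np.array([3000.0, 700.0, 90.0, 12.0])    # made-up (rank, sales) observations
    popt, _ = curve_fit(_example_curve, ranks, sales, p0=(3000.0, 0.8))
    # same layout as popt_map: {platform: {category_name: params, 'default': params}}
    return {'amazon_us': {'default': tuple(popt)}}

# usage mirroring the handler:
#     popt_dct = _example_fit_popt_map()['amazon_us']
#     _example_curve(5, *popt_dct['default'])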
async def handle_worker(group, task):
    """Handle callback task"""
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    if 'extra' in task_dct and 'cb' in task_dct['extra']:
        url = task_dct['extra']['cb'].get('url')
        async with aiohttp.ClientSession(conn_timeout=7) as session:
            try:
                async with session.post(url, timeout=TIME_OUT,
                                        data=zlib.compress(json.dumps(task_dct).encode('utf-8'))) as resp:
                    html = await resp.read()
                    if resp.status != 200:
                        logger.error('[%d] %s' % (resp.status, url))
            except Exception as exc:
                logger.error('Request page fail : %s' % exc)
async def handle_worker(group, task):
    """Dispatch a task to the next step of its flow and fork any configured sub-flows."""
    tp = TaskProtocal(task)
    f = tp.get_from()
    tid = tp.get_tid()
    step = tp.get_step()
    logger.info("ep: %s, tid: %s, step: %s" % (f, tid, step))
    if tid not in flow_conf[FLOW_TASK_CONF]:
        logger.error("Task ID [%s] error" % tid)
        return
    task_ls = []
    task_data = tp.get_data()
    if step + 1 < len(flow_conf[FLOW_TASK_CONF][tid]):
        endpoint_name = flow_conf[FLOW_TASK_CONF][tid][step + 1]['name']
        next_tp = tp.new_task(task_data, next_step=True)
        next_tp.set_to(endpoint_name)
        task_ls.append(next_tp)
    for f_tid in flow_conf[FLOW_TASK_CONF][tid][step].get('fork', []):
        endpoint_name = flow_conf[FLOW_TASK_CONF][f_tid][0]['name']
        fork_tp = tp.new_task(task_data, tid=f_tid)
        fork_tp.set_to(endpoint_name)
        task_ls.append(fork_tp)
    return task_ls
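# flow_conf[FLOW_TASK_CONF] is loaded elsewhere; the dispatcher above only needs it
# to map a task id to an ordered list of steps, each naming an output endpoint and
# optionally forking other flows. A hypothetical configuration (task ids and
# endpoint names are illustrative assumptions):
#
#     flow_conf = {
#         FLOW_TASK_CONF: {
#             "amz_bsr": [
#                 {"name": "amz_bsr_product"},                       # step 0
#                 {"name": "amz_bsr_result", "fork": ["amz_stats"]}  # step 1
#             ],
#             "amz_stats": [
#                 {"name": "statistic"}
#             ],
#         }
#     }
#
# With this config, a task at ("amz_bsr", step 0) is routed to the "amz_bsr_result"
# endpoint as step 1; a task arriving at step 1 has no further step and only forks a
# copy into the "amz_stats" flow at its first endpoint, "statistic".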
async def handle_task(group, task):
    """Handle amz_bsr_product task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "root_url": "https://www.amazon.de/gp/bestsellers",
            "category_filter": ["name1", ... , "namex"],
            "with_qty": True    # optional
        }
    [notify] task data format:
        BYTES:
        b"task done"
    """
    global filter_ls
    global task_start
    global task_count
    from_name = task.get_from()
    if from_name == 'input':
        tp = TaskProtocal(task)
        task_dct = tp.get_data()
        if task_dct['platform'] not in PLATFORM_FILTER_LS:
            return
        if task_start:
            tp.set_to('input_back')
            return tp
        else:
            group.suspend_endpoint('input')
            task_start = True
        logger.info(task_dct['root_url'])
        filter_ls = [cate.lower() for cate in task_dct['category_filter']]
        task_dct['url'] = task_dct['root_url']
        task_dct['date'] = time.strftime("%Y-%m-%d", time.localtime())
        del task_dct['root_url']
        del task_dct['category_filter']
        new_tp = tp.new_task(task_dct)
        new_tp.set_to('inner_output')
        task_count = 1
        return new_tp
    if from_name == 'notify' and task_start:
        if task.get_data() == b'task done':
            filter_ls = []
            task_start = False
            group.resume_endpoint('input')
async def handle_task(group, task):
    """Handle amz_keyword task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "keyword": "xx xxx",
            "end_page": 10,
        }
    [notify] task data format:
        BYTES:
        b"task done"
    """
    global task_count
    from_name = task.get_from()
    if from_name == 'input':
        tp = TaskProtocal(task)
        if task_count >= MAX_WORKERS:
            tp.set_to('input_back')
            return tp
        else:
            task_count += 1
            if task_count >= MAX_WORKERS:
                group.suspend_endpoint('input')
            task_dct = tp.get_data()
            logger.info("%s %s" % (task_dct['platform'], task_dct['keyword']))
            task_dct.setdefault('end_page', 20)
            task_dct['page'] = 1
            task_dct['url'] = get_search_index_url(task_dct['platform'], task_dct['keyword'])
            new_tp = tp.new_task(task_dct)
            new_tp.set_to('inner_output')
            return new_tp
    if from_name == 'notify' and task_count:
        if task.get_data() == b'task done':
            task_count -= 1
            if task_count + 1 == MAX_WORKERS:
                group.resume_endpoint('input')
async def handle_task(group, task):
    """Handle amz_qa task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "till": "qa id",
        }
    [notify] task data format:
        BYTES:
        b"task done"
    """
    global task_count
    from_name = task.get_from()
    if from_name == 'input':
        tp = TaskProtocal(task)
        if task_count >= MAX_WORKERS:
            tp.set_to('input_back')
            return tp
        else:
            task_count += 1
            if task_count >= MAX_WORKERS:
                group.suspend_endpoint('input')
            task_dct = tp.get_data()
            logger.info("%s %s %s" % (task_dct['platform'], task_dct['asin'], task_dct.get('till', '')))
            task_dct["page"] = 1
            new_tp = tp.new_task(task_dct)
            new_tp.set_to('inner_output')
            return new_tp
    if from_name == 'notify' and task_count:
        if task.get_data() == b'task done':
            task_count -= 1
            if task_count + 1 == MAX_WORKERS:
                group.resume_endpoint('input')
async def handle_worker(group, task):
    """Handle amz_qa task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "till": "qa id",
            "page": 1,
        }
    [output] result data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "page": 1,
            "end": true,
            "qas": [
                {
                    'qa_id': 'xdf',
                    'vote': 5,
                    'question': 'qqq',
                    'answer': 'aaa',
                    'author': 'author',
                    'date': '2017-09-09',
                }
            ]
        }
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    handle_cls = get_spider_by_platform(task_dct['platform'])
    notify_task = pipeflow.Task(b'task done')
    notify_task.set_to('notify')
    url = get_url_by_platform(task_dct['platform'], task_dct['asin'], task_dct['page'])
    current_page = task_dct['page']
    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', url, timeout=60, captcha_bypass=True)
            soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
            handle = handle_cls(soup)
        except BannedError as exc:
            tp.set_to('input_back')
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except RequestError:
            tp.set_to('inner_output')
            return tp
        except CaptchaError:
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            task_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
            exc.__traceback__ = None
            return notify_task

    is_qa_page = handle.is_qa_page()
    # abandon result
    if not is_qa_page:
        return notify_task
    try:
        next_page, qa_ls = handle.get_info()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], url])
        logger.error('Get page info error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return notify_task

    qa_id_ls = [item['qa_id'] for item in qa_ls]
    if 'till' in task_dct and task_dct['till'] in qa_id_ls:
        next_page = None
        i = qa_id_ls.index(task_dct['till'])
        qa_ls = qa_ls[:i]

    task_ls = []
    if next_page:
        task_dct['page'] = next_page
        new_tp = tp.new_task(task_dct)
        new_tp.set_to('inner_output')
        task_ls.append(new_tp)
    else:
        task_ls.append(notify_task)
    if qa_ls:
        info = {
            'platform': task_dct['platform'],
            'asin': task_dct['asin'],
            'page': current_page,
            'qas': qa_ls
        }
        if not next_page:
            info['end'] = True
        new_tp = tp.new_task(info)
        new_tp.set_to('output')
        task_ls.append(new_tp)
    return task_ls
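# The "till" field makes the QA crawl incremental: assuming the page lists entries
# newest-first (which is what the early-stop logic above implies), everything from
# the previously stored id onward has already been collected. Illustration with
# made-up ids:
#
#     qa_id_ls = ['q9', 'q8', 'q7']   # ids on the current page
#     till = 'q8'                     # newest id stored by a previous run
#     i = qa_id_ls.index(till)        # -> 1
#     qa_ls = qa_ls[:i]               # keep only the entry for 'q9'; next_page is cleared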
async def handle_worker(group, task):
    """Handle amz_bsr_product task

    [inner_input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "url": "https://www.amazon.de/gp/bestsellers",
            "date": "2017-08-08",
            "with_qty": True    # optional
        }
    [output] result data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "with_qty": True,   # optional
            "extra": {
                "bsr": {
                    "bs_cate": [item["cate"]],
                    "date": "xxxx-xx-xx"
                }
            }
        }
    """
    global filter_ls
    global task_count
    global category_id_set
    global asin_set
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    try:
        task_count -= 1
        handle_cls = get_spider_by_platform(task_dct['platform'])
        url = task_dct['url']
        logger.info("%s" % (url,))
        with GetPageSession() as sess:
            try:
                html = await sess.get_page('get', url, timeout=60, captcha_bypass=True)
                soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
                handle = handle_cls(soup)
            except BannedError as exc:
                tp.set_to('input_back')
                ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
                ban_tp.set_to('ban')
                return [ban_tp, tp]
            except RequestError:
                tp.set_to('inner_output')
                task_count += 1
                return tp
            except CaptchaError:
                tp.set_to('inner_output')
                task_count += 1
                return tp
            except Exception as exc:
                exc_info = (type(exc), exc, exc.__traceback__)
                task_info = ' '.join([task_dct['platform'], url])
                logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
                exc.__traceback__ = None
                return

        is_bsr_page = handle.is_bsr_page()
        # abandon result
        if not is_bsr_page:
            return
        is_exist = False
        reg = re.search(r'/(\d+)/ref=', url)
        if reg:
            cate_id = int(reg.group(1))
            if cate_id not in category_id_set:
                category_id_set.add(cate_id)
            else:
                is_exist = True
        try:
            url_ls, asin_ls = handle.get_info(filter_ls, is_exist)
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            task_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page info error\n' + task_info, exc_info=exc_info)
            exc.__traceback__ = None
            return
        asin_ls = [item for item in asin_ls if item['asin'] not in asin_set]
        asin_set.update([item['asin'] for item in asin_ls])
        task_ls = []
        for url in url_ls:
            new_tp = tp.new_task({
                'platform': task_dct['platform'],
                'url': url,
                'date': task_dct['date'],
                'with_qty': task_dct.get('with_qty', False)
            })
            new_tp.set_to('inner_output')
            task_ls.append(new_tp)
        task_count += len(url_ls)
        for item in asin_ls:
            new_tp = tp.new_task({
                'platform': task_dct['platform'],
                'asin': item['asin'],
                'with_qty': task_dct.get('with_qty', False),
                'extra': {
                    'bsr': {
                        'bs_cate': [item['cate']],
                        'date': task_dct['date']
                    }
                }
            })
            new_tp.set_to('output')
            task_ls.append(new_tp)
        return task_ls
    finally:
        # when this is the last running worker and no tasks remain queued, reset the
        # dedup sets and emit the crawl-finished notification; note that a return
        # here supersedes whatever the try block returned
        if group.get_running_cnt() == 1 and task_count == 0:
            category_id_set = set([])
            asin_set = set([])
            new_task = pipeflow.Task(b'task done')
            new_task.set_to('notify')
            return new_task
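# The regex dedup above keys on the numeric node id embedded in bestseller URLs, so
# each category page is crawled only once per run. A quick illustration (the URL is
# a made-up example of the expected shape):
#
#     import re
#     url = "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/166220011/ref=zg_bs_nav_1"
#     re.search(r'/(\d+)/ref=', url).group(1)
#     # -> '166220011'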
async def handle_worker(group, task):
    """Handle amz_review task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "till": "review id",
            "url": "xxxx",
        }
    [output] result data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "xxxx",
            "page": 1,
            "end": true,
            "reviews": [
                {
                    "review_id": "xdf",
                    "rating": 4.0,
                    "title": "title",
                    "content": "content",
                    "author": "author",
                    "author_id": "author_id",
                    "date": "2017-09-09",
                    "verified_purchase": False,
                    "imgs": [],
                }
            ]
        }
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    handle_cls = get_spider_by_platform(task_dct['platform'])
    notify_task = pipeflow.Task(b'task done')
    notify_task.set_to('notify')
    url = task_dct['url']
    if not url:
        url = get_url_by_platform(task_dct['platform'], task_dct['asin'])
    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', url, timeout=60, captcha_bypass=True)
            soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
            handle = handle_cls(soup)
        except BannedError as exc:
            tp.set_to('input_back')
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except RequestError:
            tp.set_to('inner_output')
            return tp
        except CaptchaError:
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            task_info = ' '.join([task_dct['platform'], url])
            logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
            exc.__traceback__ = None
            return notify_task

    is_review_page = handle.is_review_page()
    # abandon result
    if not is_review_page:
        return notify_task
    try:
        page_info, review_ls = handle.get_info()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], url])
        logger.error('Get page info error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return notify_task

    # just for the redirect response situation
    if page_info['cur_page_url']:
        pr = parse.urlparse(page_info['cur_page_url'])
        query_dct = dict(parse.parse_qsl(pr.query))
        if 'reviewerType' not in query_dct or 'pageSize' not in query_dct or 'sortBy' not in query_dct:
            new_url = get_url_by_platform(task_dct['platform'], task_dct['asin'], pr.path)
            task_dct['url'] = new_url
            new_tp = tp.new_task(task_dct)
            new_tp.set_to('inner_output')
            return new_tp

    if page_info['next_page_url']:
        page_info['next_page_url'] = formalize_url(task_dct['platform'], page_info['next_page_url'])
    review_id_ls = [item['review_id'] for item in review_ls]
    if 'till' in task_dct and task_dct['till'] in review_id_ls:
        page_info['next_page_url'] = None
        i = review_id_ls.index(task_dct['till'])
        review_ls = review_ls[:i]

    task_ls = []
    if page_info['next_page_url']:
        task_dct['url'] = page_info['next_page_url']
        new_tp = tp.new_task(task_dct)
        new_tp.set_to('inner_output')
        task_ls.append(new_tp)
    else:
        task_ls.append(notify_task)
    if review_ls:
        for item in review_ls:
            if not item['asin']:
                item['asin'] = task_dct['asin']
        info = {
            'platform': task_dct['platform'],
            'asin': task_dct['asin'],
            'page': page_info['cur_page'],
            'reviews': review_ls
        }
        if not page_info['next_page_url']:
            info['end'] = True
        new_tp = tp.new_task(info)
        new_tp.set_to('output')
        task_ls.append(new_tp)
    return task_ls
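# The redirect check above re-queues the task when Amazon served a trimmed review
# URL without the expected query string. A quick illustration of the parsing it
# relies on (the URL is a made-up example):
#
#     from urllib import parse
#     pr = parse.urlparse("https://www.amazon.com/product-reviews/B00TEST/ref=cm_cr_arp_d_paging_btm_2?pageNumber=2")
#     dict(parse.parse_qsl(pr.query))
#     # -> {'pageNumber': '2'}   (no reviewerType / pageSize / sortBy, so the task is re-issued)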
async def handle_worker(group, task):
    """Handle amz_keyword task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "keyword": "xx xxx",
            "end_page": 10,
            "page": 1,
            "url": "xxxx",
        }
    [output] result data format:
        JSON:
        {
            "platform": "amazon_us",
            "keyword": "xx xxx",
            "page": 1,
            "end": true,
            "status": 0,
            "products": [
                {'is_sponsored': 1, 'rank': 1, 'asin': 'B073S6F9JQ'}
            ],
            "count": 10,
            "category": ['xxx1', 'xx2'],
            "department": "xxxxx",
        }
    """
    tp = TaskProtocal(task)
    task_dct = tp.get_data()
    notify_task = pipeflow.Task(b'task done')
    notify_task.set_to('notify')
    if task_dct['page'] > task_dct['end_page']:
        return notify_task
    handle_cls = get_spider_by_platform(task_dct['platform'])
    with GetPageSession() as sess:
        try:
            html = await sess.get_page('get', task_dct['url'], timeout=60, captcha_bypass=True)
            soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
            handle = handle_cls(soup)
        except BannedError as exc:
            tp.set_to('input_back')
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except RequestError:
            tp.set_to('inner_output')
            return tp
        except CaptchaError:
            tp.set_to('inner_output')
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            task_info = ' '.join([task_dct['platform'], task_dct['url']])
            logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
            exc.__traceback__ = None
            tps = [notify_task]
            new_tp = tp.new_task({
                'platform': task_dct['platform'],
                'keyword': task_dct['keyword'],
                'page': task_dct['page'],
                'end': True,
                'status': 1,
                'message': 'Get Page handle error'
            })
            new_tp.set_to('output')
            tps.append(new_tp)
            return tps

    is_search_page = handle.is_search_page()
    if not is_search_page:
        return notify_task
    try:
        next_url = handle.get_next_url()
        asin_ls = handle.get_asins()
        result_dct = handle.get_search_result()
        department = handle.get_nav_search()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], task_dct['url']])
        logger.error('Get page info error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return notify_task

    if next_url is not None:
        next_url = formalize_url(task_dct['platform'], next_url)
    task_ls = []
    info = {
        'platform': task_dct['platform'],
        'keyword': task_dct['keyword'],
        'page': task_dct['page'],
        'products': asin_ls,
        'count': result_dct['count'],
        'category': result_dct['category'],
        'department': department,
    }
    next_page = task_dct['page'] + 1
    if next_url and next_page <= task_dct['end_page']:
        task_dct['page'] = next_page
        task_dct['url'] = next_url
        new_tp = tp.new_task(task_dct)
        new_tp.set_to('inner_output')
        task_ls.append(new_tp)
    else:
        info['end'] = True
        info['status'] = 0
        task_ls.append(notify_task)
    new_tp = tp.new_task(info)
    new_tp.set_to('output')
    task_ls.append(new_tp)
    return task_ls
async def handle_worker(group, task):
    """Handle amz_product task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "B02KDI8NID8",
            "with_qty": True,
        }
    [output] result data format:
        JSON:
        {
            'asin': 'B02KDI8NID8',
            'platform': 'amazon_us',
            'parent_asin': 'B02KDI8NID8',
            'title': 'Active Wow Teeth Whitening Charcoal Powder Natural',
            'brand': 'Active Wow',
            'price': 24.79,
            'discount': 0.83,
            'merchant_id': 'A3RJPJ9XCKYOM5',
            'merchant': 'MarketWeb',
            'description': [],
            'category': [],
            "product_info": {
                "product_dimensions": "2 x 2 x 2 inches ; 0.6 ounces",
                "shipping_weight": "3.2 ounces ()",
                "date_first_available": null
            },
            'detail_info': {
                'cat_1_rank': 5,
                'cat_1_name': 'Beauty & Personal Care',
                "cat_ls": [{"rank": 4, "name_ls": ["Health & Household", "Oral Care", "Teeth Whitening"]}],
            },
            'relative_info': {
                'bought_together': [],
                'also_bought': [],
                'also_viewed': [],
                'viewed_also_bought': [],
                'sponsored_1': [],
                'sponsored_2': [],
                'compare_to_similar': [],
            },
            'sku_info': [],
            'fba': 1,
            'review': 4.6,
            'review_count': 9812,
            "review_statistics": {
                "1": 6, "2": 2, "3": 3, "4": 9, "5": 80
            },
            'img': 'https://images-na.ssl-images-amazon.com/images/I/514RSPIJMKL.jpg',
            'imgs': [],
            'qty': 123,    # None when the quantity could not be fetched
        }
    """
    tp = TaskProtocal(task)
    from_end = tp.get_from()
    task_dct = tp.get_data()
    logger.info("%s %s %s" % (task_dct['platform'], task_dct['asin'], task_dct.get('with_qty', False)))
    handle_cls = get_spider_by_platform(task_dct['platform'])
    url = get_url_by_platform(task_dct['platform'], task_dct['asin'])
    qty_info = {}
    with GetPageSession() as sess:
        try:
            if not task_dct.get('with_qty'):
                html = await sess.get_page('get', url, timeout=60, captcha_bypass=True)
                soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
                handle = handle_cls(soup)
            else:
                html = await sess.get_page('get', url, timeout=60, captcha_bypass=True)
                soup = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
                handle = handle_cls(soup)
                offer_listing_id = handle.get_offer_listing_id()
                ue_id = handle.get_ue_id()
                session_id = handle.get_session_id()
                domain = get_domain_by_platform(task_dct['platform'])
                if offer_listing_id and ue_id and session_id:
                    # get ubid-main cookie
                    collect_coupon_url = COLLECT_COUPON.format(domain=domain)
                    data = {"pageReImpType": "aplImpressionPC"}
                    headers = {
                        'Referer': url,
                        "X-Requested-With": "XMLHttpRequest",
                        "Content-Type": "application/x-www-form-urlencoded"
                    }
                    cookies = {
                        'csm-hit': 's-{ue_id:s}|{time:d}'.format(ue_id=ue_id, time=int(time.time() * 1000))
                    }
                    await sess.get_page('post', collect_coupon_url, data=data, headers=headers,
                                        cookies=cookies, timeout=30)
                    # get qty: add an oversized quantity (999) to the cart and read back
                    # how many units were actually accepted
                    add_to_cart_url = ADD_TO_CART.format(domain=domain)
                    data = ADD_TO_CART_DATA.format(
                        asin=task_dct['asin'],
                        session_id=session_id,
                        offer_listing_id=offer_listing_id,
                        qty=999)
                    headers = {
                        'Referer': url,
                        "X-Requested-With": "XMLHttpRequest",
                        "Content-Type": "application/x-www-form-urlencoded"
                    }
                    cookies = {
                        'csm-hit': '{ue_id:s}+s-{ue_id:s}|{time:d}'.format(ue_id=ue_id, time=int(time.time() * 1000))
                    }
                    ret = await sess.get_page('post', add_to_cart_url, data=data, headers=headers,
                                              cookies=cookies, timeout=30)
                    qty_info = json.loads(ret.decode('utf-8'))
        except BannedError as exc:
            tp.set_to(from_end)
            ban_tp = tp.new_task({'proxy': exc.proxy[7:]})
            ban_tp.set_to('ban')
            return [ban_tp, tp]
        except RequestError:
            tp.set_to(from_end)
            return tp
        except CaptchaError:
            tp.set_to(from_end)
            return tp
        except Exception as exc:
            exc_info = (type(exc), exc, exc.__traceback__)
            task_info = ' '.join([task_dct['platform'], task_dct['asin']])
            logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
            exc.__traceback__ = None
            return

    is_product_page = handle.is_product_page()
    if not is_product_page:
        return
    try:
        info = handle.get_info()
        info['qty'] = int(qty_info['cartQuantity']) if qty_info.get('cartQuantity') else None
        # extra info
        info['asin'] = task_dct['asin']
        info['platform'] = task_dct['platform']
        new_tp = tp.new_task(info)
        new_tp.set_to('output')
        return new_tp
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], task_dct['asin']])
        logger.error('Get page info error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return
async def handle_worker(group, task):
    """Handle amz_product task

    [input] task data format:
        JSON:
        {
            "platform": "amazon_us",
            "asin": "B02KDI8NID8"
        }
    [output] result data format:
        JSON:
        {
            'asin': 'B02KDI8NID8',
            'platform': 'amazon_us',
            'title': 'Active Wow Teeth Whitening Charcoal Powder Natural',
            'brand': 'Active Wow',
            'price': 24.79,
            'discount': 0.83,
            'merchant_id': 'A3RJPJ9XCKYOM5',
            'merchant': 'MarketWeb',
            'detail_info': {
                'cat_1_rank': 5,
                'cat_1_name': 'Beauty & Personal Care'
            },
            'relative_info': {
                'bought_together': [],
                'also_bought': [],
            },
            'fba': 1,
            'review': 4.6,
            'review_count': 9812,
            'img': 'https://images-na.ssl-images-amazon.com/images/I/514RSPIJMKL.jpg'
        }
    """
    tp = TaskProtocal(task)
    from_end = tp.get_from()
    task_dct = tp.get_data()
    logger.info("%s %s" % (task_dct['platform'], task_dct['asin']))
    handle_cls = get_spider_by_platform(task_dct['platform'])
    url = get_url_by_platform(task_dct['platform'], task_dct['asin'])
    try:
        soup = await get_page(url, timeout=70)
        handle = handle_cls(soup)
    except RequestError:
        if from_end == 'routine_input':
            tp.set_to('routine_input_back')
        elif from_end == 'input':
            tp.set_to('input_back')
        return tp.to_task()
    except CaptchaError:
        if from_end == 'routine_input':
            tp.set_to('routine_input_back')
        elif from_end == 'input':
            tp.set_to('input_back')
        return tp.to_task()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], task_dct['asin']])
        logger.error('Get page handle error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return

    is_product_page = handle.is_product_page()
    if not is_product_page:
        return
    try:
        info = handle.get_info()
        # extra info
        info['asin'] = task_dct['asin']
        info['platform'] = task_dct['platform']
        if task_dct.get('extra'):
            info['extra'] = task_dct['extra']
        new_tp = tp.new_task(info)
        new_tp.set_to('output')
        return new_tp.to_task()
    except Exception as exc:
        exc_info = (type(exc), exc, exc.__traceback__)
        task_info = ' '.join([task_dct['platform'], task_dct['asin']])
        logger.error('Get page info error\n' + task_info, exc_info=exc_info)
        exc.__traceback__ = None
        return