async def taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider,
                 from_store, link_id_spider, list_page_spider):
    page_num = 1
    while 1:
        try:
            completed = await list_page_spider.get_page(page_num)
            if completed == 1:
                # Current list page done, move on to the next one.
                page_num += 1
            elif completed == 2:
                # Whole pass finished: reset the "in progress" flags so the
                # unfinished rows are picked up again, then start over.
                MySql.cls_update(t="tb_order_spider",
                                 set={"isDetaildown": 0},
                                 c={"isDetaildown": 2, "fromStore": from_store})
                MySql.cls_update(t="tb_order_spider",
                                 set={"isVerify": 0},
                                 c={"isVerify": 2, "fromStore": from_store})
                page_num = 1
            elif completed == 'exit':
                break
            await my_async_sleep(20, random_sleep=True)
            await link_id_spider.save_link_id()
            await manager_page_spider.do_it()
            await detail_page_spider.get_page()
            exit_loop = await delay_order_spider.get_page()
            if exit_loop == 'exit':
                break
        except Exception as e:
            logger.error(str(e))
            break
    await browser.close()
async def intercept_request(req):
    if re.search(r'https://item\.taobao\.com/item\.htm', req.url):
        # Product detail page: let the request through.
        await req.continue_()
    elif re.search(r'item\.taobao\.com.*?noitem\.htm', req.url):
        # Redirected to the "item no longer exists" page: mark the link as
        # off-shelf (XiaJia) and block the request.
        link_id = re.findall(r"itemid=(\d+)", req.url)[0]
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                         t="tb_master",
                         set={"flag": "XiaJia", "isUsed": 1},
                         c={"link_id": link_id})
        await req.abort()
    else:
        # Everything else (assets, trackers, unrelated pages) is dropped.
        await req.abort()
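# A minimal wiring sketch, not taken from the original source, assuming this
# handler runs under pyppeteer (the continue_()/abort() coroutines match its
# Request API): intercept_request only has an effect after request
# interception is enabled on the page and the coroutine is registered as the
# 'request' handler. The setup_interception name is ours, for illustration.
async def setup_interception(page):
    import asyncio
    # Without this call, request.continue_()/abort() are ignored by pyppeteer.
    await page.setRequestInterception(True)
    # page.on expects a plain callable, so schedule the coroutine as a task.
    page.on('request', lambda req: asyncio.ensure_future(intercept_request(req)))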
async def run(cls, login, browser, page, from_store):
    page_num = 1
    list_spider = OrderListPageSpider(login, browser, page, from_store)
    while 1:
        completed = await list_spider.get_page(page_num)
        if completed == 1:
            page_num += 1
        elif completed == 2:
            MySql.cls_update(t="tb_order_spider",
                             set={"isDetaildown": 0},
                             c={"isDetaildown": 2, "fromStore": from_store})
            MySql.cls_update(t="tb_order_spider",
                             set={"isVerify": 0},
                             c={"isVerify": 2, "fromStore": from_store})
            page_num = 1
        await my_async_sleep(15, random_sleep=True)
async def parse(self, html):
    ms = MySql()
    self._item['SpiderDate'] = time_now()
    sku_map = re.search(r'skuMap.*?(\{.*)', html)
    # "此宝贝已下架" means the item has been taken off the shelf.
    match_xia_jia = re.search("此宝贝已下架", html)
    if match_xia_jia:
        self._item['flag'] = "XiaJia"
    if not sku_map:
        # Single-SKU item: store one price row keyed by link_id.
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                         t="tb_master",
                         set={"isUsed": 1, "isMut": 0},
                         c={"link_id": self._item['link_id']})
        res = ms.get_dict(t="prices_tb", c={"link_id": self._item['link_id']})
        if res:
            ms.update(t="prices_tb", set=self._item,
                      c={"link_id": self._item['link_id']})
        else:
            self._item['stockid'] = "no_match"
            self._item['SpiderDate'] = time_ago(minutes=60)
            self._item['need_to_update'] = 1
            ms.insert(t="prices_tb", d=self._item)
        logger.info(self._item)
    else:
        # Multi-SKU item: expand the skuMap blob into one row per SKU.
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                         t="tb_master",
                         set={"isUsed": 1, "isMut": 1},
                         c={"link_id": self._item['link_id']})
        doc = PyQuery(html)
        # Materialize the generator so it can be logged and truth-tested.
        items = list(doc("li[data-value]").items())
        logger.debug(items)
        attr_map = {}
        if items:
            for item in items:
                # Map option id -> label, normalizing full-width parentheses.
                attr_map[item.attr('data-value')] = item.find(
                    'span').text().replace("（", "(").replace("）", ")")
        sku_dict = json.loads(sku_map.group(1))
        count = 1
        for k, v in sku_dict.items():
            sku_result = self._item.copy()
            if self._item['promotionprice'] > 0:
                # Apply the item-level discount to each SKU price.
                discount = round(
                    float(self._item['price_tb']) - float(self._item['promotionprice']), 4)
                sku_result['promotionprice'] = round(
                    float(v.get('price')) - float(discount), 4)
            else:
                sku_result['promotionprice'] = 0
            sku_result['skuId'] = v.get('skuId')
            sku_result['price_tb'] = v.get('price')
            sku_result['attribute'] = "-".join(
                [attr_map.get(r) for r in re.sub(r'^;|;$', "", k).split(";")])
            res = ms.get_dict(t="prices_tb", c={"skuId": sku_result['skuId']})
            if res:
                ms.update(t="prices_tb", set=sku_result,
                          c={"skuId": sku_result['skuId']})
            else:
                sku_result['stockid'] = "no_match" + str(count)
                sku_result['SpiderDate'] = time_ago(minutes=60)
                sku_result['need_to_update'] = 1
                ms.insert(t="prices_tb", d=sku_result)
            count += 1
            logger.info(sku_result)
    del ms
    await self._goto_the_next()
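# Illustrative sketch with synthetic data (not captured from a real Taobao
# page): it shows the shape of the skuMap blob that parse() extracts and how
# a key such as ";20509:1234;" is resolved to a readable attribute through
# attr_map. The _sku_map_example name and the sample values are assumptions.
def _sku_map_example():
    html_fragment = 'skuMap : {";20509:1234;": {"price": "99.00", "skuId": "111"}}'
    sku_map = re.search(r'skuMap.*?(\{.*)', html_fragment)
    sku_dict = json.loads(sku_map.group(1))
    attr_map = {"20509:1234": "红色(Red)"}
    for key, value in sku_dict.items():
        # ";20509:1234;" -> ["20509:1234"] -> "红色(Red)"
        attribute = "-".join(attr_map[r] for r in re.sub(r'^;|;$', "", key).split(";"))
        print(attribute, value["price"], value["skuId"])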
def _get_html(self):
    for shop_id in self._get_shop_id():
        page_num, used_page_nums, total_page, sp_time = self._get_page_num(shop_id)
        session = requests.Session()
        while page_num:
            time.sleep(2)
            curl = self._get_curls(shop_id)
            if not curl:
                time.sleep(30)
                continue
            start_time = time.time()
            delete(flag='tspi')
            url, params, cookies, headers = self.format_request_params(
                curl['curl'], page_num)
            # Retry the request until it succeeds, rotating the proxy on failure.
            while 1:
                try:
                    proxy = read("proxy")
                    logger.info(proxy)
                    if not proxy:
                        # No proxy cached yet: fetch one and retry instead of
                        # building an invalid "https://None" proxy URL.
                        self._set_proxy()
                        continue
                    proxies = {"https": "https://{}".format(proxy)}
                    r = session.get(url=url, params=params, cookies=cookies,
                                    headers=headers, proxies=proxies,
                                    stream=True, timeout=30)
                except Exception as e:
                    logger.error(str(e))
                    self._set_proxy()
                    session = requests.Session()
                    continue
                else:
                    break
            try:
                html = r.text.replace("\\", "")
            except requests.exceptions.ChunkedEncodingError:
                continue
            except requests.exceptions.ConnectionError:
                continue
            # Strip the JSONP wrapper, e.g. jsonp123("...").
            html = re.sub(r'jsonp\d+\("|"\)', "", html)
            yield html, shop_id, used_page_nums, total_page, page_num
            spent_time = int(time.time() - start_time) + sp_time
            tspi = read(flag="tspi")
            if tspi:
                tspi['spent_time'] = spent_time
                MySql.cls_update(db_setting=test_db, t="tb_search_page_info",
                                 set=tspi, c={"shop_id": shop_id})
            page_num, used_page_nums, total_page, sp_time = self._get_page_num(shop_id)
        # Anything in this shop that was not touched today is treated as off-shelf.
        sql = ("UPDATE tb_master SET flag='XiaJia',update_date='{}' "
               "WHERE shop_id='{}' AND update_date<'{}'").format(
                   datetime.date.today(), shop_id, datetime.date.today())
        MySql.cls_update(db_setting=test_db, sql=sql)
    if SEARCH_PAGE_REPORT:
        reports = Reports()
        reports.report([ids for ids in self._get_shop_id()])
    ms = MySql(db_setting=test_db)
    t = TBMasterItem()
    t.save_to_record(ms)
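# Quick illustration with a synthetic response body (the payload below is an
# assumption, not real API output): the search endpoint answers with a JSONP
# wrapper such as jsonp123("..."), and _get_html removes the escaping
# backslashes and the wrapper before yielding the payload for parsing.
def _unwrap_jsonp_example():
    raw = 'jsonp123("{\\"total_page\\":5}")'
    text = raw.replace("\\", "")
    text = re.sub(r'jsonp\d+\("|"\)', "", text)
    print(text)  # -> {"total_page":5}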