async def taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider,
                 from_store, link_id_spider, list_page_spider):
    page_num = 1
    while 1:
        try:
            completed = await list_page_spider.get_page(page_num)
            if completed == 1:
                page_num += 1
            elif completed == 2:
                MySql.cls_update(t="tb_order_spider",
                                 set={"isDetaildown": 0},
                                 c={"isDetaildown": 2, "fromStore": from_store})
                MySql.cls_update(t="tb_order_spider",
                                 set={"isVerify": 0},
                                 c={"isVerify": 2, "fromStore": from_store})
                page_num = 1
            elif completed == 'exit':
                break
            await my_async_sleep(20, random_sleep=True)
            await link_id_spider.save_link_id()
            await manager_page_spider.do_it()
            await detail_page_spider.get_page()
            exit_loop = await delay_order_spider.get_page()
            if exit_loop == 'exit':
                break
        except Exception as e:
            logger.error(str(e))
            break
    await browser.close()

def get(self, shop_ids):
    mail_container = self.init_receivers()
    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    for shop_id in shop_ids:
        shop_name = MySql.cls_get_one(
            sql="SELECT shopname FROM shop_info WHERE shop_id={}".format(shop_id))
        flag_report_groups = []
        # sql = "SELECT COUNT(id) AS nums FROM tb_master where shop_id='{}' and update_date<'{}'".format(shop_id,
        #                                                                                                datetime.date.today())
        # nums = mysql.get_data(db=test_server, sql=sql, return_one=True)
        # if int(nums) > 0:
        #     flag_report_groups.append("下架商品{}条".format(nums))
        # del sql
        # del nums
        for flag in self.translate_dictionary.keys():
            sql = "SELECT COUNT(id) AS nums FROM tb_master WHERE flag LIKE '%%{}%%' AND shop_id='{}'".format(
                flag, shop_id)
            nums = ms.get_one(db=TEST_SERVER_DB_TEST, sql=sql)
            if int(nums) > 0:
                flag_report_groups.append("{}{}条".format(self.translate_dictionary[flag], nums))
        if flag_report_groups:
            result = ms.get_dict(t="tb_search_page_info", c={"shop_id": shop_id})
            flag_report_groups.append("总计爬取{}页".format(result[0]['total_page']))
            flag_report_groups.append("总计花费{}分{}秒".format(
                int(result[0]['spent_time'] / 60), int(result[0]['spent_time'] % 60)))
            flag_report_groups.reverse()
            flag_report_groups.append(shop_name)
            flag_report_groups.reverse()
            if shop_id in ["115443253", "33817767"]:
                mail_container["KY"]['mail_content'] += "|".join(flag_report_groups) + "\n"
                mail_container["KY"]['mail_content'] += self.insert_link(shop_id, ms)
            elif shop_id in ["34933991", "131282813"]:
                mail_container["TB"]['mail_content'] += "|".join(flag_report_groups) + "\n"
                mail_container["TB"]['mail_content'] += self.insert_link(shop_id, ms)
            elif shop_id in ["68559944", "60299985"]:
                mail_container["YJ"]['mail_content'] += "|".join(flag_report_groups) + "\n"
                mail_container["YJ"]['mail_content'] += self.insert_link(shop_id, ms)
            else:
                mail_container["YK"]['mail_content'] += "|".join(flag_report_groups) + "\n"
                mail_container["YK"]['mail_content'] += self.insert_link(shop_id, ms)
    return mail_container

async def input_verify_code(self, frame, fromStore, type):
    logger.info("需要手机验证码")
    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    ms.delete(t='phone_verify', c={'fromStore': fromStore})
    ms.insert(t="phone_verify", d={"fromStore": fromStore})
    mail(fromStore + "手机验证码", fromStore + "登陆需要手机验证码", MAIL_RECEIVERS)
    verify_code = "0"
    while 1:
        if type == 0:
            await frame.click(PHONE_GET_CODE[0])
        else:
            await frame.click(PHONE_GET_CODE[1])
        for i in range(120):
            await asyncio.sleep(5)
            verify_code = ms.get_one(t='phone_verify', cn=['verify_code'],
                                     c={"fromStore": fromStore})
            if verify_code != "0":
                ms.delete(t='phone_verify', c={'fromStore': fromStore})
                del ms
                break
        if verify_code != "0":
            break
        await asyncio.sleep(10)
    if type == 0:
        await frame.type(PHONE_CHECK_INPUT[0], verify_code,
                         {'delay': self.input_time_random() - 50})
        await frame.click(PHONE_SUBMIT_BTN[0])
    else:
        await frame.type(PHONE_CHECK_INPUT[1], verify_code,
                         {'delay': self.input_time_random() - 50})
        await frame.click(PHONE_SUBMIT_BTN[1])

async def run():
    while 1:
        update()
        ms = MySql(db_setting=TEST_SERVER_DB_TEST)
        ms.update(t="spider_monitor", set={"latest_time": time_now()},
                  c={"spider_address": SPIDER_ADDRESS})
        restart_signal = ms.get_one(t="spider_monitor", cn=["restart_signal"],
                                    c={"spider_address": SPIDER_ADDRESS})
        if SPIDER_ADDRESS == "3_floor":
            sql = "SELECT MAX(updateTime) as updateTime,fromStore FROM tb_order_spider WHERE fromStore IN ('KY','TB') GROUP BY fromStore"
        else:
            sql = "SELECT MAX(updateTime) as updateTime,fromStore FROM tb_order_spider WHERE fromStore IN ('YJ','YK') GROUP BY fromStore"
        results = MySql.cls_get_dict(sql=sql)
        t = time_ago(minutes=15)
        for result in results:
            if str(result['updateTime']) < t:
                restart_signal = 1
                break
        if restart_signal:
            ms.update(t="spider_monitor", set={"restart_signal": 0},
                      c={"spider_address": SPIDER_ADDRESS})
            restart()
        del ms
        await asyncio.sleep(60)

def update():
    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    update_signals = ms.get_dict(t="spider_monitor",
                                 cn=["spider_address", "update_signal"])
    for update_signal in update_signals:
        if update_signal['update_signal']:
            if SPIDER_ADDRESS == update_signal['spider_address']:
                result = check_output(['git', 'pull'])
                ms.update(t='spider_monitor',
                          set={
                              "update_signal": 0,
                              "update_result": result.decode('utf-8').strip()
                          },
                          c={"spider_address": SPIDER_ADDRESS})

async def intercept_request(req):
    if re.search(r'https://item.taobao.com/item.htm', req.url):
        await req.continue_()
    elif re.search('item.taobao.com.*?noitem.htm.*?', req.url):
        link_id = re.findall("itemid=(\d+)", req.url)[0]
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                         t="tb_master",
                         set={"flag": "XiaJia", "isUsed": 1},
                         c={"link_id": link_id})
        await req.abort()
    else:
        await req.abort()

def _get_item():
    column_name = [
        "shop_id",
        "link_id",
        "description",
        "price_tb",
        "promotionprice",
        "sales",
        "rates",
    ]
    while 1:
        results = MySql.cls_get_dict(db_setting=TEST_SERVER_DB_TEST,
                                     t="tb_master",
                                     c={"isUsed": 0, "isMut": 1, "flag!": "XiaJia"},
                                     cn=column_name,
                                     l=["0", "1"])
        if results:
            results[0]['price_tb'] = float(results[0]['price_tb'])
            results[0]['promotionprice'] = float(results[0]['promotionprice'])
            results[0]['typeabbrev'] = store_trans(results[0]['shop_id'], 'id_2_code')
            return results[0]
        else:
            logger.info('没有数据需要爬取!')
            my_sleep()

async def parse(self, main_orders, page_num):
    # print(main_orders)
    ms = MySql()
    t = time_zone(["08:00", "23:00", "23:59"])
    a = datetime.datetime.now()
    if a < t[0]:
        eoc = EARLIEST_ORDER_CREATE_TIME
    elif t[0] < a < t[1]:
        eoc = 2
    else:
        eoc = 60
    for i in range(len(main_orders)):
        continue_code = 0
        # Some items in an order may already have been refunded before payment,
        # in which case their data is simply deleted.
        # Parse the order and save it to the database.
        sub_orders, tb_order_item = await self.parse_order_item(i, main_orders, ms)
        if not sub_orders:
            return
        # Parse the order's detail items and save them to the database.
        await self.parse_order_detail_item(continue_code, i, main_orders,
                                           sub_orders, tb_order_item, ms)
        date = datetime.date.today()
        date_limit = (date - datetime.timedelta(eoc)).strftime("%Y-%m-%d %H:%M:%S")
        if tb_order_item.createTime < date_limit:
            logger.info("完成本轮爬取,共翻 " + str(page_num) + " 页。")
            self.completed = 2
            del ms
            return
    self.completed = 1
    del ms

def verify():
    # Re-compute each order's total from its detail rows and mail a report
    # for any order whose amounts do not reconcile.
    l_orderNo = []
    column_name = [
        'orderNo', 'deliverFee', 'actualFee', 'couponPrice', 'fromStore', 'orderStatus'
    ]
    condition = {'isVerify': '0', 'isDetaildown': '1'}
    # kwargs = {'isVerify': '2', 'isDetaildown': '1'}
    ms = MySql()
    result = ms.get(t="tb_order_spider", cn=column_name, c=condition)
    if result:
        for i in result:
            total = 0
            orderNo = i[0]
            deliverFee = i[1]
            actualFee = i[2]
            couponPrice = i[3]
            fromStore = i[4]
            column_name = ['unitPrice', 'sellNum', 'unitBenefits']
            condition = {'orderNo': orderNo}
            result2 = ms.get(t="tb_order_detail_spider", cn=column_name, c=condition)
            for j in result2:
                unitPrice = j[0]
                sellNum = j[1]
                unitBenefits = j[2]
                total = total + unitPrice * sellNum - unitBenefits
            a = round(total, 3) + deliverFee - actualFee - couponPrice
            if abs(a) > 0.0001 and i[5] != '交易关闭':
                list_tmp = [
                    str(round(total, 2)),
                    str(deliverFee),
                    str(actualFee),
                    str(couponPrice),
                    str(a),
                    store_trans(fromStore),
                    orderNo,
                ]
                l_orderNo.append("|".join(list_tmp))
                ms.update(t="tb_order_spider",
                          set={'isVerify': 2, 'isDetaildown': 0},
                          c={'orderNo': orderNo})
            else:
                ms.update(t="tb_order_spider", set={'isVerify': 1}, c={'orderNo': orderNo})
                # print('没有异常数据,验证完成!')
    if l_orderNo:
        s = "\n".join(l_orderNo)
        # print(s)
        mail("数据异常报告", s, ["*****@*****.**"])

def _get_curls(shop_id):
    curls = []
    results = MySql.cls_get_dict(db_setting=test_db, t="tb_search_curl",
                                 c={'shop_id': shop_id})
    for res in results:
        curls.append(res)
    if curls:
        return random.choice(curls)
    else:
        return 0

async def run(cls, login, browser, page, from_store):
    page_num = 1
    list_spider = OrderListPageSpider(login, browser, page, from_store)
    while 1:
        completed = await list_spider.get_page(page_num)
        if completed == 1:
            page_num += 1
        elif completed == 2:
            MySql.cls_update(t="tb_order_spider",
                             set={"isDetaildown": 0},
                             c={"isDetaildown": 2, "fromStore": from_store})
            MySql.cls_update(t="tb_order_spider",
                             set={"isVerify": 0},
                             c={"isVerify": 2, "fromStore": from_store})
            page_num = 1
        await my_async_sleep(15, random_sleep=True)

def _get_page_num(shop_id):
    # Fetch this shop's page-crawl record from the database
    ms = MySql(db_setting=test_db)
    result = ms.get_dict(t="tb_search_page_info", c={"shop_id": shop_id})
    if not result:
        # No record yet: insert a default one
        d = {
            "shop_id": shop_id,
            "total_page": 20,
            "used_page_nums": "0",
            "last_date": datetime.date.today(),
            "spent_time": 0
        }
        # Re-read the record after inserting it
        ms.insert(t="tb_search_page_info", d=d)
        result = ms.get_dict(t="tb_search_page_info", c={"shop_id": shop_id})
    if result[0]['last_date'] < datetime.date.today():
        ms.update(t="tb_search_page_info",
                  set={"used_page_nums": "0", "spent_time": 0},
                  c={"shop_id": shop_id})
        result = ms.get_dict(t="tb_search_page_info", c={"shop_id": shop_id})
    # List of page numbers that have already been crawled
    used_page_nums = [int(x) for x in result[0]['used_page_nums'].split(",")]
    total_page = result[0]['total_page']
    set_a = set([i for i in range(total_page + 1)])  # set of all page numbers
    set_b = set(used_page_nums)  # set of pages already crawled
    list_result = list(set_a - set_b)  # pages not yet crawled
    if list_result:
        # Return a random uncrawled page number, the crawled pages,
        # the total page count and the time already spent
        return random.choice(list_result), used_page_nums, total_page, result[0]['spent_time']
    else:
        # No uncrawled pages left: every page of this shop has been collected
        return 0, 0, 0, 0

def _get_order_info(self):
    today = datetime.datetime.now()
    # Note: despite the variable names, the window used here is 60 minutes.
    one_day = datetime.timedelta(minutes=60)
    earlier_15_minutes = today - one_day
    updateTime = earlier_15_minutes.strftime("%Y-%m-%d %H:%M:%S")
    payTime = yesterday("18:00:00")
    sql = """
        SELECT tos.orderNo,createTime FROM tb_order_spider tos
        WHERE tos.updateTime<'{}'
        AND tos.`orderStatus` = '买家已付款'
        AND tos.`fromStore` = '{}'
        AND tos.payTime<'{}'
        ORDER BY updateTime;
        """.format(updateTime, self.fromStore, payTime)
    res = MySql.cls_get_dict(sql=sql)
    order_no = None
    days = 0
    if res:
        order_no = res[0]['orderNo']
        days = (today - res[0]['createTime']).days
    return days, order_no

async def save_link_id(self):
    ms = MySql()
    link_id_new_list = []
    self.completed = 0
    sql = """SELECT url,a.orderNo FROM tb_order_detail_spider a
             JOIN tb_order_spider b ON a.`orderNo`=b.`orderNo`
             WHERE link_id="1" AND b.`fromStore`='{}' AND a.url IS NOT NULL
             GROUP BY a.orderNo ORDER BY b.createTime DESC""".format(self.fromStore)
    results = ms.get_dict(sql=sql)
    if results:
        for result in results:
            logger.info("link_id_spider-" + result['orderNo'])
            data = await self._get_json(result['orderNo'])
            if not data:
                return 0
            sub_orders = data["data"]["subOrderViewDTOs"]
            for so in sub_orders:
                price_tb_item = PriceTBItem()
                price_tb_item.link_id = so["itemId"]
                order_no = so["orderNoStr"]
                sql = "select * from tb_order_detail_spider where url like '%%{}%%'".format(order_no)
                res = ms.get_dict(sql=sql)[0]
                price_tb_item.stockid = res['goodsCode']
                price_tb_item.description = res['tbName']
                price_tb_item.price_tb = res['unitPrice']
                price_tb_item.shop_id = store_trans(string=self.fromStore, action="code_2_id")
                price_tb_item.attribute = res['goodsAttribute']
                price_tb_item.typeabbrev = self.fromStore
                sql = "update tb_order_detail_spider set link_id='{}' where url like '%{}%'".format(
                    price_tb_item.link_id, order_no)
                ms.update(sql=sql)
                price_tb_item.save(ms)
            await my_async_sleep(3, True)

async def do_it(self):
    # Look up the item in the seller's item manager and open the quick-edit panel;
    # items that can no longer be found are flagged "XiaJia" (delisted).
    shop_id = store_trans(self.fromStore, 'code_2_id')
    ms = MySql()
    sql = "select link_id from prices_tb where need_to_update=1 and shop_id='{}' limit 1".format(shop_id)
    link_id = ms.get_one(sql=sql)
    if not link_id:
        return 0
    await self.page.bringToFront()
    try:
        if not re.search("https://item.manager.taobao.com/taobao/manager/render.htm", self.page.url):
            await self.page.goto("https://item.manager.taobao.com/taobao/manager/render.htm?tab=on_sale")
    except Exception as e:
        logger.error(str(e) + "manager_page_error")
        return
    while 1:
        await self.page.waitForSelector("input[name='queryItemId']", timeout=0)
        await self.page.keyboard.press('Escape')
        await self.page.focus("input[name='queryItemId']")
        for _ in range(20):
            await self.page.keyboard.press("Delete")
            await self.page.keyboard.press("Backspace")
        await self.page.type("input[name='queryItemId']", str(link_id),
                             {'delay': self.login.input_time_random()})
        await self.page.click(".filter-footer button:first-child")
        await self.page.waitForResponse("https://item.manager.taobao.com/taobao/manager/table.htm")
        await asyncio.sleep(1)
        await self.listening(self.page)
        try:
            await self.page.waitForSelector(FAST_EDIT_BTN, timeout=10000)
            await self.page.click(FAST_EDIT_BTN)
            restart = await self.login.slider(self.page)
            if restart:
                exit("滑块验证码失败,退出")
        except errors.TimeoutError as e:
            logger.info("商品已下架,没有查询到对应的商品ID:" + link_id)
            ms.update(t="prices_tb",
                      set={"SpiderDate": time_now(), "need_to_update": 0, "flag": "XiaJia"},
                      c={"link_id": link_id})
            link_id = ms.get_one(sql=sql)
            if not link_id:
                return 0
            continue
        else:
            await self.page.focus("input[name='queryItemId']")
            for _ in range(20):
                await self.page.keyboard.press("Delete")
                await self.page.keyboard.press("Backspace")
            break
    while 1:
        if self.completed == 4:
            break
        await asyncio.sleep(1)
    await asyncio.sleep(15)

async def parse_item_page(self, content=None, detail=None, rate=None):
    if content:
        sku_map = re.search('skuMap.*?(\{.*)', content)
        shop_id = store_trans(self.fromStore, 'code_2_id')
        doc = PyQuery(content)
        items = doc("li[data-value]").items()
        logger.debug(items)
        attr_map = {}
        if items:
            for item in items:
                attr_map[item.attr('data-value')] = item.find(
                    'span').text().replace("(", "(").replace(")", ")")
        if sku_map:
            sku_dict = json.loads(sku_map.group(1))
            for k, v in sku_dict.items():
                for price_tb_item in self.price_tb_items:
                    if price_tb_item.skuId == v.get('skuId'):
                        price_tb_item.price_tb = v.get('price')
                        price_tb_item.shop_id = shop_id
                        price_tb_item.attribute_map = k
                        price_tb_item.attribute = "-".join([
                            attr_map.get(r) for r in re.sub('^;|;$', "", k).split(";")
                        ])
        else:
            self.price_tb_items[0].shop_id = shop_id
            self.price_tb_items[0].price_tb = doc('input[name="current_price"]').val()
        self.completed = 2
    if detail:
        while 1:
            if self.completed == 2:
                break
            await asyncio.sleep(1)
        logger.debug(detail)
        detail = re.sub(r'span class=\"wl-yen\"', r'span class=\\"wl-yen\\"', detail)
        data = re.search('uccess\((.*?)\);', detail)
        if data:
            x = json.loads(data.group(1))
        else:
            await self.login.slider(self.item_page)
            return
        promo_data = jsonpath(x, '$..promoData')
        for price_tb_item in self.price_tb_items:
            price_tb_item.sales = jsonpath(x, '$..soldTotalCount')[0]
            price_tb_item.typeabbrev = self.fromStore
            if promo_data and promo_data[0]:
                if price_tb_item.attribute_map:
                    for k, v in promo_data[0].items():
                        if k == price_tb_item.attribute_map:
                            price_tb_item.promotionprice = jsonpath(v, '$..price')[0]
                else:
                    price_tb_item.promotionprice = jsonpath(x, '$..promoData..price')[0]
        self.completed = 3
    if rate:
        while 1:
            if self.completed == 3:
                break
            await asyncio.sleep(1)
        logger.debug(rate)
        ms = MySql()
        for price_tb_item in self.price_tb_items:
            count = re.search('count.*?(\d+)', rate)
            if count:
                price_tb_item.rates = count.group(1)
            price_tb_item.need_to_update = 0
            price_tb_item.save(ms)
            # print(price_tb_item)
        self.price_tb_items[0].delete(ms)
        del ms
        self.completed = 4

async def parse_order_detail_item(continue_code, i, main_orders, sub_orders, tb_order_item, ms):
    for j in range(len(sub_orders)):
        tb_order_detail_item = TBOrderDetailItem()
        tb_order_detail_item.orderNo = main_orders[i]["id"]
        tb_order_detail_item.itemNo = j
        try:
            tb_order_detail_item.goodsCode = sub_orders[j]['itemInfo']['extra'][0]['value']
        except KeyError:
            tb_order_detail_item.goodsCode = 'error'
        tb_order_detail_item.tbName = format_tb_name(sub_orders[j]['itemInfo']['title'])
        tb_order_detail_item.unitPrice = sub_orders[j]['priceInfo']['realTotal']
        tb_order_detail_item.sellNum = sub_orders[j]['quantity']
        tb_order_detail_item.orderStatus = tb_order_item.orderStatus
        tb_order_detail_item.url = "https:" + sub_orders[j]['itemInfo']['itemUrl']
        try:
            attribute_list = sub_orders[j]['itemInfo']['skuText']
        except KeyError:
            pass
        else:
            tb_order_detail_item.goodsAttribute = format_attribute(attribute_list)
        try:
            operations = sub_orders[j]['operations']
        except KeyError:
            pass
        else:
            for x in range(len(operations)):
                t = operations[x]['style']
                if t in ['t12', 't16'] and operations[x]['text'] != "退运保险":
                    tb_order_detail_item.refundStatus = operations[x]['text']
                    tb_order_detail_item.isRefund = "1"
                elif t == 't0' and operations[x]['text'] == '已取消':
                    continue_code = 1
                    delete_item = {
                        'orderNo': tb_order_detail_item.orderNo,
                        'itemNo': tb_order_detail_item.itemNo,
                        'goodsCode': tb_order_detail_item.goodsCode
                    }
                    ms = MySql()
                    is_exist = ms.get(t="tb_order_detail_spider", l=1, c=delete_item)
                    if is_exist:
                        ms.delete(t="tb_order_detail_spider", c=delete_item)
                        sql = "UPDATE tb_order_detail_spider SET itemNo=itemNo-1 " \
                              "WHERE orderNo='{}' " \
                              "AND itemNo>'{}'".format(tb_order_detail_item.orderNo,
                                                       tb_order_detail_item.itemNo)
                        ms.update(sql=sql)
        if continue_code:
            continue
        tb_order_detail_item.save(ms)

def _get_html(self):
    # Generator: yields (html, shop_id, used_page_nums, total_page, page_num)
    # for each shop search page that has not been crawled yet.
    for shop_id in self._get_shop_id():
        page_num, used_page_nums, total_page, sp_time = self._get_page_num(shop_id)
        session = requests.Session()
        while page_num:
            time.sleep(2)
            curl = self._get_curls(shop_id)
            if not curl:
                time.sleep(30)
                continue
            start_time = time.time()
            delete(flag='tspi')
            url, params, cookies, headers = self.format_request_params(curl['curl'], page_num)
            while 1:
                try:
                    proxy = read("proxy")
                    logger.info(proxy)
                    if not proxy:
                        self._set_proxy()
                    proxies = {"https": "https://{}".format(proxy)}
                    r = session.get(url=url, params=params, cookies=cookies, headers=headers,
                                    proxies=proxies, stream=True, timeout=30)
                except Exception as e:
                    logger.error(str(e))
                    self._set_proxy()
                    session = requests.Session()
                    continue
                else:
                    break
            try:
                html = r.text.replace("\\", "")
            except requests.exceptions.ChunkedEncodingError:
                continue
            except requests.exceptions.ConnectionError:
                continue
            html = re.sub("jsonp\d+\(\"|\"\)", "", html)
            yield html, shop_id, used_page_nums, total_page, page_num
            spent_time = int(time.time() - start_time) + sp_time
            tspi = read(flag="tspi")
            if tspi:
                tspi['spent_time'] = spent_time
                MySql.cls_update(db_setting=test_db, t="tb_search_page_info",
                                 set=tspi, c={"shop_id": shop_id})
            page_num, used_page_nums, total_page, sp_time = self._get_page_num(shop_id)
        sql = "UPDATE tb_master SET flag='XiaJia',update_date='{}' WHERE shop_id='{}' AND update_date<'{}'".format(
            datetime.date.today(), shop_id, datetime.date.today())
        MySql.cls_update(db_setting=test_db, sql=sql)
    if SEARCH_PAGE_REPORT:
        reports = Reports()
        reports.report([ids for ids in self._get_shop_id()])
    ms = MySql(db_setting=test_db)
    t = TBMasterItem()
    t.save_to_record(ms)

def parse(self):
    for html, shop_id, used_page_nums, total_page, page_num in self._get_html():
        doc = PyQuery(html)
        match = re.search("item\dline1", html)
        if not match:
            MySql.cls_delete(db_setting=test_db, t='tb_search_curl', c={"shop_id": shop_id})
            mail("店铺搜索页爬虫出错", shop_id + "错误页码:" + str(page_num) + "\n" + html, MAIL_RECEIVERS)
            continue
        used_page_nums.append(page_num)
        used_page_nums.sort()
        tspi = {  # tb_search_page_info
            "used_page_nums": ",".join([str(x) for x in used_page_nums]),
            "last_date": datetime.date.today()
        }
        write(flag="tspi", value=tspi)
        num = doc(".pagination span.page-info").text()
        try:
            total_page_num = re.search("\d+\/(\d+)", num).group(1)
        except Exception as e:
            logger.error(str(e))
        else:
            if int(total_page_num) != int(total_page):
                tspi['total_page'] = total_page_num
                write(flag="tspi", value=tspi)
        items = doc("." + match.group() + " dl.item").items()
        ms = MySql(db_setting=test_db)
        ms_prod = MySql()
        for i in items:
            tb_master_item = TBMasterItem()
            tb_master_item.shop_id = shop_id
            tb_master_item.link_id = i.attr('data-id')
            tb_master_item.description = i.find("dd.detail a").text()
            cprice = float(i.find("div.cprice-area span.c-price").text())
            if i.find("div.sprice-area span.s-price").text():
                sprice = float(i.find("div.sprice-area span.s-price").text())
            else:
                sprice = 0
            if i.find("div.sale-area span.sale-num").text():
                tb_master_item.sales = int(i.find("div.sale-area span.sale-num").text())
            if i.find("dd.rates a span").text():
                tb_master_item.rates = int(i.find("dd.rates a span").text())
            if sprice:
                tb_master_item.price_tb = sprice
                tb_master_item.promotionprice = cprice
            else:
                tb_master_item.price_tb = cprice
                tb_master_item.promotionprice = sprice
            print(tb_master_item)
            tb_master_item.save(ms, ms_prod)
        del ms, ms_prod

async def parse(self, html):
    ms = MySql()
    self._item['SpiderDate'] = time_now()
    sku_map = re.search('skuMap.*?(\{.*)', html)
    match_xia_jia = re.search("此宝贝已下架", html)
    if match_xia_jia:
        self._item['flag'] = "XiaJia"
    if not sku_map:
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST, t="tb_master",
                         set={"isUsed": 1, "isMut": 0},
                         c={"link_id": self._item['link_id']})
        res = ms.get_dict(t="prices_tb", c={"link_id": self._item['link_id']})
        if res:
            ms.update(t="prices_tb", set=self._item, c={"link_id": self._item['link_id']})
        else:
            self._item['stockid'] = "no_match"
            self._item['SpiderDate'] = time_ago(minutes=60)
            self._item['need_to_update'] = 1
            ms.insert(t="prices_tb", d=self._item)
        logger.info(self._item)
    else:
        MySql.cls_update(db_setting=TEST_SERVER_DB_TEST, t="tb_master",
                         set={"isUsed": 1, "isMut": 1},
                         c={"link_id": self._item['link_id']})
        doc = PyQuery(html)
        items = doc("li[data-value]").items()
        logger.debug(items)
        attr_map = {}
        if items:
            for item in items:
                attr_map[item.attr('data-value')] = item.find(
                    'span').text().replace("(", "(").replace(")", ")")
        sku_dict = json.loads(sku_map.group(1))
        count = 1
        for k, v in sku_dict.items():
            sku_result = self._item.copy()
            if self._item['promotionprice'] > 0:
                discount = round(
                    float(self._item['price_tb']) - float(self._item['promotionprice']), 4)
                sku_result['promotionprice'] = round(float(v.get('price')) - float(discount), 4)
            else:
                sku_result['promotionprice'] = 0
            sku_result['skuId'] = v.get('skuId')
            sku_result['price_tb'] = v.get('price')
            sku_result['attribute'] = "-".join([
                attr_map.get(r) for r in re.sub('^;|;$', "", k).split(";")
            ])
            res = ms.get_dict(t="prices_tb", c={"skuId": sku_result['skuId']})
            if res:
                ms.update(t="prices_tb", set=sku_result, c={"skuId": sku_result['skuId']})
            else:
                sku_result['stockid'] = "no_match" + str(count)
                sku_result['SpiderDate'] = time_ago(minutes=60)
                sku_result['need_to_update'] = 1
                ms.insert(t="prices_tb", d=sku_result)
                count += 1
            logger.info(sku_result)
    del ms
    await self._goto_the_next()

async def get_page(self):
    await self.page.bringToFront()
    results = MySql.cls_get_dict(t="tb_order_spider",
                                 cn=["detailURL", "orderNo"],
                                 c={"isDetaildown": 0, "fromStore": self.fromStore},
                                 o=["createTime"], om="d")
    for result in results:
        ms = MySql()
        tb_order_item = TBOrderItem(**result)
        logger.info(store_trans(self.fromStore))
        logger.info("开始订单 " + result["orderNo"] + " 详情爬取")
        while 1:
            try:
                await self.page.goto(tb_order_item.detailURL)
            except errors.PageError:
                return 1
            except errors.TimeoutError:
                return 1
            else:
                break
        try:
            await self.page.waitForSelector('#detail-panel', timeout=30000)
        except errors.TimeoutError:
            await self.login.slider(self.page)
            is_logout = re.search(r"login.taobao.com", self.page.url)
            if is_logout:
                logger.info("登陆状态超时")
                return 1
            continue
        # The order detail page embeds its data as a JSON.parse('...') literal;
        # strip the escaping before loading it.
        content = await self.page.content()
        a = re.search(r"var data = JSON.parse\('(.*)'\);", content).group(1)
        # a = a.encode("").decode("unicode_escape")
        b = a.replace('\\\\\\"', '')
        data = b.replace('\\"', '"')
        m = json.loads(data)
        tb_order_item.actualFee = jsonpath(m, '$..actualFee.value')[0]
        tb_order_item.deliverFee = re.findall('\(快递:(\d+\.\d+)', str(m))[0]
        tb_order_item.orderStatus = status_format(jsonpath(m, '$..statusInfo.text')[0])
        if tb_order_item.orderStatus == '等待买家付款':
            tb_order_item.isDetaildown = 2
        else:
            tb_order_item.isDetaildown = 1
        tb_order_item.couponPrice = await self.get_coupon(m)
        if jsonpath(m, '$..buyMessage'):
            tb_order_item.buyerComments = jsonpath(m, '$..buyMessage')[0]
        orderNo = m['mainOrder']['id']
        order_info = m['mainOrder']['orderInfo']['lines'][1]['content']
        for i in range(len(order_info)):
            if order_info[i]['value']['name'] == '支付宝交易号:':
                try:
                    tb_order_item.tradeNo = order_info[i]['value']['value']
                except KeyError:
                    tb_order_item.tradeNo = None
            elif order_info[i]['value']['name'] == '创建时间:':
                tb_order_item.createTime = order_info[i]['value']['value']
            # elif order_info[i]['value']['name'] == '发货时间:':
            #     tb_order_item = order_info[i]['value']['value']
            elif order_info[i]['value']['name'] == '付款时间:':
                tb_order_item.payTime = order_info[i]['value']['value']
        if jsonpath(m, '$..logisticsName'):
            tb_order_item.shippingCompany = jsonpath(m, '$..logisticsName')[0]
            tb_order_item.shippingMethod = jsonpath(m, '$..shipType')[0]
            tb_order_item.shippingNo = jsonpath(m, '$..logisticsNum')[0]
        rec_info = jsonpath(m, '$..tabs..address')[0]
        tb_order_item.receiverName = rec_info.split(",")[0].replace(" ", "")
        tb_order_item.receiverPhone = rec_info.split(",")[1]
        tb_order_item.receiverAddress = "".join(rec_info.split(",")[2:])
        tb_order_item.save(ms)
        sub_orders = m['mainOrder']['subOrders']
        for i in range(len(sub_orders)):
            tb_order_detail_item = TBOrderDetailItem(orderNo=orderNo, itemNo=i)
            tb_order_detail_item.unitBenefits = 0
            if sub_orders[i]['promotionInfo']:
                for j in sub_orders[i]['promotionInfo']:
                    for x in j['content']:
                        for k, v in x.items():
                            if k == 'value':
                                f_prom = re.match("Exercise", v)
                                p_list = re.findall("-?\d+\.\d+", v)
                                if p_list and not f_prom:
                                    tb_order_detail_item.unitBenefits += float(p_list.pop())
            tb_order_detail_item.save(ms)
        del ms
        await my_async_sleep(seconds=15, random_sleep=True)
    verify()

        ms.insert(t=self._table_name(), d=data)

    def save_to_record(self, ms):
        start = 0
        limit = 1000
        while 1:
            column_name = self.__dict__.keys()
            res = ms.get(t=self._table_name(), cn=list(column_name),
                         l=[str(start * limit), str(limit)])
            if not res:
                break
            d = []
            for r in res:
                a = tuple(str(x) for x in r)
                d.append(str(a))
            sql = ("insert into tb_master_record " +
                   str(tuple(column_name)).replace("'", "") +
                   " values " + ",".join(d))
            ms.insert(sql=sql)
            start += 1


if __name__ == '__main__':
    from db.my_sql import MySql
    from settings import TEST_SERVER_DB_TEST

    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    t = TBMasterItem()
    t.save_to_record(ms)