async def get_flag_text(self, data_url):
    """Load ``data_url`` in the seller-flag page and return its "tip" text.

    The URL is opened in ``self._page_seller_flag``; a ``TimeoutError`` or
    ``PageError`` from navigation triggers ``sleep(5)`` and another attempt.
    Once the page content has been read, ``self.page`` is brought back to
    the front and the text of the ``<pre>`` element is searched for a
    ``"tip":"..."`` field.

    Returns:
        The captured tip string, or ``None`` (after logging the ``<pre>``
        text) when the field is not found.
    """
    flag_page = self._page_seller_flag
    net_check()

    loaded = False
    while not loaded:
        try:
            await flag_page.bringToFront()
            await flag_page.goto(data_url)
        except (errors.TimeoutError, errors.PageError):
            # Both failures are handled the same way: wait, then retry.
            sleep(5)
        else:
            loaded = True

    await asyncio.sleep(1)
    html = await flag_page.content()
    await asyncio.sleep(2)
    await self.page.bringToFront()

    document = pq(html)
    match = re.search('"tip":"(.*?)"}', document("pre").text())
    if match is None:
        logger.info(document("pre").text())
        return None
    return match.group(1)
def connection(**kwargs):
    """Open a MySQL connection and return it together with two cursors.

    Keyword arguments, when any are given, are used as the settings passed
    to ``pymysql.connect``; otherwise the module-level ``SQL_SETTINGS`` is
    used. An ``OperationalError`` while connecting is logged and the attempt
    is repeated after ``sleep()`` until a connection is obtained.

    Returns:
        tuple: ``(con, cursor, cursor_dict)`` where ``cursor`` is the
        connection's default cursor and ``cursor_dict`` is created with
        ``pymysql.cursors.DictCursor``.
    """
    db_settings = kwargs if kwargs else SQL_SETTINGS

    con = None
    while con is None:
        try:
            con = pymysql.connect(**db_settings)
        except OperationalError as e:
            logger.error("数据库链接异常,1分钟后尝试重连,原因:" + str(e))
            sleep()

    return con, con.cursor(), con.cursor(cursor=pymysql.cursors.DictCursor)
async def login(self):
    """Log in through the Taobao QR-code flow.

    Opens ``https://login.taobao.com`` in a page from ``self.get_page()``
    (retrying after 5 seconds on ``PageError``/``TimeoutError``), clicks the
    login switch when a ``.module-static`` element exists, saves a
    screenshot of ``#J_QRCodeImg`` to ``./qrcode.png`` and then either mails
    it (``LINUX`` truthy) or displays it with matplotlib. Finally awaits
    ``self.phone_verify``.

    Returns:
        tuple: ``(self.b, page, f)`` where ``f`` is the value returned by
        ``self.phone_verify(page)``.
    """
    page = await self.get_page()

    while True:
        try:
            await page.goto("https://login.taobao.com", timeout=30000)
        except (errors.PageError, errors.TimeoutError):
            logger.warning("网络异常5秒后重连")
            sleep(5)
        else:
            break

    if await page.J(".module-static"):
        switch = await page.J(".login-switch")
        rect = await switch.boundingBox()
        await page.mouse.click(rect['x'] + 10, rect['y'])

    # NOTE(review): the original indentation was lost; the QR capture below
    # is assumed to run whether or not ".module-static" was found — confirm.
    while True:
        try:
            await page.waitForSelector("#J_QRCodeImg")
            qr_element = await page.J("#J_QRCodeImg")
            await qr_element.screenshot({'path': './qrcode.png'})
        except errors.NetworkError:
            # Deliberately ignored: simply try the capture again.
            continue
        break

    if LINUX:
        mail_pic(pic_mail_recevier.split(","))
    else:
        logger.info("扫码登陆")
        # Read qrcode.png from the working directory and show it without axes.
        plt.imshow(mpimg.imread('qrcode.png'))
        plt.axis('off')
        plt.show()

    f = await self.phone_verify(page)
    return self.b, page, f
async def link_spider(self, p, f):
    """Fill in ``link_id`` for crawled order lines and queue price fixes.

    Repeatedly selects the newest ``tb_order_detail_spider`` row of store
    ``f`` whose ``link_id`` is "1", opens the promotion-query page for its
    order number in ``p``, and for every sub order found in the page's JSON
    writes the item id back as ``link_id``. It then inserts or updates a row
    in ``prices_tb_fix`` depending on the ``SpiderDate`` found in
    ``prices_tb``. When no row is left, ``p`` is closed and
    ``self.run_link_spider()`` is awaited.

    Args:
        p: browser page used for navigation (provides ``goto``, ``content``
            and ``close``).
        f: store identifier, interpolated into SQL and stored as
            ``fromStore``.

    NOTE(review): indentation was lost in this file; the nesting below (in
    particular which ``if`` the ``else`` belongs to, and the position of
    ``sleep(5)``) is a reconstruction — confirm against the original.
    NOTE(review): every SQL statement here is built with ``%`` formatting
    from values read from the database or from the fetched page; consider
    parameterized queries if the ``mysql`` helper supports them.
    """
    # Mutates the shared module-level settings dict so the "fix" table
    # queries below go to the "test" database.
    test_server["db"] = "test"
    while True:
        sql = """ SELECT a.id,url,goodsCode,a.orderNo FROM tb_order_detail_spider a JOIN tb_order_spider b ON a.`orderNo`=b.`orderNo` WHERE link_id="1" AND b.`fromStore`='%s' AND a.url IS NOT NULL ORDER BY b.createTime DESC LIMIT 1 """ % (f)
        url = "https://smf.taobao.com/promotionmonitor/orderPromotionQuery.htm?orderNo="
        results = mysql.get_data(sql=sql, dict_result=True)
        if not results:
            # Nothing left to resolve for this store.
            break
        orderno = results[0]['orderNo']
        url += orderno
        await p.goto(url)
        content = await p.content()
        # The page embeds its payload as a JSON object between tags.
        data = re.findall(">(\{.*?\})<", content)
        order = json.loads(data[0])
        try:
            sub_orders = order["data"]["subOrderViewDTOs"]
        except KeyError:
            # NOTE(review): the same row will be selected again on the next
            # iteration because nothing was updated — confirm this cannot loop
            # forever.
            continue
        for so in sub_orders:
            order_no = so["orderNoStr"]
            link_id = so["itemId"]
            sql = "select goodsCode from tb_order_detail_spider where url like '%%%s%%'" % (
                order_no)
            print(sql)
            goodsCode = mysql.get_data(sql=sql, return_one=True)
            del sql
            sql = "update tb_order_detail_spider set link_id='%s' where url like '%%%s%%'" % (
                link_id, order_no)
            mysql.update_data(sql=sql)
            del sql
            sql = """ SELECT SpiderDate FROM prices_tb WHERE link_id='%s' AND stockid='%s' AND flag NOT IN ('del','XiaJia') """ % (link_id, goodsCode)
            res = mysql.get_data(sql=sql)
            res_fix = mysql.get_data(db=test_server,
                                     dict_result=True,
                                     t='prices_tb_fix',
                                     c={
                                         "link_id": link_id,
                                         "server": "production_server"
                                     })
            if res:
                spider_date = res[0][0]
                days = 1
                # A zero date is compared as a string; any other value is
                # subtracted from now() to get its age in days.
                if spider_date != '0000-00-00 00:00:00':
                    days = (datetime.datetime.now() - spider_date).days
                if spider_date == '0000-00-00 00:00:00' or days > 14:
                    # Price row exists but was never crawled or is older than
                    # 14 days: queue it with flag 1.
                    if not res_fix:
                        mysql.insert_data(db=test_server,
                                          t="prices_tb_fix",
                                          d={
                                              "link_id": link_id,
                                              "fromStore": f,
                                              "flag": 1
                                          })
                    elif res_fix[0]["isComplete"] != 0:
                        mysql.update_data(db=test_server,
                                          t="prices_tb_fix",
                                          set={
                                              "isComplete": 0,
                                              "flag": 1
                                          },
                                          c={
                                              "link_id": link_id,
                                              "server": "production_server"
                                          })
            else:
                # No matching prices_tb row: queue it with flag 0.
                if not res_fix:
                    mysql.insert_data(db=test_server,
                                      t="prices_tb_fix",
                                      d={
                                          "link_id": link_id,
                                          "fromStore": f,
                                          "flag": 0
                                      })
                elif res_fix[0]["isComplete"] != 0:
                    mysql.update_data(db=test_server,
                                      t="prices_tb_fix",
                                      set={
                                          "flag": 0,
                                          "isComplete": 0
                                      },
                                      c={
                                          "link_id": link_id,
                                          "server": "production_server"
                                      })
        # NOTE(review): blocking helper inside a coroutine.
        sleep(5)
    await p.close()
    await self.run_link_spider()
async def login(self, **kwargs):
    """Log in to Taobao with a username and password.

    Opens the login page in a page from ``self.get_page(**kwargs)``, switches
    to the static (password) form when the switch link appears, types
    ``kwargs['username']`` and ``kwargs['password']``, then handles an
    optional slider check before submitting.

    Args:
        **kwargs: forwarded to ``self.get_page``; must contain ``username``
            and ``password``.

    Returns:
        tuple: ``(self.b, p, t)`` when the slider appeared and
        ``self.slider`` returned a truthy ``t``; otherwise ``(self.b, p)``.

    NOTE(review): the two return shapes differ in length — a caller that
    unpacks three values will fail on the second form; confirm intent.
    """
    p = await self.get_page(**kwargs)
    # Keep retrying navigation until the login page loads.
    while 1:
        try:
            await p.goto("https://login.taobao.com", timeout=30000)
        except errors.PageError:
            logger.warning("网络异常5秒后重连")
            sleep(5)
        except errors.TimeoutError:
            logger.warning("网络异常5秒后重连")
            sleep(5)
        else:
            break
    while True:
        try:
            await p.waitForSelector(".forget-pwd.J_Quick2Static",
                                    visible=True,
                                    timeout=10000)
            await p.click(".forget-pwd.J_Quick2Static")
        except errors.TimeoutError:
            # Switch link never became visible; try typing anyway.
            pass
        except errors.ElementHandleError:
            await p.reload()
            continue
        finally:
            # The finally block runs on every path, including the `continue`
            # above; a successful `break` here overrides that `continue`.
            try:
                await p.type('#TPL_username_1', kwargs['username'],
                             {'delay': self.input_time_random() - 50})
                await p.type('#TPL_password_1', kwargs['password'],
                             {'delay': self.input_time_random()})
            except errors.ElementHandleError:
                await p.reload()
            else:
                break
    net_check()
    # Detect whether a slider is shown by looking for its page element.
    try:
        await p.waitForSelector('#nc_1_n1z', visible=True, timeout=3000)
    except errors.TimeoutError:
        slider = 0
    else:
        slider = await p.J('#nc_1_n1z')
    if slider:
        print("出现滑块情况判定")
        t = await self.slider(p=p)
        if t:
            return self.b, p, t
        await p.click("#J_SubmitStatic")  # Simulate a click on the login button.
        # NOTE(review): time.sleep blocks the event loop inside a coroutine.
        time.sleep(2)
        await self.get_cookie(p)
    else:
        await p.click("#J_SubmitStatic")
        try:
            await p.waitForNavigation()
        except errors.TimeoutError:
            pass
    print("登录成功")
    return self.b, p
async def next_page(self, page_num=1):
    """Drive pagination of the order list in an endless loop.

    Each iteration picks a pacing value ``n_p_time`` from the current time
    relative to the boundaries returned by ``time_zone``, then either
    searches for a manually entered order number (``self.orderno`` truthy)
    or jumps to ``page_num`` through the pagination input. It then polls
    ``self.complete`` until it becomes 1 or 2, sleeps a random fraction of
    ``n_p_time``, resets ``self.complete`` and advances ``page_num``.

    Args:
        page_num: page number to jump to first.

    NOTE(review): indentation was lost in this file; the nesting below is a
    reconstruction and should be confirmed against the original.
    """
    temp = 0
    while 1:
        t = time_zone(["08:00", "18:00", "23:00"])
        a = datetime.datetime.now()
        if a < t[0]:
            if not temp:
                temp = 0
            n_p_time = 600
        elif t[0] < a < t[1]:
            # On the first iteration inside this window, restart from page 1.
            temp += 1
            if temp == 1:
                page_num = 1
            n_p_time = NEXT_PAGE_TIME
        elif a > t[2]:
            n_p_time = 60
            if not LINUX:
                subprocess.call("shutdown /s")
                exit("到点关机")
        else:
            n_p_time = 60
        await self.page.bringToFront()
        if self.orderno:
            # Clear the order-number box, then type the number read from stdin.
            await self.page.focus("#bizOrderId")
            await asyncio.sleep(1)
            await self.page.keyboard.down("ShiftLeft")
            await asyncio.sleep(1)
            await self.page.keyboard.press("Home")
            await asyncio.sleep(1)
            # NOTE(review): second `down` on the same key with no `up` in
            # between — presumably meant to release Shift; confirm.
            await self.page.keyboard.down("ShiftLeft")
            await asyncio.sleep(1)
            await self.page.keyboard.press("Delete")
            await asyncio.sleep(1)
            orderno = input(time_now() + " | 输入订单号:")
            await self.page.type("#bizOrderId", orderno)
            await self.page.setRequestInterception(True)
            # NOTE(review): handlers are registered again on every iteration.
            self.page.on('request', self.intercept_request)
            self.page.on('response', self.intercept_response)
            net_check()
            await self.page.click(".button-mod__primary___17-Uv")
            await asyncio.sleep(10)
        else:
            while 1:
                try:
                    await self.page.waitForSelector(
                        ".pagination-options-go")
                    await self.page.focus(".pagination-options input")
                    # await self.page.click(".pagination-options input", clickCount=2)
                    # Wipe whatever is in the page-number input.
                    await self.page.keyboard.press("Delete")
                    await self.page.keyboard.press("Delete")
                    await self.page.keyboard.press("Delete")
                    await self.page.keyboard.press("Backspace")
                    await self.page.keyboard.press("Backspace")
                    await self.page.keyboard.press("Backspace")
                    await self.page.setRequestInterception(True)
                    self.page.on('request', self.intercept_request)
                    self.page.on('response', self.intercept_response)
                    net_check()
                    await self.page.type(".pagination-options input",
                                         str(page_num))
                    await self.page.keyboard.press("Enter")
                    # NOTE(review): this call is not awaited, so a timeout
                    # raised by it cannot reach the except clause below —
                    # confirm whether that is intentional.
                    self.page.waitForSelector(
                        ".pagination-item.pagination-item-" + str(page_num) +
                        ".pagination-item-active",
                        timeout=10000)
                except errors.TimeoutError:
                    logger.info('翻页超时,5秒后重新翻页')
                    sleep(5)
                else:
                    break
            # await self.page.waitForSelector(".pagination-item-" + str(page_num) + " a", timeout=30000)
            # await self.page.click(".pagination-item-" + str(page_num) + " a")
        # Poll until self.complete is set to 1 or 2 elsewhere (not visible in
        # this block).
        while 1:
            if self.complete == 1:
                s = random.random()
                if s > 0.5:
                    await self.link_spider()
                    await self.order_page()
                logger.info(str(int(s * n_p_time)) + " 秒后开始下一页爬取")
                sleep(int(s * n_p_time))
                break
            elif self.complete == 2:
                # Restart from the first page (page_num is incremented below).
                page_num = 0
                s = random.random()
                if s > 0.9:
                    mysql.update_data(t="tb_order_spider",
                                      set={"isDetaildown": 0},
                                      c={
                                          "isDetaildown": 2,
                                          "fromStore": self.fromStore
                                      })
                sleep(int(s * n_p_time))
                break
            else:
                # if i == 59:
                #     logger.info("超时")
                #     await self.page.screenshot({'path': './headless-test-result.png'})
                await asyncio.sleep(3)
        self.complete = 0
        page_num += 1
async def order_page(self, browser_in=None, page_in=None):
    """Crawl order-detail pages and write the parsed fields to the database.

    Repeatedly reads ``datailURL``/``orderNo`` pairs from ``tb_order_spider``
    for ``self.fromStore`` where ``isDetaildown`` is 0 (ordered by
    ``createTime`` descending). Each detail URL is opened in
    ``self._page_order_detail``, the JSON embedded in the page is parsed, and
    the order and its sub orders are updated through ``mysql.update_data``.
    The coroutine returns once the query yields no rows.

    Args:
        browser_in: not used by this implementation; kept so existing callers
            keep working.
        page_in: not used by this implementation; kept so existing callers
            keep working.

    NOTE(review): indentation was lost in this file; the nesting is a
    reconstruction and should be confirmed against the original.
    """
    # Maps keys of the logistics tab to the order columns they populate.
    logistics_columns = {
        'logisticsName': 'shippingCompany',
        'shipType': 'shippingMethod',
        'logisticsNum': 'shippingNo',
    }
    while 1:
        result = mysql.get_data(t="tb_order_spider",
                                cn=["datailURL", "orderNo"],
                                c={
                                    "isDetaildown": 0,
                                    "fromStore": self.fromStore
                                },
                                o=["createTime"],
                                om="d")
        if not result:
            logger.info("没有可以爬取的详情")
            break

        logger.info("订单详情爬取")
        for url in result:
            start_time = datetime.datetime.now()
            logger.info(store_trans(self.fromStore))
            logger.info("开始订单 " + url[1] + " 详情爬取")
            order = {}
            await self._page_order_detail.bringToFront()
            page = self._page_order_detail

            # Retry navigation until the detail page loads.
            while 1:
                try:
                    await page.goto(url[0])
                except (errors.PageError, errors.TimeoutError):
                    sleep(5)
                else:
                    break
            try:
                await page.waitForSelector('#detail-panel', timeout=30000)
            except errors.TimeoutError:
                # Skip this order; it stays at isDetaildown 0.
                continue

            content = await page.content()
            # The page assigns its payload via JSON.parse('<escaped json>').
            a = re.search(r"var data = JSON.parse\('(.*)'\);",
                          content).group(1)
            b = a.replace('\\\\\\"', '')
            data = b.replace('\\"', '"')
            m = json.loads(data)

            main_order = m['mainOrder']
            pay_info = main_order['payInfo']
            order['actualFee'] = pay_info['actualFee']['value']
            order['orderStatus'] = status_format(
                main_order['statusInfo']['text'])
            if order['orderStatus'] == '等待买家付款':
                order['isDetaildown'] = 2
            else:
                order['isDetaildown'] = 1

            coupon = 0
            for promo in pay_info.get('promotions', []):
                # Fixed: the original `'prefix' and 'suffix' in promo` only
                # tested for 'suffix' because the literal 'prefix' is truthy.
                if 'prefix' in promo and 'suffix' in promo:
                    coupon_temp = re.search(r"(\d+\.\d+)", promo['value'])
                    if coupon_temp:
                        coupon += float(coupon_temp.group(1))
            order['couponPrice'] = round(coupon, 2)

            if 'buyMessage' in m:
                order['buyerComments'] = m['buyMessage']

            orderNo = main_order['id']
            order_info = main_order['orderInfo']['lines'][1]['content']
            for line in order_info:
                field = line['value']
                if field['name'] == '支付宝交易号:':
                    # The entry may lack a 'value' key; store None then.
                    order['tradeNo'] = field.get('value')
                elif field['name'] == '付款时间:':
                    order['payTime'] = field['value']

            for tab in m['tabs']:
                if tab['id'] != 'logistics':
                    continue
                for k, v in tab['content'].items():
                    if k in logistics_columns:
                        order[logistics_columns[k]] = v
                    elif k == 'address':
                        # Comma-separated: name, phone, then the address.
                        parts = v.split(",")
                        order['receiverName'] = parts[0].replace(" ", "")
                        order['receiverPhone'] = parts[1]
                        order['receiverAddress'] = "".join(parts[2:])

            for itemNo, sub_order in enumerate(main_order['subOrders']):
                benefits = 0
                if sub_order['promotionInfo']:
                    for promotion in sub_order['promotionInfo']:
                        for entry in promotion['content']:
                            if 'value' in entry:
                                p_list = re.findall(r"-?\d+\.\d+",
                                                    entry['value'])
                                if p_list:
                                    # Only the last number of each entry counts.
                                    benefits += float(p_list.pop())
                item = {'unitBenefits': benefits}
                mysql.update_data(t="tb_order_detail_spider",
                                  set=item,
                                  c={
                                      'orderNo': orderNo,
                                      'itemNo': itemNo
                                  })
                logger.info("详细订单状态更新成功")

            mysql.update_data(t="tb_order_spider",
                              set=order,
                              c={'orderNo': orderNo})
            logger.info("订单状态更新成功")
            await self.page.bringToFront()
            Verify()
            end_time = datetime.datetime.now()
            spend_time = end_time - start_time
            logger.info(
                str(spend_time.seconds) + " 秒完成订单 " + url[1] + " 详情爬取")

            # Rest for a random share (above 30%) of n_o_time seconds, yielding
            # to the event loop once per second.
            while True:
                s = random.random()
                if s > 0.3:
                    logger.info("休息 " + str(int(s * n_o_time)) +
                                " 秒完开始下一单详情爬取")
                    for _ in range(int(s * n_o_time)):
                        await asyncio.sleep(1)
                    break
def delete_data(**kwargs):
    """Delete rows from a MySQL table.

    Keyword Args:
        help: when present (any value), print the usage text and return.
        sql (str): complete statement to execute; takes precedence over the
            statement built from ``t`` and ``c``.
        t (str): name of the table to delete from.
        c (dict): row conditions, rendered with ``concat(c, " and ")``.
        db (dict): connection settings forwarded to ``connection``.
        only_sql (bool): when true, print the statement instead of running it.

    Returns:
        None.

    Raises:
        AssertionError: an argument has the wrong type.
        ValueError: ``sql`` was not given and ``t`` or ``c`` is missing
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if "help" in kwargs:
        print("""delete_data()帮助文档:
    :参数 help: 输出帮助文档
    :参数 sql: 直接使用传入的sql语句
    :参数 t: 需要删除数据的表名,字符串
    :参数 c: 删除数据的条件,字典类型
    :参数 db: 数据库连接配置,字典类型
    :参数 only_sql: 布尔类型,不执行sql只输出函数转化的sql语句,默认为False
    :返回值: 无""")
        return None

    table_name = None
    if "t" in kwargs:
        assert isinstance(kwargs["t"], str), "t的数据类型必需是字符串,并且不能为空"
        table_name = kwargs["t"]

    condition = None
    if "c" in kwargs:
        assert isinstance(kwargs["c"], dict), "c的数据类型必需是字典"
        # Called outside any `except KeyError` so that a KeyError raised by
        # the helper itself is no longer silently swallowed.
        condition = concat(kwargs["c"], " and ")

    if "sql" in kwargs:
        assert isinstance(kwargs["sql"], str), "sql的数据类型必需是字符串"
        sql = kwargs["sql"]
    else:
        missing = [name for name in ("t", "c") if name not in kwargs]
        if missing:
            raise ValueError(
                "delete_data() requires 'sql', or both 't' and 'c'; missing: "
                + ", ".join(missing))
        sql = "delete from %s where %s" % (table_name, condition)

    db = kwargs.get("db", {})
    assert isinstance(db, dict), "db的数据类型必需是字典(dict)"

    if "only_sql" in kwargs:
        assert isinstance(kwargs["only_sql"], bool), "only_sql的数据类型必需是bool类型"
        if kwargs["only_sql"]:
            print(sql)
            return None

    con, cursor, _ = connection(**db)
    try:
        while True:
            try:
                con.ping(reconnect=True)
                cursor.execute(sql)
            except OperationalError as e:
                # Connection-level failure: wait and try again.
                logger.error("数据库链接异常,5秒后尝试重连,原因:" + str(e))
                sleep(5)
            except Error as e:
                logger.error("异常报错的sql语句:" + sql)
                logger.error("异常内容:" + str(e) + "|异常类型:" + str(type(e)))
                con.rollback()
                break
            else:
                con.commit()
                break
    finally:
        # Close on every exit path, including unexpected exceptions.
        con.close()
def insert_data(**kwargs):
    """Insert one row into a MySQL table.

    Keyword Args:
        help: when present (any value), print the usage text and return.
        sql (str): complete statement to execute; takes precedence over the
            statement built from ``t`` and ``d``.
        t (str): name of the table to insert into.
        d (dict): column name -> value; every value is converted with
            ``str`` and wrapped in single quotes.
        db (dict): connection settings forwarded to ``connection``.
        only_sql (bool): when true, print the statement instead of running it.

    Returns:
        None.

    Raises:
        AssertionError: an argument has the wrong type.
        ValueError: ``sql`` was not given and ``t`` or ``d`` is missing
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if "help" in kwargs:
        print("""insert_data()帮助文档:
    :参数 help: 输出帮助文档
    :参数 sql: 直接使用传入的sql语句
    :参数 t: 需要写入的表名,字符串
    :参数 d: 需要写入的数据,字典类型
    :参数 db: 数据库连接配置,字典类型
    :参数 only_sql: 布尔类型,不执行sql只输出函数转化的sql语句,默认为False
    :返回值: 无""")
        return None

    table_name = None
    if "t" in kwargs:
        assert isinstance(kwargs["t"], str), "t的数据类型必需是字符串,并且不能为空"
        table_name = kwargs["t"]

    keys = values = None
    if "d" in kwargs:
        assert isinstance(kwargs["d"], dict), "d的数据类型必需是字典类型"
        keys = ",".join(kwargs["d"])
        # NOTE(review): values are quoted naively; a value containing a single
        # quote yields a broken statement. Left unchanged because callers may
        # depend on the exact SQL text — consider parameterized execution.
        values = "'" + "','".join(str(v) for v in kwargs["d"].values()) + "'"

    if "sql" in kwargs:
        assert isinstance(kwargs["sql"], str), "sql的数据类型必需是字符串"
        sql = kwargs["sql"]
    else:
        missing = [name for name in ("t", "d") if name not in kwargs]
        if missing:
            raise ValueError(
                "insert_data() requires 'sql', or both 't' and 'd'; missing: "
                + ", ".join(missing))
        sql = "insert into %s(%s) values(%s)" % (table_name, keys, values)

    db = kwargs.get("db", {})
    assert isinstance(db, dict), "db的数据类型必需是字典(dict)"

    if "only_sql" in kwargs:
        assert isinstance(kwargs["only_sql"], bool), "only_sql的数据类型必需是bool类型"
        if kwargs["only_sql"]:
            print(sql)
            return None

    con, cursor, _ = connection(**db)
    try:
        while True:
            try:
                con.ping(reconnect=True)
                cursor.execute(sql)
            except OperationalError as e:
                # Connection-level failure: wait and try again.
                logger.error("数据库链接异常,5秒后尝试重连,原因:" + str(e))
                sleep(5)
            except Error as e:
                logger.error("异常报错的sql语句:" + sql)
                logger.error("异常内容:" + str(e) + "|异常类型:" + str(type(e)))
                con.rollback()
                break
            else:
                con.commit()
                break
    finally:
        # Close on every exit path, including unexpected exceptions.
        con.close()
def get_data(**kwargs):
    """Run a SELECT statement and return its rows.

    Keyword Args:
        help: when present (any value), return the usage text.
        sql (str): complete statement to execute; takes precedence over the
            statement built from the other arguments.
        t (str): table name.
        l (int): row limit; defaults to 100.
        o (list): columns to order by.
        om (str): ``"a"`` (ascending, default) or ``"d"`` (descending); only
            used together with ``o``.
        g (list): columns to group by.
        cn (list): columns to select; defaults to ``*``.
        c (dict): row conditions, rendered with ``concat(c, "and ")``.
        db (dict): connection settings forwarded to ``connection``.
        only_sql (bool): when true, print the statement instead of running it.
        dict_result (bool): fetch rows through the dict cursor.
        return_one (bool): return only the first column of the first row, or
            ``None`` when there are no rows.

    Returns:
        The usage text (``help``), ``None`` (``only_sql`` or a database
        error), a single value (``return_one``), or the fetched rows.

    Raises:
        AssertionError: an argument has the wrong type or value.
        ValueError: neither ``sql`` nor ``t`` was given (previously this
            surfaced as an ``UnboundLocalError``).
    """
    if "help" in kwargs:
        return """get_data()帮助文档:
    :参数 help: 输出帮助文档
    :参数 sql: 直接使用传入的sql语句
    :参数 t: 需要查询的表名,字符串
    :参数 l: 查询输出结果的条数,整数类型
    :参数 o: 对结果集进行排序,列表类型
    :参数 om: 排序的方式,默认是升序,默认值 a ,可选值 d
    :参数 g: 根据一个或多个列对结果集进行分组,列表类型
    :参数 db: 数据库连接配置,字典类型
    :参数 cn: 输出结果集的列名,列表类型
    :参数 c: 查询条件,字典类型
    :参数 only_sql: 布尔类型,不返回查询结果只返回函数转化的sql语句,默认为False
    :参数 dict_result: 布尔类型,将返回字典类型的查询结果,默认为False
    :参数 return_one: 布尔类型,只返回第一个值,如果没有反回空
    :返回值: 返回元组类型的查询结果"""

    table_name = None
    if "t" in kwargs:
        assert isinstance(kwargs["t"], str), "t的数据类型必需是字符串,并且不能为空"
        table_name = kwargs["t"]

    limit_str = " limit 100"
    if "l" in kwargs:
        # bool is an int subclass; reject it explicitly as the original did.
        assert isinstance(kwargs["l"], int) and not isinstance(
            kwargs["l"], bool), "l的数据类型必需是整形(int)"
        limit_str = " limit " + str(kwargs["l"])

    order_by_str = ""
    if "o" in kwargs:
        assert isinstance(kwargs["o"], list), "o的数据类型必需是列表"
        order_by_str = " order by " + ",".join(kwargs["o"])
        direction = kwargs.get("om", "a")
        assert direction == "d" or direction == "a", "om的参数值必需是d或a,默认值a"
        order_by_str += " desc" if direction == "d" else " asc"

    group_by_str = ""
    if "g" in kwargs:
        assert isinstance(kwargs["g"], list), "g的数据类型必需是列表"
        group_by_str = " group by " + ",".join(kwargs["g"])

    column_name_str = "*"
    if "cn" in kwargs:
        assert isinstance(kwargs["cn"], list), "cn的数据类型必需是列表"
        column_name_str = ",".join(kwargs["cn"])

    condition = ""
    if "c" in kwargs:
        assert isinstance(kwargs["c"], dict), "c的数据类型必需是字典(dict)"
        condition = " where " + concat(kwargs["c"], "and ")

    if "sql" in kwargs:
        assert isinstance(kwargs["sql"], str), "sql的数据类型必需是字符串(str)"
        sql = kwargs["sql"]
    else:
        if table_name is None:
            raise ValueError("get_data() requires either 'sql' or 't'")
        sql = "select %s from %s%s%s%s%s" % (column_name_str, table_name,
                                             condition, group_by_str,
                                             order_by_str, limit_str)

    db = kwargs.get("db", {})
    assert isinstance(db, dict), "db的数据类型必需是字典(dict)"

    if "only_sql" in kwargs:
        assert isinstance(kwargs["only_sql"], bool), "only_sql的数据类型必需是bool类型"
        if kwargs["only_sql"]:
            print(sql)
            return None

    # Validate the remaining flags before a connection is opened so a bad
    # argument can no longer leave an open connection behind.
    use_dict = kwargs.get("dict_result", False)
    assert isinstance(use_dict, bool), "dict_result的数据类型必需是bool类型"
    return_one = kwargs.get("return_one", False)
    assert isinstance(return_one, bool), "return_one的数据类型必需是bool类型"

    con, cursor, cursor_dict = connection(**db)
    active_cursor = cursor_dict if use_dict else cursor
    try:
        while True:
            try:
                con.ping(reconnect=True)
                active_cursor.execute(sql)
            except OperationalError as e:
                # Connection-level failure: wait and try again.
                logger.error("数据库链接异常,5秒后尝试重连,原因:" + str(e))
                sleep(5)
            except Error as e:
                logger.error("异常报错的sql语句:" + sql)
                logger.error("异常内容:" + str(e) + "|异常类型:" + str(type(e)))
                return None
            else:
                result = active_cursor.fetchall()
                break
    finally:
        # Previously the error path returned without closing the connection.
        con.close()

    if not return_one:
        return result
    if not result:
        return None
    try:
        return result[0][0]
    except KeyError:
        # Dict rows have no positional index; as before, the complete result
        # set is returned in that case.
        return result
t1, t2, t3 = 1, 0, 0 elif t[1] < now < t[2] and t2 == 0: report_mail() t1, t2, t3 = 0, 1, 0 elif now > t[2] and t3 == 0: report_mail() t1, t2, t3 = 0, 0, 1 sql = """ SELECT fromStore FROM prices_tb_fix WHERE isComplete='0' GROUP BY fromStore ORDER BY COUNT(link_id) DESC """ ts = test_server.copy() ts['db'] = 'test' res = mysql.get_data(db=ts, sql=sql) if res: b, p, f = loop.run_until_complete( ss.login(**STORE_INFO[res[0][0]])) ptb = PriceTaoBao(ss, b, p, f) loop.run_until_complete(ptb.run()) loop.run_until_complete(p.close()) if len(res) == 1: loop.run_until_complete(b.close()) ss.b = None else: sleep(10) else: b, p, f = loop.run_until_complete(l.login()) ptb = PriceTaoBao(l, b, p, f) loop.run_until_complete(ptb.run())