def Verify():
    """Reconcile each unverified order's amounts against its detail lines.

    Reads every ``tb_order_spider`` row with ``isVerify = '0'`` and
    ``isDetaildown = '1'``, recomputes the order total from the matching
    ``tb_order_detail_spider`` rows and marks the order ``isVerify = 2``
    (amounts do not add up) or ``isVerify = 1`` (amounts add up, or the
    order status is '交易关闭').  Mismatching orders are collected into a
    pipe-separated report that is mailed out.  ``taobao_check()`` is called
    last, whether or not anything was reported.
    """
    report_lines = []
    pending_orders = mysql.get_data(
        t="tb_order_spider",
        cn=[
            'orderNo', 'deliverFee', 'actualFee', 'couponPrice', 'fromStore',
            'orderStatus'
        ],
        c={'isVerify': '0', 'isDetaildown': '1'})

    for order_row in (pending_orders or ()):
        order_no = order_row[0]
        deliver_fee = order_row[1]
        actual_fee = order_row[2]
        coupon_price = order_row[3]
        from_store = order_row[4]

        detail_rows = mysql.get_data(
            t="tb_order_detail_spider",
            cn=['unitPrice', 'sellNum', 'unitBenefits'],
            c={'orderNo': order_no})

        # Sum of (unit price * quantity - per-line benefit) over all lines.
        goods_total = 0
        for unit_price, sell_num, unit_benefits in detail_rows:
            goods_total = goods_total + unit_price * sell_num - unit_benefits

        difference = (round(goods_total, 3) + deliver_fee - actual_fee
                      - coupon_price)

        # Differences that truncate to zero, and closed orders, count as
        # verified.
        if int(difference) == 0 or order_row[5] == '交易关闭':
            mysql.update_data(t="tb_order_spider",
                              set={'isVerify': 1},
                              c={'orderNo': order_no})
            continue

        report_lines.append("|".join([
            str(round(goods_total, 2)),
            str(deliver_fee),
            str(actual_fee),
            str(coupon_price),
            str(round(difference, 2)),
            store_trans(from_store),
            order_no,
        ]))
        mysql.update_data(t="tb_order_spider",
                          set={'isVerify': 2},
                          c={'orderNo': order_no})

    if report_lines:
        mail("数据异常报告", "\n".join(report_lines), ["*****@*****.**"])
    taobao_check()
def reports_in(self, fromStore, price):
    """Add one order of value *price* to today's crawler report for a store.

    Looks up the ``spider_reports`` row identified by report type, translated
    store name and today's date.  If it exists, its ``count`` is incremented
    and *price* is added to its ``price``; otherwise a new row is inserted
    with ``count = 1`` and ``price = price``.

    Args:
        fromStore: Store code, translated through ``store_trans``.
        price: Amount to add to the report's running total.
    """
    table = "spider_reports"
    reports = {
        'reports_type': '订单爬虫报告',
        'store_name': store_trans(fromStore),
        'reports_date': datetime.date.today(),
    }
    # Snapshot of the identifying columns, taken before count/price are added.
    row_key = dict(reports)

    existing = self.sql_element.select_data(table, 1, 'count', 'price',
                                            **reports)
    if not existing:
        reports['count'] = 1
        reports['price'] = price
        self.sql_element.insert_new_data(table, **reports)
        return

    first_row = existing[0]
    reports['count'] = first_row[0] + 1
    reports['price'] = first_row[1] + price
    self.sql_element.update_old_data(table, reports, row_key)
def split_store(self, item):
    """Print the report header for the store named in ``item['fromStore']``.

    Only store ``'YK'`` produces output: the translated store name followed
    by the first three entries of ``self.title_list``, each on its own line
    with a trailing colon.  The store codes 'KY', 'SC', 'VP', 'YJ' and 'TB'
    (and any other value) produce no output.

    Args:
        item: Mapping that has a ``'fromStore'`` key.
    """
    if item['fromStore'] != 'YK':
        return

    pieces = [store_trans('YK') + "\n"]
    for section in range(3):
        pieces.append(self.title_list[section] + ':\n')
        # Per-entry output is not implemented; the walk over the section's
        # totals is kept so the same attributes are still touched.
        for _ in range(len(self.total_list[section])):
            pass
    print("".join(pieces))
async def order_page(self, browser_in=None, page_in=None):
    """Crawl the detail page of every order whose details are still pending.

    Repeatedly queries ``tb_order_spider`` for this store's rows with
    ``isDetaildown = 0``.  For each row the detail URL is opened in
    ``self._page_order_detail``, the JSON embedded in the page is parsed,
    and the extracted fields are written back to ``tb_order_spider`` (order
    level) and ``tb_order_detail_spider`` (per-line benefits).  ``Verify()``
    runs after each order, followed by a random pause.  The method returns
    once the query yields no rows.

    Args:
        browser_in: Not used by the current implementation; kept so existing
            callers keep working.
        page_in: Not used by the current implementation; kept so existing
            callers keep working.
    """
    # Maps keys of the logistics tab onto the order columns they fill.
    shipping_columns = {
        'logisticsName': 'shippingCompany',
        'shipType': 'shippingMethod',
        'logisticsNum': 'shippingNo',
    }

    while 1:
        result = mysql.get_data(t="tb_order_spider",
                                cn=["datailURL", "orderNo"],
                                c={
                                    "isDetaildown": 0,
                                    "fromStore": self.fromStore
                                },
                                o=["createTime"],
                                om="d")
        if not result:
            logger.info("没有可以爬取的详情")
            break

        logger.info("订单详情爬取")
        for url in result:
            start_time = datetime.datetime.now()
            logger.info(store_trans(self.fromStore))
            logger.info("开始订单 " + url[1] + " 详情爬取")
            order = {}
            await self._page_order_detail.bringToFront()
            page = self._page_order_detail

            # Keep retrying the navigation until it succeeds.  The wait must
            # be awaited: a blocking sleep would stall the whole event loop.
            while 1:
                try:
                    await page.goto(url[0])
                except (errors.PageError, errors.TimeoutError):
                    await asyncio.sleep(5)
                else:
                    break

            try:
                await page.waitForSelector('#detail-panel', timeout=30000)
            except errors.TimeoutError:
                # Detail panel never appeared: skip this order for now.
                continue

            content = await page.content()
            # The page embeds its data as an escaped JSON string literal.
            raw = re.search(r"var data = JSON.parse\('(.*)'\);",
                            content).group(1)
            data = raw.replace('\\\\\\"', '').replace('\\"', '"')
            m = json.loads(data)
            main_order = m['mainOrder']
            pay_info = main_order['payInfo']

            order['actualFee'] = pay_info['actualFee']['value']
            order['orderStatus'] = status_format(
                main_order['statusInfo']['text'])
            if order['orderStatus'] == '等待买家付款':
                order['isDetaildown'] = 2
            else:
                order['isDetaildown'] = 1

            coupon = 0
            for promotion in pay_info.get('promotions') or ():
                # Both keys are required.  (`'prefix' and 'suffix' in x`
                # would only test for 'suffix'.)
                if 'prefix' in promotion and 'suffix' in promotion:
                    amount = re.search(r"(\d+\.\d+)", promotion['value'])
                    if amount:
                        coupon += float(amount.group(1))
            order['couponPrice'] = round(coupon, 2)

            if 'buyMessage' in m:
                order['buyerComments'] = m['buyMessage']

            orderNo = main_order['id']
            for line in main_order['orderInfo']['lines'][1]['content']:
                field = line['value']
                if field['name'] == '支付宝交易号:':
                    # The trade number may be absent; store None then.
                    order['tradeNo'] = field.get('value')
                elif field['name'] == '付款时间:':
                    order['payTime'] = field['value']

            for tab in m['tabs']:
                if tab['id'] != 'logistics':
                    continue
                for key, value in tab['content'].items():
                    if key in shipping_columns:
                        order[shipping_columns[key]] = value
                    elif key == 'address':
                        # "<name>,<phone>,<address parts...>"
                        parts = value.split(",")
                        order['receiverName'] = parts[0].replace(" ", "")
                        order['receiverPhone'] = parts[1]
                        order['receiverAddress'] = "".join(parts[2:])

            for itemNo, sub_order in enumerate(main_order['subOrders']):
                benefits = 0
                for promotion in sub_order['promotionInfo'] or ():
                    for entry in promotion['content']:
                        if 'value' not in entry:
                            continue
                        amounts = re.findall(r"-?\d+\.\d+", entry['value'])
                        if amounts:
                            # Only the last amount in the text is counted.
                            benefits += float(amounts[-1])
                mysql.update_data(t="tb_order_detail_spider",
                                  set={'unitBenefits': benefits},
                                  c={
                                      'orderNo': orderNo,
                                      'itemNo': itemNo
                                  })
                logger.info("详细订单状态更新成功")

            mysql.update_data(t="tb_order_spider",
                              set=order,
                              c={'orderNo': orderNo})
            logger.info("订单状态更新成功")
            await self.page.bringToFront()
            Verify()

            spend_time = datetime.datetime.now() - start_time
            logger.info(
                str(spend_time.seconds) + " 秒完成订单 " + url[1] + " 详情爬取")

            # Random pause before the next order; draws of 0.3 or less are
            # rejected so the pause is never too short.
            s = random.random()
            while s <= 0.3:
                s = random.random()
            pause = int(s * n_o_time)
            logger.info("休息 " + str(pause) + " 秒完开始下一单详情爬取")
            for _ in range(pause):
                await asyncio.sleep(1)
async def parse(self, mainOrders, pageNum):
    """Parse one page of scraped orders and store them.

    For every entry of *mainOrders* an order record and one record per
    sub-order line are built and passed to ``self.save_in_sql`` (tables
    ``tb_order_spider`` and ``tb_order_detail_spider``).  Lines whose
    operations mark them as cancelled are not saved; an already stored copy
    is deleted and the following line numbers are shifted down.

    On exit ``self.complete`` is 2 when an order older than the look-back
    limit was reached (the crawl round is finished), otherwise 1.  When
    ``self.orderno`` is truthy the method returns right after the first
    order has been saved, without touching ``self.complete``.

    Args:
        mainOrders: Sequence of order dicts as found in the scraped page
            data.
        pageNum: Number of the page being parsed; page 1 records the start
            time of the crawl round.
    """
    # Pick the look-back window (in days, see timedelta below) from the
    # current time of day.
    # NOTE(review): assumes time_zone() returns datetimes comparable with
    # datetime.now() - confirm.
    t = time_zone(["08:00", "18:00", "23:59"])
    a = datetime.datetime.now()
    if a < t[0]:
        eoc = EARLIEST_ORDER_CREATETIME
    elif t[0] < a < t[1]:
        eoc = 2
    else:
        eoc = 20
    start_time = datetime.datetime.now()
    logger.info("开始第 " + str(pageNum) + " 页订单爬取")
    logger.info(store_trans(self.fromStore))
    if pageNum == 1:
        self._loop_start_time = datetime.datetime.now()
    # Set to 1 once an order older than the look-back limit is seen.
    loop_control = 0
    for i in range(len(mainOrders)):
        order = {}  # holds the order-level fields
        order['orderNo'] = mainOrders[i]["id"]
        order['createTime'] = mainOrders[i]['orderInfo']['createTime']
        order['buyerName'] = mainOrders[i]['buyer']['nick']
        flag = mainOrders[i]['extra']['sellerFlag']
        order['actualFee'] = mainOrders[i]['payInfo']['actualFee']
        # Raises AttributeError when postType does not contain the postage
        # pattern.
        order['deliverFee'] = re.search(
            "\(含快递:¥(\d+\.\d+)\)",
            mainOrders[i]['payInfo']['postType']).group(1)
        order['datailURL'] = "https:" + mainOrders[i]['statusInfo'][
            'operations'][0]['url']
        order['orderStatus'] = mainOrders[i]['statusInfo']['text']
        order['fromStore'] = self.fromStore
        order['updateTime'] = time_now()
        # The seller's flag text is fetched separately, only when flagged.
        if flag == 1:
            data_url = self.base_url + mainOrders[i]['operations'][0][
                'dataUrl']
            order['sellerFlag'] = await self.get_flag_text(data_url)
        try:
            order['isPhoneOrder'] = mainOrders[i]['payInfo']['icons'][0][
                'linkTitle']
        except KeyError:
            pass
        items = mainOrders[i]['subOrders']
        # Line number given to the next line that is kept; cancelled lines
        # do not consume a number.
        line_no = 0
        for j in range(len(items)):
            continue_code = 0
            item = {}  # holds the fields of one sold line
            item['orderNo'] = mainOrders[i]["id"]
            item['itemNo'] = line_no
            try:
                item['goodsCode'] = items[j]['itemInfo']['extra'][0][
                    'value']
            except KeyError:
                item['goodsCode'] = 'error'
                logger.error(time_now() + " 订单:" + item['orderNo'])
            # NOTE(review): as written, most of these replace() calls map a
            # string to itself; presumably the search strings were HTML
            # entities or full-width characters originally - confirm.
            item['tbName'] = items[j]['itemInfo']['title'].strip() \
                .replace("±", "±").replace("Φ", "Φ").replace("Ω", "Ω") \
                .replace("—", "—").replace("°", "°").replace("×", "×") \
                .replace("μ", "μ").replace(" ", "").replace("(", "(").replace(")", ")")
            item['unitPrice'] = items[j]['priceInfo']['realTotal']
            item['sellNum'] = items[j]['quantity']
            item['orderStatus'] = order['orderStatus']
            if self.orderno:
                logger.info(item['orderStatus'])
            item['refundStatus'] = None
            item['isRefund'] = 0
            item['goodsAttribute'] = ""
            item['url'] = "https:" + items[j]['itemInfo']['itemUrl']
            try:
                goodsAttributes = items[j]['itemInfo']['skuText']
            except KeyError:
                pass
            else:
                temp = []
                for k in range(len(goodsAttributes)):
                    try:
                        goodsAttributes[k]['name']
                    except KeyError:
                        # Entry without a name: its value is appended to the
                        # previous attribute value.
                        n = len(temp)
                        temp[n - 1] += goodsAttributes[k]['value'].replace(
                            "&Omega", "Ω").replace("·", "·")
                    else:
                        temp.append(goodsAttributes[k]['value'].replace(
                            "&Omega", "Ω").replace("·", "·"))
                temp_ga = "-".join(temp)
                item['goodsAttribute'] = temp_ga.replace("(", "(").replace(
                    ")", ")")
            try:
                operations = items[j]['operations']
            except KeyError:
                pass
            else:
                for x in range(len(operations)):
                    # From here on `t` holds an operation style, no longer
                    # the time boundaries from the top of the method.
                    t = operations[x]['style']
                    if t in ['t12', 't16'
                             ] and operations[x]['text'] != "退运保险":
                        item['refundStatus'] = operations[x]['text']
                        item['isRefund'] = "1"
                    elif t == 't0' and operations[x]['text'] == '已取消':
                        # Cancelled line: do not save it, and remove a copy
                        # stored by an earlier crawl.
                        continue_code = 1
                        delete_item = {
                            'orderNo': item['orderNo'],
                            'itemNo': item['itemNo'],
                            'goodsCode': item['goodsCode']
                        }
                        is_exist = mysql.get_data(
                            t="tb_order_detail_spider",
                            l=1,
                            c=delete_item)
                        if is_exist:
                            mysql.delete_data(t="tb_order_detail_spider",
                                              c=delete_item)
                            # Close the gap left by the deleted row.
                            # NOTE(review): the SQL is built by string
                            # formatting from scraped values - confirm
                            # whether the mysql helper supports parameters.
                            sql = """ UPDATE tb_order_detail_spider SET itemNo=itemNo-1 WHERE OrderNo='%s' and itemNo>'%s' """ % (item['orderNo'], item['itemNo'])
                            mysql.update_data(sql=sql)
                            pass
            if continue_code:
                continue
            else:
                line_no += 1
            self.save_in_sql(item=item, tableName='tb_order_detail_spider')
        self.save_in_sql(item=order, tableName='tb_order_spider')
        if self.orderno:
            logger.info("定向爬取订单完成")
            return
        # Stop the round at the first order created before the look-back
        # limit (midnight of `eoc` days ago).
        # NOTE(review): this compares strings - presumably createTime is in
        # the same 'YYYY-MM-DD HH:MM:SS' format; confirm.
        date = datetime.date.today()
        date_limit = (
            date - datetime.timedelta(eoc)).strftime("%Y-%m-%d %H:%M:%S")
        if order['createTime'] < date_limit:
            logger.info("完成本轮爬取,共翻 " + str(pageNum) + " 页。")
            loop_control = 1
            break
    end_time = datetime.datetime.now()
    spend_time = end_time - start_time
    logger.info(
        str(spend_time.seconds) + " 秒完成第 " + str(pageNum)
        + " 页订单爬取")
    if loop_control:
        self._loop_end_time = datetime.datetime.now()
        loop_spend_time = round(
            (self._loop_end_time - self._loop_start_time).seconds / 60, 0)
        logger.info(str(loop_spend_time) + " 分钟完成本轮订单爬取")
        self.complete = 2
    else:
        self.complete = 1
async def parse_2(self, data):
    """Sync the SKU codes of one product page into ``prices_tb``.

    For every entry of ``data['skuOuterIdTable']['dataSource']`` that has a
    non-empty ``skuOuterId`` the matching ``prices_tb`` row (same stock id,
    link id and shop id) is updated from ``self.item``, or inserted when it
    does not exist yet.  Afterwards every row of this link whose stock id
    was not seen on the page is flagged ``'del'``.  New matches, repeated
    codes, pages without any code, and deletions are appended to text files
    under ``reports/``.  ``self.complete`` is set to 1 at the end.

    Args:
        data: Parsed page data holding ``skuOuterIdTable.dataSource``, a
            sequence of dicts with ``skuOuterId`` and ``skuId``.
    """

    def append_report(kind, text):
        # One append-only report file per store and report kind.
        with open("reports/report_" + self.fromStore + "_" + kind + ".txt",
                  "a") as file:
            file.write(text)

    verify = []  # distinct non-empty stock ids seen on this page
    repeat_list = []  # stock ids that occur on more than one SKU
    for sku in data['skuOuterIdTable']['dataSource']:
        stockid = sku['skuOuterId']
        self.item['stockid'] = stockid
        logger.info(stockid)
        if not stockid:
            continue
        if stockid not in verify:
            verify.append(stockid)
        elif stockid not in repeat_list:
            repeat_list.append(stockid)

        skuId = str(sku['skuId'])
        # The lookup yields None for an unknown SKU; treat that like an
        # empty attribute instead of failing on None.replace().
        # NOTE(review): as written both replace() calls map a string to
        # itself; presumably they normalised full-width brackets - confirm.
        attribute = (self.prop.get(skuId) or "").replace("(", "(").replace(
            ")", ")")
        if attribute:
            self.item['attribute'] = attribute
        else:
            # No attribute for this SKU: make sure no stale value is sent.
            self.item.pop('attribute', None)

        self.item['price_tb'] = self.prices.get(skuId)
        if self.promo_price:
            self.item["promotionprice"] = self.promo_price.get(skuId)
        else:
            self.item["promotionprice"] = 0

        condition = {
            "stockid": stockid,
            "link_id": self.item['link_id'],
            "shop_id": self.item['shop_id'],
        }
        res = mysql.get_data(t="prices_tb",
                             l=1,
                             cn=["price_tb"],
                             c=condition,
                             db=self.target_server)
        if res:
            # Ratio of the new price to the stored one; a stored price of 0
            # would divide by zero, so the ratio is pinned to 1.
            stored_price = res[0][0]
            if stored_price == 0:
                self.item['ratio'] = 1
            else:
                self.item['ratio'] = round(
                    float(self.item['price_tb']) / float(stored_price), 2)
            print(self.item)
            mysql.update_data(t="prices_tb",
                              set=self.item,
                              c=condition,
                              db=self.target_server)
            continue

        insert_item = dict(self.item)
        insert_item.update({
            "currabrev": "CNY",
            "price_erp": 0,
            "operator": self.operator,
            "last_time": time_now(),
            "flag": "create" if self.operator == "爬虫维护" else "add",
            "ratio": 1,
            "package_number": 1,
            "Checker": "",
            "CheckDate": "0000-00-00 00:00:00",
        })
        print(insert_item)
        append_report(
            "insert",
            "物料编码:" + insert_item['stockid'] + " 与商品ID:" +
            insert_item['link_id'] + " 为最新匹配,添加至ERP系统。\n" +
            self.item_url + insert_item['link_id'] + "\n" +
            self.item_erp_url + insert_item['link_id'] + "\n\n")
        mysql.insert_data(t="prices_tb",
                          d=insert_item,
                          db=self.target_server)

    link_id = self.item['link_id']
    if repeat_list:
        append_report(
            "repeat",
            "店铺:" + store_trans(self.fromStore) + ",商品id:" + link_id +
            " 重复编码\n" + "重复编码:" + ",".join(repeat_list) + "\n" +
            self.item_url + link_id + "\n\n")
    if not verify:
        append_report(
            "empty",
            "店铺:" + store_trans(self.fromStore) + ",商品id:" + link_id +
            " 空编码\n" + self.item_url + link_id + "\n\n")

    # NOTE(review): the SQL is built by string formatting from a scraped
    # value - confirm whether the mysql helper supports parameters.
    sql = """ select id,stockid from prices_tb where link_id='%s' and flag not in('del','XiaJia') """ % (link_id)
    res_verify = mysql.get_data(sql=sql, db=self.target_server)
    # Guard against an empty/None result, like the other queries above.
    for rv in res_verify or ():
        if rv[1] in verify:
            continue
        # Stored code no longer appears on the page: flag the row deleted.
        append_report(
            "delete",
            "物料编码:" + rv[1] + " 与 商品ID:" + link_id +
            " 不匹配,已被爬虫从ERP系统中删除。\n" + self.item_url + link_id +
            "\n" + self.item_erp_url + link_id + "\n\n")
        mysql.update_data(t="prices_tb",
                          c={"id": rv[0]},
                          db=self.target_server,
                          set={
                              "flag": "del",
                              "operator": self.operator,
                              "last_time": time_now()
                          })
    self.complete = 1