示例#1
0
async def taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider, from_store, link_id_spider,
                 list_page_spider):
    """Run the crawl cycle for one store until a spider asks to stop, then close the browser.

    Every round fetches one order-list page and then runs, in order, the
    link-id, manager-page, detail-page and delayed-order spiders.

    Handling of the value returned by ``list_page_spider.get_page(page_num)``:
        * ``1``      -- move on to the next list page.
        * ``2``      -- in ``tb_order_spider``, reset ``isDetaildown`` and
                        ``isVerify`` from 2 back to 0 for ``from_store`` and
                        restart from page 1.
        * ``'exit'`` -- leave the loop.

    The loop also ends when ``delay_order_spider.get_page()`` returns
    ``'exit'`` or when any ``Exception`` is raised (it is logged, not
    re-raised).

    ``browser.close()`` runs in a ``finally`` block so the browser is also
    released when the coroutine is cancelled or interrupted, which
    ``except Exception`` does not catch.
    """
    page_num = 1
    try:
        while True:
            try:
                completed = await list_page_spider.get_page(page_num)
                if completed == 1:
                    page_num += 1
                elif completed == 2:
                    MySql.cls_update(t="tb_order_spider", set={"isDetaildown": 0},
                                     c={"isDetaildown": 2, "fromStore": from_store})
                    MySql.cls_update(t="tb_order_spider", set={"isVerify": 0},
                                     c={"isVerify": 2, "fromStore": from_store})
                    page_num = 1
                elif completed == 'exit':
                    break
                await my_async_sleep(20, random_sleep=True)
                await link_id_spider.save_link_id()
                await manager_page_spider.do_it()
                await detail_page_spider.get_page()
                exit_loop = await delay_order_spider.get_page()
                if exit_loop == 'exit':
                    break
            except Exception as e:
                logger.error(str(e))
                break
    finally:
        await browser.close()
示例#2
0
    def get(self, shop_ids):
        """Append a flag summary for every shop to its receiver group's mail body.

        For each shop id the number of ``tb_master`` rows per flag in
        ``self.translate_dictionary`` is counted. Shops with at least one
        non-zero count get one ``|``-joined line (shop name, flag counts,
        total pages, time spent) followed by whatever ``self.insert_link``
        returns, appended to ``mail_content`` of the matching group.

        :param shop_ids: iterable of shop ids.
        :return: the container obtained from ``self.init_receivers()``.
        """
        # Receiver group -> shop ids; anything not listed goes to "YK".
        shop_groups = (
            ("KY", ["115443253", "33817767"]),
            ("TB", ["34933991", "131282813"]),
            ("YJ", ["68559944", "60299985"]),
        )
        mail_container = self.init_receivers()
        ms = MySql(db_setting=TEST_SERVER_DB_TEST)

        for shop_id in shop_ids:
            shop_name = MySql.cls_get_one(
                sql="SELECT shopname FROM shop_info WHERE shop_id={}".format(
                    shop_id))
            flag_report_groups = []
            for flag, label in self.translate_dictionary.items():
                sql = "SELECT COUNT(id) AS nums FROM tb_master WHERE flag LIKE '%%{}%%' AND shop_id='{}'".format(
                    flag, shop_id)
                nums = ms.get_one(db=TEST_SERVER_DB_TEST, sql=sql)
                if int(nums) > 0:
                    flag_report_groups.append("{}{}条".format(label, nums))

            if not flag_report_groups:
                continue

            result = ms.get_dict(t="tb_search_page_info",
                                 c={"shop_id": shop_id})
            flag_report_groups.append("总计爬取{}页".format(
                result[0]['total_page']))
            flag_report_groups.append("总计花费{}分{}秒".format(
                int(result[0]['spent_time'] / 60),
                int(result[0]['spent_time'] % 60)))
            # The shop name leads the line.
            flag_report_groups.insert(0, shop_name)

            group = next(
                (code for code, ids in shop_groups if shop_id in ids), "YK")
            section = mail_container[group]
            section['mail_content'] += "|".join(flag_report_groups) + "\n"
            section['mail_content'] += self.insert_link(shop_id, ms)
        return mail_container
示例#3
0
 async def input_verify_code(self, frame, fromStore, type):
     """Request an SMS code, wait for it to appear in ``phone_verify``, then submit it.

     A fresh ``phone_verify`` row is created for ``fromStore`` and a mail is
     sent to ``MAIL_RECEIVERS``. The "get code" button is clicked and the
     row is polled every 5 seconds, up to 120 times per click, until
     ``verify_code`` is no longer ``"0"``; the row is then deleted and the
     code typed into the form and submitted.

     :param frame: frame object exposing ``click`` and ``type``.
     :param fromStore: store code used as the ``phone_verify`` key.
     :param type: ``0`` selects the first selector set, anything else the second.
     """
     logger.info("需要要手机验证码")
     ms = MySql(db_setting=TEST_SERVER_DB_TEST)
     ms.delete(t='phone_verify', c={'fromStore': fromStore})
     ms.insert(t="phone_verify", d={"fromStore": fromStore})
     mail(fromStore + "手机验证码", fromStore + "登陆需要手机验证码", MAIL_RECEIVERS)

     variant = 0 if type == 0 else 1
     verify_code = "0"
     while verify_code == "0":
         await frame.click(PHONE_GET_CODE[variant])
         for _ in range(120):
             await asyncio.sleep(5)
             verify_code = ms.get_one(t='phone_verify',
                                      cn=['verify_code'],
                                      c={"fromStore": fromStore})
             if verify_code != "0":
                 ms.delete(t='phone_verify', c={'fromStore': fromStore})
                 del ms
                 break
         else:
             # No code arrived during this polling window; pause, then
             # click the "get code" button again.
             await asyncio.sleep(10)

     await frame.type(PHONE_CHECK_INPUT[variant], verify_code,
                      {'delay': self.input_time_random() - 50})
     await frame.click(PHONE_SUBMIT_BTN[variant])
示例#4
0
async def run():
    """Heartbeat loop: refresh this spider's monitor row and restart it when needed.

    Once a minute the loop calls ``update()``, stamps ``latest_time`` in
    ``spider_monitor``, reads ``restart_signal`` and checks the newest
    ``updateTime`` per store in ``tb_order_spider``. If any store's newest
    update is older than ``time_ago(minutes=15)`` (compared as strings),
    or the restart signal is set, the signal is cleared and ``restart()``
    is called.
    """
    while 1:
        update()
        ms = MySql(db_setting=TEST_SERVER_DB_TEST)
        monitor_row = {"spider_address": SPIDER_ADDRESS}
        ms.update(t="spider_monitor",
                  set={"latest_time": time_now()},
                  c=monitor_row)
        restart_signal = ms.get_one(t="spider_monitor",
                                    cn=["restart_signal"],
                                    c={"spider_address": SPIDER_ADDRESS})
        # Which stores are watched depends on the spider's address.
        sql = (
            "SELECT MAX(updateTime) as updateTime,fromStore FROM tb_order_spider WHERE fromStore IN ('KY','TB') GROUP BY fromStore"
            if SPIDER_ADDRESS == "3_floor" else
            "SELECT MAX(updateTime) as updateTime,fromStore FROM tb_order_spider WHERE fromStore IN ('YJ','YK') GROUP BY fromStore"
        )
        latest_updates = MySql.cls_get_dict(sql=sql)
        threshold = time_ago(minutes=15)
        if any(str(row['updateTime']) < threshold for row in latest_updates):
            restart_signal = 1
        if restart_signal:
            ms.update(t="spider_monitor",
                      set={"restart_signal": 0},
                      c={"spider_address": SPIDER_ADDRESS})
            restart()
        del ms
        await asyncio.sleep(60)
示例#5
0
def update():
    """Run ``git pull`` when this spider's ``update_signal`` is set in ``spider_monitor``.

    The decoded, stripped command output is stored in ``update_result``
    and ``update_signal`` is reset to 0 for ``SPIDER_ADDRESS``.
    """
    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    monitor_rows = ms.get_dict(t="spider_monitor",
                               cn=["spider_address", "update_signal"])
    for row in monitor_rows:
        if not row['update_signal']:
            continue
        if SPIDER_ADDRESS != row['spider_address']:
            continue
        output = check_output(['git', 'pull'])
        outcome = {
            "update_signal": 0,
            "update_result": output.decode('utf-8').strip()
        }
        ms.update(t='spider_monitor',
                  set=outcome,
                  c={"spider_address": SPIDER_ADDRESS})
示例#6
0
 async def intercept_request(req):
     """Let item pages through, record delisted items, and abort everything else.

     * URLs matching the item page pattern are continued.
     * URLs matching the "noitem" pattern mark the item in ``tb_master``
       (``flag="XiaJia"``, ``isUsed=1``) when an ``itemid=<digits>``
       parameter is present, then the request is aborted.
     * Any other request is aborted.

     A "noitem" URL without ``itemid`` is now simply aborted instead of
     raising ``IndexError``, and the abort happens in a ``finally`` so the
     request is never left unanswered.

     :param req: request object exposing ``url``, ``continue_()`` and ``abort()``.
     """
     if re.search(r'https://item.taobao.com/item.htm', req.url):
         await req.continue_()
         return
     try:
         if re.search(r'item.taobao.com.*?noitem.htm.*?', req.url):
             id_match = re.search(r"itemid=(\d+)", req.url)
             if id_match:
                 MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                                  t="tb_master",
                                  set={
                                      "flag": "XiaJia",
                                      "isUsed": 1
                                  },
                                  c={"link_id": id_match.group(1)})
     finally:
         await req.abort()
示例#7
0
 def _get_item():
     """Block until an uncrawled ``tb_master`` row is available and return it.

     Rows are selected with ``isUsed=0``, ``isMut=1`` and a flag other than
     ``"XiaJia"``. Both prices on the returned row are converted to
     ``float`` and a ``typeabbrev`` key is added from ``store_trans``.
     While no row matches, a message is logged and ``my_sleep()`` is
     called before the next attempt.
     """
     wanted_columns = [
         "shop_id",
         "link_id",
         "description",
         "price_tb",
         "promotionprice",
         "sales",
         "rates",
     ]
     selection = {
         "isUsed": 0,
         "isMut": 1,
         "flag!": "XiaJia"
     }
     while 1:
         rows = MySql.cls_get_dict(db_setting=TEST_SERVER_DB_TEST,
                                   t="tb_master",
                                   c=selection,
                                   cn=wanted_columns,
                                   l=["0", "1"])
         if not rows:
             logger.info('没有数据需要爬取!')
             my_sleep()
             continue
         item = rows[0]
         item['price_tb'] = float(item['price_tb'])
         item['promotionprice'] = float(item['promotionprice'])
         item['typeabbrev'] = store_trans(item['shop_id'], 'id_2_code')
         return item
    async def parse(self, main_orders, page_num):
        """Persist one page of orders and decide whether this crawl round is finished.

        Sets ``self.completed`` to ``2`` once an order older than the
        look-back window is met, or to ``1`` when the whole page was
        processed. Returns early, leaving ``self.completed`` untouched,
        when ``parse_order_item`` yields no sub-orders.

        :param main_orders: sequence of order records for the current page.
        :param page_num: current page number, used only in the log message.
        """
        ms = MySql()
        boundaries = time_zone(["08:00", "23:00", "23:59"])
        now = datetime.datetime.now()
        # Look-back window in days, chosen by the time of day.
        if now < boundaries[0]:
            lookback_days = EARLIEST_ORDER_CREATE_TIME
        elif boundaries[0] < now < boundaries[1]:
            lookback_days = 2
        else:
            lookback_days = 60

        for idx in range(len(main_orders)):
            # Some items are refunded before payment; the detail parser
            # uses this flag to drop them instead of saving.
            continue_code = 0
            # Parse the order and store it.
            sub_orders, tb_order_item = await self.parse_order_item(
                idx, main_orders, ms)
            if not sub_orders:
                return
            # Parse the order's line items and store them.
            await self.parse_order_detail_item(continue_code, idx,
                                               main_orders, sub_orders,
                                               tb_order_item, ms)

            cutoff_day = datetime.date.today() - datetime.timedelta(
                lookback_days)
            date_limit = cutoff_day.strftime("%Y-%m-%d %H:%M:%S")
            if tb_order_item.createTime < date_limit:
                logger.info("完成本轮爬取,共翻 " + str(page_num) + " 页。")
                self.completed = 2
                del ms
                return
        self.completed = 1
        del ms
示例#9
0
def verify():
    """Cross-check order totals against their line items and mail any mismatches.

    For every ``tb_order_spider`` row with ``isVerify='0'`` and
    ``isDetaildown='1'`` the line items are summed as
    ``unitPrice * sellNum - unitBenefits``. When
    ``total + deliverFee - actualFee - couponPrice`` deviates from zero by
    more than 0.0001 and the order status is not '交易关闭', the order is
    flagged (``isVerify=2``, ``isDetaildown=0``) and reported; otherwise
    it is marked verified (``isVerify=1``).
    """
    ms = MySql()
    report_lines = []
    orders = ms.get(t="tb_order_spider",
                    cn=[
                        'orderNo', 'deliverFee', 'actualFee', 'couponPrice',
                        'fromStore', 'orderStatus'
                    ],
                    c={'isVerify': '0', 'isDetaildown': '1'})
    for order in orders or ():
        order_no = order[0]
        deliver_fee = order[1]
        actual_fee = order[2]
        coupon_price = order[3]
        from_store = order[4]

        details = ms.get(t="tb_order_detail_spider",
                         cn=['unitPrice', 'sellNum', 'unitBenefits'],
                         c={'orderNo': order_no})
        total = 0
        for detail in details:
            total = total + detail[0] * detail[1] - detail[2]

        diff = round(total, 3) + deliver_fee - actual_fee - coupon_price
        mismatch = abs(diff) > 0.0001 and order[5] != '交易关闭'
        if not mismatch:
            ms.update(t="tb_order_spider",
                      set={'isVerify': 1},
                      c={'orderNo': order_no})
            continue

        report_lines.append("|".join([
            str(round(total, 2)),
            str(deliver_fee),
            str(actual_fee),
            str(coupon_price),
            str(diff),
            store_trans(from_store),
            order_no,
        ]))
        ms.update(t="tb_order_spider",
                  set={
                      'isVerify': 2,
                      'isDetaildown': 0
                  },
                  c={'orderNo': order_no})

    if report_lines:
        mail("数据异常报告", "\n".join(report_lines), ["*****@*****.**"])
示例#10
0
 def _get_curls(shop_id):
     """Return a random ``tb_search_curl`` row for ``shop_id``, or ``0`` if there is none."""
     candidates = list(
         MySql.cls_get_dict(db_setting=test_db,
                            t="tb_search_curl",
                            c={'shop_id': shop_id}))
     if not candidates:
         return 0
     return random.choice(candidates)
 async def run(cls, login, browser, page, from_store):
     """Crawl the order list pages of ``from_store`` in an endless loop.

     ``get_page`` returning ``1`` advances to the next page. Returning
     ``2`` resets ``isDetaildown`` and ``isVerify`` from 2 back to 0 for
     this store in ``tb_order_spider`` and restarts from page 1. Each
     round ends with ``my_async_sleep(15, random_sleep=True)``. The loop
     has no exit condition.
     """
     list_spider = OrderListPageSpider(login, browser, page, from_store)
     page_num = 1
     while 1:
         completed = await list_spider.get_page(page_num)
         if completed == 2:
             for column in ("isDetaildown", "isVerify"):
                 MySql.cls_update(t="tb_order_spider",
                                  set={column: 0},
                                  c={
                                      column: 2,
                                      "fromStore": from_store
                                  })
             page_num = 1
         elif completed == 1:
             page_num += 1
         await my_async_sleep(15, random_sleep=True)
示例#12
0
    def _get_page_num(shop_id):
        """Pick a random search page of ``shop_id`` that has not been crawled yet.

        A default ``tb_search_page_info`` row (20 pages, page "0" used) is
        created when the shop has none. If the row's ``last_date`` is
        before today, ``used_page_nums`` and ``spent_time`` are reset.

        :return: ``(page, used_page_nums, total_page, spent_time)``, or
            ``(0, 0, 0, 0)`` when every page has already been crawled.
        """
        ms = MySql(db_setting=test_db)

        def load_info():
            return ms.get_dict(t="tb_search_page_info",
                               c={"shop_id": shop_id})

        info = load_info()
        if not info:
            # First time this shop is seen: store defaults, then read back.
            ms.insert(t="tb_search_page_info",
                      d={
                          "shop_id": shop_id,
                          "total_page": 20,
                          "used_page_nums": "0",
                          "last_date": datetime.date.today(),
                          "spent_time": 0
                      })
            info = load_info()

        if info[0]['last_date'] < datetime.date.today():
            # Stale progress from an earlier day: start over.
            ms.update(t="tb_search_page_info",
                      set={
                          "used_page_nums": "0",
                          "spent_time": 0
                      },
                      c={"shop_id": shop_id})
            info = load_info()

        row = info[0]
        used_page_nums = [int(x) for x in row['used_page_nums'].split(",")]
        total_page = row['total_page']
        # Pages 0..total_page minus the ones already crawled.
        remaining = list(set(range(total_page + 1)) - set(used_page_nums))
        if not remaining:
            # Nothing left: every page of this shop has been crawled.
            return 0, 0, 0, 0
        return (random.choice(remaining), used_page_nums, total_page,
                row['spent_time'])
 def _get_order_info(self):
     """Find the least recently updated paid order of this store that is due for a re-check.

     Candidates are ``tb_order_spider`` rows of ``self.fromStore`` with
     status '买家已付款', ``updateTime`` more than 60 minutes ago and
     ``payTime`` before ``yesterday("18:00:00")``, ordered by
     ``updateTime``.

     :return: ``(days, order_no)`` where ``days`` is the whole number of
         days since the order's ``createTime``; ``(0, None)`` when no row
         matches.
     """
     now = datetime.datetime.now()
     stale_before = (now - datetime.timedelta(minutes=60)).strftime(
         "%Y-%m-%d %H:%M:%S")
     paid_before = yesterday("18:00:00")
     sql = """      
                            SELECT 
                            tos.orderNo,createTime
                            FROM tb_order_spider tos
                            WHERE  tos.updateTime<'{}'
                            AND tos.`orderStatus` = '买家已付款' 
                            AND tos.`fromStore` = '{}' 
                            AND tos.payTime<'{}'
                            ORDER BY updateTime;
                            """.format(stale_before, self.fromStore, paid_before)
     rows = MySql.cls_get_dict(sql=sql)
     if not rows:
         return 0, None
     first = rows[0]
     order_no = first['orderNo']
     return (now - first['createTime']).days, order_no
示例#14
0
    async def save_link_id(self):
        """Fill in real item ids for order details still carrying the placeholder ``link_id="1"``.

        For each affected order of ``self.fromStore`` the order JSON is
        fetched via ``self._get_json``. For every sub-order the matching
        ``tb_order_detail_spider`` row is looked up by its URL, its
        ``link_id`` is updated, and a ``PriceTBItem`` built from that row
        is saved.

        :return: ``0`` when ``_get_json`` yields nothing for an order,
            otherwise ``None``.
        """
        ms = MySql()
        self.completed = 0
        pending_sql = """SELECT url,a.orderNo FROM tb_order_detail_spider a
            JOIN tb_order_spider b ON a.`orderNo`=b.`orderNo`
            WHERE link_id="1" AND b.`fromStore`='{}' AND a.url IS NOT NULL
            GROUP BY a.orderNo
            ORDER BY b.createTime DESC""".format(self.fromStore)
        pending_orders = ms.get_dict(sql=pending_sql)
        if not pending_orders:
            return

        for pending in pending_orders:
            logger.info("link_id_spider-" + pending['orderNo'])
            data = await self._get_json(pending['orderNo'])
            if not data:
                return 0
            for sub_order in data["data"]["subOrderViewDTOs"]:
                price_tb_item = PriceTBItem()
                price_tb_item.link_id = sub_order["itemId"]

                order_no = sub_order["orderNoStr"]
                lookup_sql = "select * from tb_order_detail_spider where url like '%%{}%%'".format(
                    order_no)
                detail_row = ms.get_dict(sql=lookup_sql)[0]

                price_tb_item.stockid = detail_row['goodsCode']
                price_tb_item.description = detail_row['tbName']
                price_tb_item.price_tb = detail_row['unitPrice']
                price_tb_item.shop_id = store_trans(string=self.fromStore,
                                                    action="code_2_id")
                price_tb_item.attribute = detail_row['goodsAttribute']
                price_tb_item.typeabbrev = self.fromStore
                update_sql = "update tb_order_detail_spider set link_id='{}' where url like '%{}%'".format(
                    price_tb_item.link_id, order_no)
                ms.update(sql=update_sql)
                price_tb_item.save(ms)
                await my_async_sleep(3, True)
    async def do_it(self):
        """Open the quick-edit dialog for the next item flagged ``need_to_update``.

        Takes one ``link_id`` with ``need_to_update=1`` for this store from
        ``prices_tb``, searches for it on the seller's item manager page
        and clicks the quick-edit button. If the button does not appear
        within 10 seconds the item is marked as delisted (``flag="XiaJia"``,
        ``need_to_update=0``) and the next pending item is tried. After a
        successful click the method waits until ``self.completed`` becomes
        4, then sleeps a further 15 seconds.

        Fixes relative to the previous version:
            * ``timeout`` was misspelled ``timout``, so the intended
              10-second limit was never applied.
            * the log message concatenated ``link_id`` without ``str()``,
              which raises ``TypeError`` for a non-string id.

        :return: ``0`` when there is nothing to update; ``None`` otherwise,
            including when navigation to the manager page fails.
        """
        shop_id = store_trans(self.fromStore, 'code_2_id')
        ms = MySql()
        sql = "select link_id from prices_tb where need_to_update=1 and shop_id='{}' limit 1".format(
            shop_id)
        link_id = ms.get_one(sql=sql)
        if not link_id:
            return 0
        await self.page.bringToFront()

        try:
            if not re.search(
                    "https://item.manager.taobao.com/taobao/manager/render.htm",
                    self.page.url):
                await self.page.goto(
                    "https://item.manager.taobao.com/taobao/manager/render.htm?tab=on_sale"
                )
        except Exception as e:
            logger.error(str(e) + "manager_page_error")
            return
        while 1:
            await self.page.waitForSelector("input[name='queryItemId']",
                                            timeout=0)
            await self.page.keyboard.press('Escape')
            await self.page.focus("input[name='queryItemId']")
            # Clear whatever is left in the search box from a previous query.
            for _ in range(20):
                await self.page.keyboard.press("Delete")
                await self.page.keyboard.press("Backspace")
            await self.page.type("input[name='queryItemId']", str(link_id),
                                 {'delay': self.login.input_time_random()})
            await self.page.click(".filter-footer button:first-child")
            await self.page.waitForResponse(
                "https://item.manager.taobao.com/taobao/manager/table.htm")
            await asyncio.sleep(1)
            await self.listening(self.page)
            try:
                await self.page.waitForSelector(FAST_EDIT_BTN, timeout=10000)
                await self.page.click(FAST_EDIT_BTN)
                restart = await self.login.slider(self.page)
                if restart:
                    exit("滑块验证码失败,退出")
            except errors.TimeoutError:
                # No quick-edit button: the item is treated as delisted.
                logger.info("商品已下架,没有查询到对应的商品ID:" + str(link_id))
                ms.update(t="prices_tb",
                          set={
                              "SpiderDate": time_now(),
                              "need_to_update": 0,
                              "flag": "XiaJia"
                          },
                          c={"link_id": link_id})
                link_id = ms.get_one(sql=sql)
                if not link_id:
                    return 0
                continue
            else:
                await self.page.focus("input[name='queryItemId']")
                for _ in range(20):
                    await self.page.keyboard.press("Delete")
                    await self.page.keyboard.press("Backspace")
                break
        # Wait until self.completed reaches 4 (parse_item_page sets it after
        # saving the rate data).
        while 1:
            if self.completed == 4:
                break
            await asyncio.sleep(1)
        await asyncio.sleep(15)
    async def parse_item_page(self, content=None, detail=None, rate=None):
        """Fill ``self.price_tb_items`` from the three responses of an item page.

        The stages are chained through ``self.completed``: ``content`` sets
        it to 2, ``detail`` waits for 2 and sets 3, ``rate`` waits for 3,
        saves the items and sets 4.

        The regular expressions are now raw strings; the patterns are
        unchanged, but the old literals contained invalid escape sequences
        (``\\{``, ``\\(``, ``\\d``). The rate count search, which does not
        depend on the item, is done once instead of per item.

        :param content: item page HTML (SKU map, attribute names, price).
        :param detail: JSONP detail response (sales count, promotion prices).
        :param rate: rate response text holding the review count.
        """

        if content:
            sku_map = re.search(r'skuMap.*?(\{.*)', content)
            shop_id = store_trans(self.fromStore, 'code_2_id')
            doc = PyQuery(content)
            items = doc("li[data-value]").items()
            logger.debug(items)
            # data-value -> displayed attribute text
            attr_map = {}
            if items:
                for item in items:
                    # NOTE(review): both replace() calls swap a character
                    # for itself as written here; possibly full-width
                    # parentheses were lost somewhere -- confirm.
                    attr_map[item.attr('data-value')] = item.find(
                        'span').text().replace("(", "(").replace(")", ")")
            if sku_map:
                sku_dict = json.loads(sku_map.group(1))
                for k, v in sku_dict.items():
                    for price_tb_item in self.price_tb_items:
                        if price_tb_item.skuId == v.get('skuId'):
                            price_tb_item.price_tb = v.get('price')
                            price_tb_item.shop_id = shop_id
                            price_tb_item.attribute_map = k
                            # The key is ";id;id;"; strip the outer
                            # semicolons and map each id to its text.
                            price_tb_item.attribute = "-".join([
                                attr_map.get(r)
                                for r in re.sub('^;|;$', "", k).split(";")
                            ])
            else:
                # No SKU map: only the first item is filled in.
                self.price_tb_items[0].shop_id = shop_id
                self.price_tb_items[0].price_tb = doc(
                    'input[name="current_price"]').val()
            self.completed = 2
        if detail:
            # Wait for the content stage to finish.
            while 1:
                if self.completed == 2:
                    break
                await asyncio.sleep(1)
            logger.debug(detail)
            detail = re.sub(r'span class=\"wl-yen\"',
                            r'span class=\\"wl-yen\\"', detail)
            data = re.search(r'uccess\((.*?)\);', detail)
            if data:
                x = json.loads(data.group(1))
            else:
                await self.login.slider(self.item_page)
                return
            promo_data = jsonpath(x, '$..promoData')
            for price_tb_item in self.price_tb_items:
                price_tb_item.sales = jsonpath(x, '$..soldTotalCount')[0]
                price_tb_item.typeabbrev = self.fromStore
                if promo_data and promo_data[0]:
                    if price_tb_item.attribute_map:
                        for k, v in promo_data[0].items():
                            if k == price_tb_item.attribute_map:
                                price_tb_item.promotionprice = jsonpath(
                                    v, '$..price')[0]
                    else:
                        price_tb_item.promotionprice = jsonpath(
                            x, '$..promoData..price')[0]
            self.completed = 3
        if rate:
            # Wait for the detail stage to finish.
            while 1:
                if self.completed == 3:
                    break
                await asyncio.sleep(1)
            logger.debug(rate)
            ms = MySql()
            count = re.search(r'count.*?(\d+)', rate)
            for price_tb_item in self.price_tb_items:
                if count:
                    price_tb_item.rates = count.group(1)
                price_tb_item.need_to_update = 0
                price_tb_item.save(ms)
            self.price_tb_items[0].delete(ms)
            del ms
            self.completed = 4
    async def parse_order_detail_item(continue_code, i, main_orders,
                                      sub_orders, tb_order_item, ms):
        """Build and save one ``TBOrderDetailItem`` per sub-order of ``main_orders[i]``.

        :param continue_code: truthy means the current sub-order is not saved.
        :param i: index of the order inside ``main_orders``.
        :param main_orders: sequence of order dicts; ``main_orders[i]["id"]``
            becomes the detail item's ``orderNo``.
        :param sub_orders: sequence of sub-order dicts for that order.
        :param tb_order_item: parent order item; its ``orderStatus`` is copied.
        :param ms: database handle passed to ``save``.

        NOTE(review): there is no ``self`` parameter although the sibling
        ``parse`` calls this through ``self``; presumably it is decorated as
        a static method outside this view -- confirm.
        """
        for j in range(len(sub_orders)):
            tb_order_detail_item = TBOrderDetailItem()
            tb_order_detail_item.orderNo = main_orders[i]["id"]
            tb_order_detail_item.itemNo = j
            try:
                tb_order_detail_item.goodsCode = sub_orders[j]['itemInfo'][
                    'extra'][0]['value']
            except KeyError:
                # No 'extra' entry: store a marker instead of the goods code.
                # NOTE(review): an empty 'extra' list would raise IndexError,
                # which is not caught here.
                tb_order_detail_item.goodsCode = 'error'
            tb_order_detail_item.tbName = format_tb_name(
                sub_orders[j]['itemInfo']['title'])
            tb_order_detail_item.unitPrice = sub_orders[j]['priceInfo'][
                'realTotal']
            tb_order_detail_item.sellNum = sub_orders[j]['quantity']
            tb_order_detail_item.orderStatus = tb_order_item.orderStatus
            tb_order_detail_item.url = "https:" + sub_orders[j]['itemInfo'][
                'itemUrl']
            try:
                attribute_list = sub_orders[j]['itemInfo']['skuText']
            except KeyError:
                # The SKU text is optional.
                pass
            else:
                tb_order_detail_item.goodsAttribute = format_attribute(
                    attribute_list)

            try:
                operations = sub_orders[j]['operations']
            except KeyError:
                # The operations list is optional.
                pass
            else:
                for x in range(len(operations)):
                    t = operations[x]['style']
                    # Styles t12/t16 carry a refund status, except the
                    # "退运保险" entry.
                    if t in ['t12', 't16'] and operations[x]['text'] != "退运保险":
                        tb_order_detail_item.refundStatus = operations[x][
                            'text']
                        tb_order_detail_item.isRefund = "1"
                    elif t == 't0' and operations[x]['text'] == '已取消':
                        # Cancelled item: delete any stored row and shift the
                        # itemNo of the rows after it down by one.
                        # NOTE(review): continue_code is never reset, so every
                        # later sub-order of this order is skipped as well --
                        # confirm this is intended.
                        continue_code = 1
                        delete_item = {
                            'orderNo': tb_order_detail_item.orderNo,
                            'itemNo': tb_order_detail_item.itemNo,
                            'goodsCode': tb_order_detail_item.goodsCode
                        }
                        # NOTE(review): this rebinds the ``ms`` parameter to a
                        # new MySql() for the rest of the call.
                        ms = MySql()
                        is_exist = ms.get(t="tb_order_detail_spider",
                                          l=1,
                                          c=delete_item)
                        if is_exist:
                            ms.delete(t="tb_order_detail_spider",
                                      c=delete_item)
                        sql = "UPDATE tb_order_detail_spider SET itemNo=itemNo-1 " \
                              "WHERE orderNo='{}' " \
                              "AND itemNo>'{}'".format(tb_order_detail_item.orderNo,
                                                       tb_order_detail_item.itemNo)
                        ms.update(sql=sql)
                        pass
            if continue_code:
                continue
            tb_order_detail_item.save(ms)
示例#18
0
    def _get_html(self):
        """Yield the search-page HTML of every uncrawled page of every shop.

        Yields ``(html, shop_id, used_page_nums, total_page, page_num)``.
        After the consumer has handled a page, the progress record read
        with ``read(flag="tspi")`` is written to ``tb_search_page_info``
        together with the accumulated time spent. When a shop has no pages
        left, ``tb_master`` rows not updated today are flagged "XiaJia".
        After all shops, a report is sent if ``SEARCH_PAGE_REPORT`` is set.

        Fixes relative to the previous version:
            * when no proxy was stored, ``_set_proxy()`` was called but the
              stale empty value was still used, giving ``https://None``;
              the proxy is now read again.
            * the JSONP regex is a raw string; the pattern is unchanged.
        """
        for shop_id in self._get_shop_id():
            page_num, used_page_nums, total_page, sp_time = self._get_page_num(
                shop_id)
            session = requests.Session()
            while page_num:
                time.sleep(2)
                curl = self._get_curls(shop_id)
                if not curl:
                    # No curl stored for this shop yet; wait and retry.
                    time.sleep(30)
                    continue
                start_time = time.time()
                delete(flag='tspi')
                url, params, cookies, headers = self.format_request_params(
                    curl['curl'], page_num)
                # Retry with a new proxy and session until a request succeeds.
                while 1:
                    try:
                        proxy = read("proxy")
                        logger.info(proxy)
                        if not proxy:
                            self._set_proxy()
                            # Presumably _set_proxy() stores the value that
                            # read("proxy") returns -- confirm.
                            proxy = read("proxy")
                        proxies = {"https": "https://{}".format(proxy)}
                        r = session.get(url=url,
                                        params=params,
                                        cookies=cookies,
                                        headers=headers,
                                        proxies=proxies,
                                        stream=True,
                                        timeout=30)
                    except Exception as e:
                        logger.error(str(e))
                        self._set_proxy()
                        session = requests.Session()
                        continue
                    else:
                        break
                try:
                    # stream=True defers the body download to this access.
                    html = r.text.replace("\\", "")
                except requests.exceptions.ChunkedEncodingError:
                    continue
                except requests.exceptions.ConnectionError:
                    continue
                # Strip the JSONP wrapper: jsonpNNN(" ... ")
                html = re.sub(r'jsonp\d+\("|"\)', "", html)
                yield html, shop_id, used_page_nums, total_page, page_num
                spent_time = int(time.time() - start_time) + sp_time
                tspi = read(flag="tspi")
                if tspi:
                    tspi['spent_time'] = spent_time
                    MySql.cls_update(db_setting=test_db,
                                     t="tb_search_page_info",
                                     set=tspi,
                                     c={"shop_id": shop_id})
                page_num, used_page_nums, total_page, sp_time = self._get_page_num(
                    shop_id)
            sql = "UPDATE tb_master SET flag='XiaJia',update_date='{}' WHERE shop_id='{}' AND update_date<'{}'".format(
                datetime.date.today(), shop_id, datetime.date.today())
            MySql.cls_update(db_setting=test_db, sql=sql)

        if SEARCH_PAGE_REPORT:
            reports = Reports()
            reports.report(list(self._get_shop_id()))
            ms = MySql(db_setting=test_db)
            t = TBMasterItem()
            t.save_to_record(ms)
示例#19
0
    def parse(self):
        """Parse every page yielded by ``_get_html`` and save its items.

        A page without an ``item<digit>line1`` marker is treated as a
        failed fetch: the shop's stored curls are deleted, an error mail is
        sent and the page is skipped. Otherwise the page is recorded as
        used via ``write(flag="tspi", ...)``, the total page count is
        updated when the pagination shows a different number, and one
        ``TBMasterItem`` per listed item is saved.

        The regular expressions are now raw strings; the patterns are
        unchanged, but the old literals contained invalid escape sequences
        (``\\d``, ``\\/``).
        """
        for html, shop_id, used_page_nums, total_page, page_num in self._get_html(
        ):
            doc = PyQuery(html)
            match = re.search(r"item\dline1", html)
            if not match:
                MySql.cls_delete(db_setting=test_db,
                                 t='tb_search_curl',
                                 c={"shop_id": shop_id})
                mail("店铺搜索页爬虫出错",
                     shop_id + "错误页码:" + str(page_num) + "\n" + html,
                     MAIL_RECEIVERS)
                continue

            used_page_nums.append(page_num)
            used_page_nums.sort()
            tspi = {  # tb_search_page_info
                "used_page_nums": ",".join([str(x) for x in used_page_nums]),
                "last_date": datetime.date.today()
            }
            write(flag="tspi", value=tspi)

            num = doc(".pagination span.page-info").text()
            try:
                # Pagination text looks like "<current>/<total>".
                total_page_num = re.search(r"\d+\/(\d+)", num).group(1)
            except Exception as e:
                logger.error(str(e))
            else:
                if int(total_page_num) != int(total_page):
                    tspi['total_page'] = total_page_num
                    write(flag="tspi", value=tspi)

            items = doc("." + match.group() + " dl.item").items()
            ms = MySql(db_setting=test_db)
            ms_prod = MySql()
            for i in items:
                tb_master_item = TBMasterItem()
                tb_master_item.shop_id = shop_id
                tb_master_item.link_id = i.attr('data-id')
                tb_master_item.description = i.find("dd.detail a").text()
                cprice = float(i.find("div.cprice-area span.c-price").text())
                if i.find("div.sprice-area span.s-price").text():
                    sprice = float(
                        i.find("div.sprice-area span.s-price").text())
                else:
                    sprice = 0
                if i.find("div.sale-area span.sale-num").text():
                    tb_master_item.sales = int(
                        i.find("div.sale-area span.sale-num").text())
                if i.find("dd.rates a span").text():
                    tb_master_item.rates = int(
                        i.find("dd.rates a span").text())
                # When an s-price is shown it is stored as the list price
                # and the c-price as the promotion price; otherwise the
                # c-price is the list price and the promotion price is 0.
                if sprice:
                    tb_master_item.price_tb = sprice
                    tb_master_item.promotionprice = cprice
                else:
                    tb_master_item.price_tb = cprice
                    tb_master_item.promotionprice = sprice

                print(tb_master_item)
                tb_master_item.save(ms, ms_prod)
            del ms, ms_prod
示例#20
0
 async def parse(self, html):
     """Parse a product detail page and upsert its rows in ``prices_tb``.

     ``self._item`` is stamped with the current time and flagged ``XiaJia``
     when the page contains the delisted notice.

     * Without a ``skuMap`` JSON blob in ``html`` the product is marked
       ``isUsed=1, isMut=0`` in ``tb_master`` and a single ``prices_tb`` row
       keyed by ``link_id`` is updated or inserted.
     * With a ``skuMap`` the product is marked ``isUsed=1, isMut=1`` and one
       ``prices_tb`` row per SKU, keyed by ``skuId``, is updated or inserted.

     Newly inserted rows get a ``no_match`` stock id, a ``SpiderDate`` of
     ``time_ago(minutes=60)`` and ``need_to_update = 1``. Finally
     ``self._goto_the_next()`` is awaited.

     :param html: HTML source of the product page.
     """
     ms = MySql()
     self._item['SpiderDate'] = time_now()
     # Raw string: "\{" inside a normal literal is an invalid escape.
     sku_map = re.search(r'skuMap.*?(\{.*)', html)
     if re.search("此宝贝已下架", html):
         self._item['flag'] = "XiaJia"
     if not sku_map:
         MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                          t="tb_master",
                          set={
                              "isUsed": 1,
                              "isMut": 0
                          },
                          c={"link_id": self._item['link_id']})
         res = ms.get_dict(t="prices_tb",
                           c={"link_id": self._item['link_id']})
         if res:
             ms.update(t="prices_tb",
                       set=self._item,
                       c={"link_id": self._item['link_id']})
         else:
             self._item['stockid'] = "no_match"
             self._item['SpiderDate'] = time_ago(minutes=60)
             self._item['need_to_update'] = 1
             ms.insert(t="prices_tb", d=self._item)
         logger.info(self._item)
     else:
         MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                          t="tb_master",
                          set={
                              "isUsed": 1,
                              "isMut": 1
                          },
                          c={"link_id": self._item['link_id']})
         doc = PyQuery(html)
         items = doc("li[data-value]").items()
         logger.debug(items)
         # Map each option's data-value id to its display text.
         # NOTE(review): both replace() calls are no-ops as written;
         # presumably they once normalized full-width parentheses - confirm.
         attr_map = {}
         for item in items:
             attr_map[item.attr('data-value')] = item.find(
                 'span').text().replace("(", "(").replace(")", ")")
         sku_dict = json.loads(sku_map.group(1))
         count = 1  # suffix for the stock id of newly inserted SKUs
         for k, v in sku_dict.items():
             sku_result = self._item.copy()
             if self._item['promotionprice'] > 0:
                 # Apply the item-level discount to this SKU's own price.
                 discount = round(
                     float(self._item['price_tb']) -
                     float(self._item['promotionprice']), 4)
                 sku_result['promotionprice'] = round(
                     float(v.get('price')) - float(discount), 4)
             else:
                 sku_result['promotionprice'] = 0
             sku_result['skuId'] = v.get('skuId')
             sku_result['price_tb'] = v.get('price')
             # The key is a ";"-separated list of option ids, possibly with
             # leading/trailing ";".
             # NOTE(review): an id missing from attr_map yields None and
             # makes join() raise TypeError.
             sku_result['attribute'] = "-".join([
                 attr_map.get(r) for r in re.sub('^;|;$', "", k).split(";")
             ])
             res = ms.get_dict(t="prices_tb",
                               c={"skuId": sku_result['skuId']})
             if res:
                 ms.update(t="prices_tb",
                           set=sku_result,
                           c={"skuId": sku_result['skuId']})
             else:
                 sku_result['stockid'] = "no_match" + str(count)
                 sku_result['SpiderDate'] = time_ago(minutes=60)
                 sku_result['need_to_update'] = 1
                 ms.insert(t="prices_tb", d=sku_result)
                 count += 1
             logger.info(sku_result)
     del ms
     await self._goto_the_next()
    async def get_page(self):
        """Crawl the detail page of every order still awaiting detail download.

        Orders are read from ``tb_order_spider`` where ``isDetaildown`` is 0
        for ``self.fromStore``. For each one the detail URL is opened, the JSON
        embedded in ``var data = JSON.parse('...')`` is decoded, the order
        fields are filled in and saved, and one ``TBOrderDetailItem`` is saved
        per sub order. ``verify()`` is called once all orders are processed.

        :return: ``1`` when navigation fails or the session is redirected to
            the login page; otherwise ``None``.
        """
        await self.page.bringToFront()

        results = MySql.cls_get_dict(t="tb_order_spider",
                                     cn=["detailURL", "orderNo"],
                                     c={
                                         "isDetaildown": 0,
                                         "fromStore": self.fromStore,
                                     },
                                     o=["createTime"],
                                     om="d")
        for result in results:
            ms = MySql()
            tb_order_item = TBOrderItem(**result)
            logger.info(store_trans(self.fromStore))
            logger.info("开始订单 " + result["orderNo"] + " 详情爬取")

            # A failed navigation aborts the whole run.
            try:
                await self.page.goto(tb_order_item.detailURL)
            except (errors.PageError, errors.TimeoutError):
                return 1

            try:
                await self.page.waitForSelector('#detail-panel', timeout=30000)
            except errors.TimeoutError:
                # Try the slider; if we ended up on the login page the
                # session has expired, otherwise skip this order.
                await self.login.slider(self.page)
                is_logout = re.search(r"login.taobao.com", self.page.url)
                if is_logout:
                    logger.info("登陆状态超时")
                    return 1
                continue

            content = await self.page.content()
            raw = re.search(r"var data = JSON.parse\('(.*)'\);",
                            content).group(1)
            # Drop triple-escaped quotes, then unescape the remaining quotes so
            # the payload is valid JSON.
            unescaped = raw.replace('\\\\\\"', '').replace('\\"', '"')
            m = json.loads(unescaped)

            tb_order_item.actualFee = jsonpath(m, '$..actualFee.value')[0]
            # Raw string: "\(" and "\d" in a normal literal are invalid escapes.
            tb_order_item.deliverFee = re.findall(r'\(快递:(\d+\.\d+)', str(m))[0]
            tb_order_item.orderStatus = status_format(
                jsonpath(m, '$..statusInfo.text')[0])
            if tb_order_item.orderStatus == '等待买家付款':
                tb_order_item.isDetaildown = 2
            else:
                tb_order_item.isDetaildown = 1
            tb_order_item.couponPrice = await self.get_coupon(m)

            if jsonpath(m, '$..buyMessage'):
                tb_order_item.buyerComments = jsonpath(m, '$..buyMessage')[0]
            orderNo = m['mainOrder']['id']
            order_info = m['mainOrder']['orderInfo']['lines'][1]['content']
            for line in order_info:
                entry = line['value']
                label = entry['name']
                if label == '支付宝交易号:':
                    try:
                        tb_order_item.tradeNo = entry['value']
                    except KeyError:
                        tb_order_item.tradeNo = None
                elif label == '创建时间:':
                    tb_order_item.createTime = entry['value']
                elif label == '付款时间:':
                    tb_order_item.payTime = entry['value']

            if jsonpath(m, '$..logisticsName'):
                tb_order_item.shippingCompany = jsonpath(
                    m, '$..logisticsName')[0]
                tb_order_item.shippingMethod = jsonpath(m, '$..shipType')[0]
                tb_order_item.shippingNo = jsonpath(m, '$..logisticsNum')[0]

            # The address string is comma separated: name, phone, address parts.
            rec_info = jsonpath(m, '$..tabs..address')[0]
            rec_parts = rec_info.split(",")
            tb_order_item.receiverName = rec_parts[0].replace(" ", "")
            tb_order_item.receiverPhone = rec_parts[1]
            tb_order_item.receiverAddress = "".join(rec_parts[2:])
            tb_order_item.save(ms)

            sub_orders = m['mainOrder']['subOrders']
            for item_no, sub_order in enumerate(sub_orders):
                tb_order_detail_item = TBOrderDetailItem(orderNo=orderNo,
                                                         itemNo=item_no)
                tb_order_detail_item.unitBenefits = 0
                if sub_order['promotionInfo']:
                    for promo in sub_order['promotionInfo']:
                        for part in promo['content']:
                            for key, text in part.items():
                                if key != 'value':
                                    continue
                                # Add the last decimal number found in the
                                # text, unless it starts with "Exercise".
                                f_prom = re.match("Exercise", text)
                                p_list = re.findall(r"-?\d+\.\d+", text)
                                if p_list and not f_prom:
                                    tb_order_detail_item.unitBenefits += float(
                                        p_list.pop())
                tb_order_detail_item.save(ms)
            del ms
            await my_async_sleep(seconds=15, random_sleep=True)
        verify()
示例#22
0
            ms.insert(t=self._table_name(), d=data)

    def save_to_record(self, ms):
        """Copy every row of this item's table into ``tb_master_record``.

        Rows are read through ``ms.get`` in pages of 1000 and written back
        with one multi-row ``insert`` statement per page through
        ``ms.insert``. The column list is taken from the instance attribute
        names (``self.__dict__``). Reading stops at the first empty page.

        Every value is converted with ``str()`` and quoted with ``repr()``,
        so a ``None`` column is written as the string ``'None'``.

        :param ms: database helper exposing ``get(t=, cn=, l=)`` and
            ``insert(sql=)``.
        """
        limit = 1000
        columns = list(self.__dict__.keys())
        # Build "(a, b)" explicitly: str(tuple) would emit "(a,)" for a
        # single column, which is not valid SQL.
        column_sql = "(" + ", ".join(columns) + ")"
        start = 0
        while True:
            res = ms.get(t=self._table_name(),
                         cn=list(columns),
                         l=[str(start * limit), str(limit)])
            if not res:
                break
            value_groups = [
                "(" + ", ".join(repr(str(x)) for x in r) + ")" for r in res
            ]
            sql = ("insert into tb_master_record " + column_sql + " values " +
                   ",".join(value_groups))
            ms.insert(sql=sql)
            start += 1


if __name__ == '__main__':
    # Manual entry point: copy the TBMasterItem table into the record table
    # using the TEST_SERVER_DB_TEST connection settings.
    from db.my_sql import MySql
    from settings import TEST_SERVER_DB_TEST

    record_db = MySql(db_setting=TEST_SERVER_DB_TEST)
    TBMasterItem().save_to_record(record_db)