Example #1
async def taks_1(browser, delay_order_spider, detail_page_spider, manager_page_spider, from_store, link_id_spider,
                 list_page_spider):
    page_num = 1
    while True:
        try:
            completed = await list_page_spider.get_page(page_num)
            if completed == 1:
                page_num += 1
            elif completed == 2:
                # reset the isDetaildown / isVerify flags for this store and start over from page 1
                MySql.cls_update(t="tb_order_spider", set={"isDetaildown": 0},
                                 c={"isDetaildown": 2, "fromStore": from_store})
                MySql.cls_update(t="tb_order_spider", set={"isVerify": 0},
                                 c={"isVerify": 2, "fromStore": from_store})
                page_num = 1
            elif completed == 'exit':
                break
            await my_async_sleep(20, random_sleep=True)
            await link_id_spider.save_link_id()
            await manager_page_spider.do_it()
            await detail_page_spider.get_page()
            exit_loop = await delay_order_spider.get_page()
            if exit_loop == 'exit':
                break
        except Exception as e:
            logger.error(str(e))
            break
    await browser.close()
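
The loop above relies on a my_async_sleep helper that is not part of the snippet. A minimal sketch of such a jittered async sleep, assuming random_sleep simply randomises the delay, could look like this (the 0.5x-1.5x range is an assumption):

import asyncio
import random

async def my_async_sleep(seconds, random_sleep=False):
    # Sketch of the helper used above; the real implementation may differ.
    # With random_sleep=True the delay is jittered so requests look less regular.
    if random_sleep:
        seconds = random.uniform(seconds * 0.5, seconds * 1.5)
    await asyncio.sleep(seconds)
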
Example #2
 async def intercept_request(req):
     if re.search(r'https://item.taobao.com/item.htm', req.url):
         await req.continue_()
     elif re.search('item.taobao.com.*?noitem.htm', req.url):
         # a "noitem" redirect means the listing is gone: mark it XiaJia (delisted) in tb_master
         link_id = re.findall(r"itemid=(\d+)", req.url)[0]
         MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                          t="tb_master",
                          set={
                              "flag": "XiaJia",
                              "isUsed": 1
                          },
                          c={"link_id": link_id})
         await req.abort()
     else:
         await req.abort()
 async def run(cls, login, browser, page, from_store):
     page_num = 1
     list_spider = OrderListPageSpider(login, browser, page, from_store)
     while True:
         completed = await list_spider.get_page(page_num)
         if completed == 1:
             page_num += 1
         elif completed == 2:
             MySql.cls_update(t="tb_order_spider",
                              set={"isDetaildown": 0},
                              c={
                                  "isDetaildown": 2,
                                  "fromStore": from_store
                              })
             MySql.cls_update(t="tb_order_spider",
                              set={"isVerify": 0},
                              c={
                                  "isVerify": 2,
                                  "fromStore": from_store
                              })
             page_num = 1
         await my_async_sleep(15, random_sleep=True)
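
The intercept_request handler in this example only runs once request interception has been enabled and the coroutine has been registered on the page. With pyppeteer, which this code appears to use, the wiring typically looks like the sketch below; the simplified handler and the item URL are placeholders standing in for the ones above:

import asyncio
import re
from pyppeteer import launch

async def intercept_request(req):
    # Simplified stand-in for the handler above (the MySQL update is omitted).
    if re.search(r'https://item\.taobao\.com/item\.htm', req.url):
        await req.continue_()
    else:
        await req.abort()

async def main():
    browser = await launch(headless=True)
    page = await browser.newPage()
    # Interception is off by default; without this call the handler never fires.
    await page.setRequestInterception(True)
    # page.on() takes a plain callback, so the coroutine is scheduled onto the loop.
    page.on('request', lambda req: asyncio.ensure_future(intercept_request(req)))
    await page.goto('https://item.taobao.com/item.htm?id=1')  # hypothetical item URL
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())
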
Example #4
 async def parse(self, html):
     ms = MySql()
     self._item['SpiderDate'] = time_now()
     sku_map = re.search(r'skuMap.*?(\{.*)', html)
     # "此宝贝已下架" means "this item has been delisted"
     match_xia_jia = re.search("此宝贝已下架", html)
     if match_xia_jia:
         self._item['flag'] = "XiaJia"
     if not sku_map:
         MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                          t="tb_master",
                          set={
                              "isUsed": 1,
                              "isMut": 0
                          },
                          c={"link_id": self._item['link_id']})
         res = ms.get_dict(t="prices_tb",
                           c={"link_id": self._item['link_id']})
         if res:
             ms.update(t="prices_tb",
                       set=self._item,
                       c={"link_id": self._item['link_id']})
         else:
             self._item['stockid'] = "no_match"
             self._item['SpiderDate'] = time_ago(minutes=60)
             self._item['need_to_update'] = 1
             ms.insert(t="prices_tb", d=self._item)
         logger.info(self._item)
     else:
         MySql.cls_update(db_setting=TEST_SERVER_DB_TEST,
                          t="tb_master",
                          set={
                              "isUsed": 1,
                              "isMut": 1
                          },
                          c={"link_id": self._item['link_id']})
         doc = PyQuery(html)
         items = list(doc("li[data-value]").items())  # materialise so the "if items" check below actually works
         logger.debug(items)
         attr_map = {}
         if items:
             for item in items:
                 # normalise full-width parentheses in the attribute text to ASCII ones
                 attr_map[item.attr('data-value')] = item.find(
                     'span').text().replace("（", "(").replace("）", ")")
         sku_dict = json.loads(sku_map.group(1))
         count = 1
         for k, v in sku_dict.items():
             sku_result = self._item.copy()
             if self._item['promotionprice'] > 0:
                 discount = round(
                     float(self._item['price_tb']) -
                     float(self._item['promotionprice']), 4)
                 sku_result['promotionprice'] = round(
                     float(v.get('price')) - float(discount), 4)
             else:
                 sku_result['promotionprice'] = 0
             sku_result['skuId'] = v.get('skuId')
             sku_result['price_tb'] = v.get('price')
             sku_result['attribute'] = "-".join([
                 attr_map.get(r) for r in re.sub('^;|;$', "", k).split(";")
             ])
             res = ms.get_dict(t="prices_tb",
                               c={"skuId": sku_result['skuId']})
             if res:
                 ms.update(t="prices_tb",
                           set=sku_result,
                           c={"skuId": sku_result['skuId']})
             else:
                 sku_result['stockid'] = "no_match" + str(count)
                 sku_result['SpiderDate'] = time_ago(minutes=60)
                 sku_result['need_to_update'] = 1
                 ms.insert(t="prices_tb", d=sku_result)
                 count += 1
             logger.info(sku_result)
     del ms
     await self._goto_the_next()
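
The attribute reconstruction in parse() is the least obvious part: each skuMap key is a string of ;propertyId:valueId; pairs that gets translated through the <li data-value> text map. A standalone toy run of the same split-and-join logic, with invented IDs and labels, shows what it produces:

import re

# Invented sample data: skuMap keys look like ";pid:vid;pid:vid;".
attr_map = {"20509:28314": "Color: Red", "1627207:28326": "Size: XL"}
sku_key = ";20509:28314;1627207:28326;"

# Strip the leading/trailing ';', split into "pid:vid" tokens,
# then look each token up in the data-value -> label map.
tokens = re.sub('^;|;$', "", sku_key).split(";")
attribute = "-".join(attr_map.get(t, "") for t in tokens)
print(attribute)  # Color: Red-Size: XL
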
Example #5
    def _get_html(self):
        for shop_id in self._get_shop_id():
            page_num, used_page_nums, total_page, sp_time = self._get_page_num(
                shop_id)
            session = requests.Session()
            while page_num:
                time.sleep(2)
                curl = self._get_curls(shop_id)
                if not curl:
                    time.sleep(30)
                    continue
                start_time = time.time()
                delete(flag='tspi')
                url, params, cookies, headers = self.format_request_params(
                    curl['curl'], page_num)
                while True:
                    try:
                        proxy = read("proxy")
                        logger.info(proxy)
                        if not proxy:
                            # no proxy cached yet: fetch one, then re-read it on the next pass
                            self._set_proxy()
                            continue
                        proxies = {"https": "https://{}".format(proxy)}
                        r = session.get(url=url,
                                        params=params,
                                        cookies=cookies,
                                        headers=headers,
                                        proxies=proxies,
                                        stream=True,
                                        timeout=30)
                    except Exception as e:
                        logger.error(str(e))
                        self._set_proxy()
                        session = requests.Session()
                        continue
                    else:
                        break
                try:
                    html = r.text.replace("\\", "")
                except requests.exceptions.ChunkedEncodingError:
                    continue
                except requests.exceptions.ConnectionError:
                    continue
                html = re.sub(r'jsonp\d+\("|"\)', "", html)  # strip the jsonpNNN("...") wrapper
                yield html, shop_id, used_page_nums, total_page, page_num
                spent_time = int(time.time() - start_time) + sp_time
                tspi = read(flag="tspi")
                if tspi:
                    tspi['spent_time'] = spent_time
                    MySql.cls_update(db_setting=test_db,
                                     t="tb_search_page_info",
                                     set=tspi,
                                     c={"shop_id": shop_id})
                page_num, used_page_nums, total_page, sp_time = self._get_page_num(
                    shop_id)
            sql = "UPDATE tb_master SET flag='XiaJia',update_date='{}' WHERE shop_id='{}' AND update_date<'{}'".format(
                datetime.date.today(), shop_id, datetime.date.today())
            MySql.cls_update(db_setting=test_db, sql=sql)

        if SEARCH_PAGE_REPORT:
            reports = Reports()
            reports.report([ids for ids in self._get_shop_id()])
            ms = MySql(db_setting=test_db)
            t = TBMasterItem()
            t.save_to_record(ms)
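
The normalisation in _get_html() (drop the escaping backslashes, then strip the jsonpNNN("...") wrapper) is what turns the JSONP search response into plain JSON. A small self-contained demonstration with a made-up response body:

import json
import re

# Invented sample of a JSONP-wrapped response body.
raw = 'jsonp907("{\\"total_page\\":\\"3\\",\\"total_results\\":\\"120\\"}")'

# Same two steps as _get_html(): remove the escaping backslashes,
# then strip the jsonpNNN("...") wrapper so the payload parses as JSON.
text = raw.replace("\\", "")
text = re.sub(r'jsonp\d+\("|"\)', "", text)
data = json.loads(text)
print(data["total_page"])  # -> 3
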