def save_shop(self, connect, **shop):
    """Upsert one shop record through ``JoomShop.raw_upsert``.

    :param connect: connection object handed straight to ``raw_upsert``.
    :param shop: field values forwarded as keyword arguments.
    :return: ``True`` when the upsert completed, ``False`` when it raised
        (the traceback is written to the error log).
    """
    try:
        JoomShop.raw_upsert(connect, **shop)
        return True
    except Exception:
        # Best-effort save: report the failure to the caller via the return
        # value, but let KeyboardInterrupt / SystemExit propagate.
        logger.error(traceback.format_exc())
        return False
def save_body(self, connect, **body):
    """Upsert one product-body record through ``ProductBody.raw_upsert``.

    :param connect: connection object handed straight to ``raw_upsert``.
    :param body: field values forwarded as keyword arguments.
    :return: ``True`` when the upsert completed, ``False`` when it raised
        (the traceback is written to the error log).
    """
    try:
        ProductBody.raw_upsert(connect, **body)
        return True
    except Exception:
        # Best-effort save: report the failure to the caller via the return
        # value, but let KeyboardInterrupt / SystemExit propagate.
        logger.error(traceback.format_exc())
        return False
def save_product(self, connect, **product):
    """Upsert one product record through ``JoomPro.raw_upsert``.

    :param connect: connection object handed straight to ``raw_upsert``.
    :param product: field values forwarded as keyword arguments.
    :return: ``True`` when the upsert completed, ``False`` when it raised
        (the traceback is written to the error log).
    """
    try:
        JoomPro.raw_upsert(connect, **product)
        return True
    except Exception:
        # Best-effort save: report the failure to the caller via the return
        # value, but let KeyboardInterrupt / SystemExit propagate.
        logger.error(traceback.format_exc())
        return False
def product_info(self, **kwargs):
    """Fetch the detail payload of one product and persist it.

    The product id is read from ``kwargs["key"]``. The detail URL is requested
    with a copy of ``self.headers`` minus ``content-type``. When the raw
    response looks unauthorized or carries no payload, ``joom_token()`` is
    called and the request is repeated. If a request raises, it is retried
    once; a second failure marks the task ``PEND``.

    On success the body, product and shop records produced by
    ``self.trans_pro`` are saved, a "rev" task is upserted when the review
    count warrants it, and the "item" task is marked ``DONE``.

    :param kwargs: must contain ``key`` (the product id).
    :return: ``False`` when the request failed twice; ``True`` otherwise
        (including the case where the response has no ``payload`` and the
        task is put back to ``PEND``).
    """
    pid = kwargs["key"]
    url = self.product_url % (pid, random_key(4))
    headers = self.headers.copy()
    del headers["content-type"]
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # ``res.content`` is raw bytes; bytes literals keep this membership
        # test valid on Python 3 (and are plain ``str`` on Python 2), so the
        # token refresh below is actually reachable.
        if b"unauthorized" in res.content or b"payload" not in res.content:
            joom_token()
            # NOTE(review): ``headers`` was copied before joom_token() ran --
            # confirm the refreshed token does not live in self.headers.
            res = requests.get(url, headers=headers, timeout=10)
    except Exception:
        # One retry for any failure above; give up after the second error.
        try:
            res = requests.get(url, headers=headers, timeout=10)
        except Exception:
            TaskSchedule.raw_set(31, "item", pid, TaskSchedule.PEND)
            return False

    # NOTE(review): a non-JSON body raises here and leaves the task status
    # untouched -- confirm the caller copes with that.
    content = json.loads(res.content)
    if "payload" not in content:
        logger.error("tag: %s, payload not in content: %s" % (pid, content))
        TaskSchedule.raw_set(31, "item", pid, TaskSchedule.PEND)
        return True

    pro_body, shop_info, pro_info = self.trans_pro(content)
    connect = db.connect()
    try:
        reviews_count = pro_info["reviews_count"]
        # Schedule a review task when there are more than 99 reviews, or more
        # than JoomPro.pro_review_cnt reports for this product.
        if reviews_count and (
                reviews_count > 99
                or reviews_count > JoomPro.pro_review_cnt(pid)):
            TaskSchedule.raw_upsert(connect, pid, "rev", 31)
        self.save_body(connect, **pro_body)
        self.save_product(connect, **pro_info)
        self.save_shop(connect, **shop_info)
        TaskSchedule.raw_set(31, "item", pid, TaskSchedule.DONE)
    finally:
        # Always release the connection, even if one of the calls above raises.
        connect.close()
    return True
def raw_batch_save_item(self, slice_items):
    """Upsert a batch of item ids as "item" tasks.

    :param slice_items: collection of item ids; an empty (or ``None``) batch
        is treated as nothing to do.
    :return: ``True`` when the batch was empty or
        ``TaskSchedule.raw_pure_upsert`` completed, ``False`` when it raised
        (the traceback is written to the error log).
    """
    try:
        if not slice_items:
            return True
        TaskSchedule.raw_pure_upsert(slice_items, "item", 31)
        return True
    except Exception:
        # Report failure through the return value; KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        logger.error(traceback.format_exc())
        return False
def restore_cate_items_task(self):
    """Hand every batch scanned from "cate#items" to ``raw_batch_save_item``.

    Batches come from ``cc.sscan_iter("cate#items", count=300, batch=500)``
    and each one is submitted to a 32-worker thread pool. Any exception a
    worker raises is logged together with the batch that caused it; the
    worker's return value is not inspected.
    """
    logger.debug("saving the cate items ...")
    with futures.ThreadPoolExecutor(max_workers=32) as pool:
        # Map each submitted future back to its batch for error reporting.
        pending = {}
        for batch in cc.sscan_iter("cate#items", count=300, batch=500):
            pending[pool.submit(self.raw_batch_save_item, batch)] = batch

        for finished in futures.as_completed(pending):
            batch = pending[finished]
            try:
                finished.result()
            except Exception as exc:
                logger.error("%r generated an exception: %s" % (batch, exc))
    logger.debug("saved ok @@@")
def batch_product_ids(self, **kwargs):
    """Fetch one page of product ids for a category and record progress.

    ``kwargs["key"]`` is the category id. ``kwargs["value"]``, when set, is
    the state stored by a previous call in the form ``"<pageToken>#<times>"``.

    On a 200 response that carries a payload with a ``nextPageToken`` (and
    while ``times`` is at most 10000) the returned ids are added to the
    "cate#items" set and the "cate" task is updated with the next token:
    status ``DONE`` when the page held fewer than ``count`` ids, ``INIT``
    otherwise. Any other 200 response marks the task ``DONE``; a non-200
    response is logged and the task is set to ``PEND``.

    :param kwargs: ``key`` (category id) and optional ``value`` (page state).
    :return: always ``True``.
    """
    cate = kwargs.get("key")
    page_state = kwargs.get("value", None)
    if not page_state:
        pgToken = None
        times = 0
    else:
        # Split on the LAST "#" only: the page token is opaque and may itself
        # contain "#", while the trailing counter never does.
        pgToken, times = page_state.rsplit("#", 1)
        times = int(times)

    count = 48
    # Request body: fetch the product id list for the given category.
    data = {
        "count": count,
        "filters": [
            {
                "id": "categoryId",
                "value": {
                    "type": "categories",
                    "items": [
                        {
                            "id": cate
                        }
                    ]
                }
            }
        ]
    }
    if pgToken:
        data["pageToken"] = pgToken
    data_str = json.dumps(data)
    url = self.batch_url % random_key(4)
    try:
        res = requests.post(url, data_str, headers=self.headers, timeout=15)
    except Exception:
        # Single retry; a second failure propagates to the caller.
        res = requests.post(url, data_str, headers=self.headers, timeout=15)

    # NOTE(review): a non-JSON body raises here before the error branch below
    # can set the task to PEND -- confirm the caller copes with that.
    content = json.loads(res.content)
    if (res.status_code == 200 and "payload" in content and times <= 10000
            and "nextPageToken" in content["payload"]):
        payload = content["payload"]
        items = [it["id"] for it in payload["items"]]
        if items:
            cc.sadd("cate#items", *items)
        # A short page is recorded as DONE; a full page goes back to INIT.
        if len(items) < count:
            status = TaskSchedule.DONE
        else:
            status = TaskSchedule.INIT
        result = TaskSchedule.raw_update(
            31, "cate", cate,
            value=payload["nextPageToken"] + "#" + str(times + 1),
            status=status)
        if not result:
            logger.error("cate update error with tag: %s" % cate)
            TaskSchedule.raw_set(31, "cate", cate, TaskSchedule.PEND)
    elif res.status_code == 200:
        # 200 but no payload / no next token / page limit exceeded.
        TaskSchedule.raw_set(31, "cate", cate, TaskSchedule.DONE)
    else:
        logger.error("get cate products error: cate: %s, times: %s" % (cate, times))
        logger.error(content)
        TaskSchedule.raw_set(31, "cate", cate, TaskSchedule.PEND)
    return True