def fetch_cate_pro(token, cate_id, pgToken=None, times=1):
    """Request one page of products for a Joom category.

    Only the beginning of this function is visible here: it builds the
    search payload, logs the page being fetched and issues the POST request
    (retrying once if the first attempt raises).  Whatever is done with the
    response happens outside this view.

    :param token: value sent in the ``authorization`` header
    :param cate_id: id used in the ``categoryId`` filter
    :param pgToken: optional page token, sent as ``pageToken`` when truthy
    :param times: page counter, only used in the log message here
    """
    # NOTE(review): "en-US¤cy=USD" looks like an HTML-entity-mangled
    # "en-US&currency=USD" ("&curren" rendered as "¤") - confirm and repair.
    url = 'https://api.joom.com/1.1/search/products?language=en-US¤cy=USD&_=jfs3%s'
    params = {
        'count': 50,
        'filters': [{
            'id': 'categoryId',
            'value': {
                'type': 'categories',
                'items': [{
                    'id': cate_id
                }]
            }
        }]
    }
    if pgToken:
        params["pageToken"] = pgToken
    logger.info(u"正在抓取分类%s下第%s页产品" % (cate_id, times))
    try:
        # random_key(4) fills the "%s" placeholder of the "_" query parameter.
        res = requests.post(url % random_key(4), data=json.dumps(params), headers={
            "authorization": token,
            "content-type": 'application/json'
        }, timeout=15)
    except Exception, e:
        # Single retry; an exception raised by this second request is not caught here.
        res = requests.post(url % random_key(4), data=json.dumps(params), headers={
            "authorization": token,
            "content-type": 'application/json'
        }, timeout=15)
def fetch_cate(token, p_tag=None, level=1, p_id=0):
    """Fetch the children of a Joom category, save them and fan out.

    Every child returned by the ``categoriesHierarchy`` endpoint is stored
    through ``Category.raw_save``.  Children that have public children are
    queued for another ``fetch_cate`` run one level deeper; the others are
    treated as leaves and queued for ``fetch_cate_pro``.

    :param token: value sent in the ``authorization`` header
    :param p_tag: Joom id of the category to expand; omitted from the
        request when falsy
    :param level: level passed to ``Category.raw_save`` for the children
    :param p_id: parent id passed to ``Category.raw_save`` for the children
    :return: None
    """
    url = 'https://api.joom.com/1.1/categoriesHierarchy'
    params = {
        'levels': 1,
        'parentLevels': 1,
        'language': 'en-US',
        'currency': 'USD'
    }
    if p_tag:
        params["categoryId"] = p_tag
    logger.info(u"正在采集id为%s的分类" % p_tag)
    logger.info(u"参数为%s" % params)

    # A timeout keeps a stalled connection from blocking the worker forever;
    # the other fetchers in this file already pass one.
    res = requests.get(url, params=params,
                       headers={"authorization": token}, timeout=15)

    if "unauthorized" in res.content:
        # Token rejected: get a fresh one and re-queue the same work.
        token = get_joom_token()
        fetch_cate.delay(token, p_tag, level, p_id)
        return

    if res.status_code != 200:
        # Previously this case was dropped without any trace.
        logger.error("fetch_cate: unexpected HTTP status %s for category %s"
                     % (res.status_code, p_tag))
        return

    children = json.loads(res.content)["payload"]["children"]
    for child in children:
        tag = child['id']
        name = child['name']
        is_leaf = 0 if child["hasPublicChildren"] else 1
        # NOTE(review): the meaning of the constant 31 is not visible here.
        # The return value is used as the parent id of the next level.
        saved_id = Category.raw_save(tag, name, p_id, is_leaf, level, 31)
        if is_leaf:
            fetch_cate_pro.delay(token, tag)
        else:
            fetch_cate.delay(token, p_tag=tag, level=level + 1, p_id=saved_id)
def get_current_user(self):
    """Look up the user whose id is stored in the handler session.

    Any exception raised on the way (for example a missing ``user_id``
    key) is logged at info level and ``None`` is returned instead.
    """
    try:
        uid = self.session["user_id"]
        with sessionCM() as db_session:
            return User.find_by_id(db_session, uid)
    except Exception as e:
        logger.info(e.message)
    return None
def upsert_review(review):
    """Insert one review row into ``joom_review``.

    The statement is ``insert ignore``, so a row that collides with an
    existing key is left untouched.  Failures are logged and swallowed
    (best effort); the connection is always released.

    :param review: mapping that provides ``review_no`` and every bind
        parameter named in the statement
    :return: None
    """
    logger.info(u"正在插入评论, no为%s" % review["review_no"])
    connect = db.connect()
    try:
        sql = text(
            'insert ignore into joom_review '
            '(review_no,create_time,update_time,pro_no,variation_id,user_no,'
            'joom_review.language,origin_text,new_text,order_id,is_anonymous,'
            'colors,star,shop_no,photos) '
            'VALUES (:review_no,:create_time,:update_time,:pro_no,:variation_id,'
            ':user_no,:language,:origin_text,:new_text,:order_id,:is_anonymous,'
            ':colors,:star,:shop_no,:photos) ')
        cursor = connect.execute(sql, **review)
        cursor.close()
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
    finally:
        # Closing the result does not release the connection itself.
        connect.close()
def upsert_user(user):
    """Insert one reviewer row into ``joom_user``.

    The statement is ``insert ignore``, so a row that collides with an
    existing key is left untouched.  Failures are logged and swallowed
    (best effort); the connection is always released.

    :param user: mapping with ``user_no``, ``full_name`` and ``images``
    :return: None
    """
    logger.info(u"正在插入用户, no为%s" % user["user_no"])
    connect = db.connect()
    try:
        sql = text(
            'insert ignore into joom_user (user_no, full_name, images) '
            'values (:user_no, :full_name, :images)')
        cursor = connect.execute(sql, **user)
        cursor.close()
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
    finally:
        # Closing the result does not release the connection itself.
        connect.close()
def upsert_pro(pro):
    """Insert a product into ``joom_pro`` or refresh the existing row.

    On a duplicate key the name, category, rating, price fields, review
    count and ``update_time`` are overwritten; the counters and
    ``cate_id1..5`` columns are only initialised on first insert.
    Failures are logged and swallowed (best effort); the connection is
    always released.

    :param pro: mapping that provides ``pro_no`` and every bind parameter
        named in the statement
    :return: None
    """
    logger.info(u"正在插入产品, no为%s" % pro["pro_no"])
    connect = db.connect()
    try:
        sql = text(
            'insert into joom_pro '
            '(joom_pro.name,pro_no,shop_no,category_id,image,rate,msrp,discount,'
            'real_price,reviews_count,create_time,update_time,'
            'cate_id1,cate_id2,cate_id3,cate_id4,cate_id5,'
            'origin_price,r_count_30,r_count_7,r_count_7_14,growth_rate,save_count) '
            'values (:name,:pro_no,:shop_no,:category_id,:image,:rate,:msrp,:discount,'
            ':real_price,:reviews_count,:create_time,:update_time,'
            '"","","","","",0,0,0,0,0,0) '
            'on duplicate key update joom_pro.name=:name,category_id=:category_id,'
            'rate=:rate,msrp=:msrp,discount=:discount,real_price=:real_price,'
            'reviews_count=:reviews_count,update_time=:update_time;'
        )
        cursor = connect.execute(sql, **pro)
        cursor.close()
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
    finally:
        # Closing the result does not release the connection itself.
        connect.close()
def upsert_shop(shop):
    """Insert a shop into ``joom_shop`` or refresh the existing row.

    On a duplicate key the rating, save count, timestamps and
    verification flag are overwritten; the remaining counters are only
    initialised on first insert.  Failures are logged and swallowed
    (best effort); the connection is always released.

    :param shop: mapping that provides ``shop_no`` and every bind
        parameter named in the statement
    :return: None
    """
    logger.info(u"正在插入店铺, no为%s" % shop["shop_no"])
    connect = db.connect()
    try:
        # NOTE(review): the duplicate-key branch also overwrites create_time;
        # confirm that this is intended.
        sql = text(
            'insert into joom_shop '
            '(joom_shop.name,shop_no,logo,rate,save_count,create_time,update_time,'
            'is_verify,pro_count,reviews_count,r_count_30,r_count_7,r_count_7_14,'
            'growth_rate,cate_id) '
            'values (:name,:shop_no,:logo,:rate,:save_count,:create_time,:update_time,'
            ':is_verify,0,0,0,0,0,0,"") '
            'on duplicate key update rate=:rate, save_count=:save_count, '
            'create_time=:create_time, update_time=:update_time, is_verify=:is_verify;'
        )
        cursor = connect.execute(sql, **shop)
        cursor.close()
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
    finally:
        # Closing the result does not release the connection itself.
        connect.close()
def send_mail(subject, text, to=list(), cc=list(), bcc=list(), name='smtp.qq.com', account='*****@*****.**', password='******' ):
    """Send an HTML mail over SMTP-over-SSL (port 465).

    Background task for sending mail; it can also be called on its own.
    Delivery failures are logged and swallowed, nothing is raised.

    :param subject: subject line
    :param text: HTML body; a dict is flattened to ``"k:v;k:v"`` first
    :param to: list of primary recipients
    :param cc: list of carbon-copy recipients
    :param bcc: list of blind-copy recipients
    :param name: SMTP host name
    :param account: login name, also used as the sender address
    :param password: login password
    :return: None
    :raises AssertionError: if ``to``, ``cc`` or ``bcc`` is not a list
    """
    assert isinstance(to, list)
    assert isinstance(cc, list)
    assert isinstance(bcc, list)
    fro = "网站myweb<%s>" % account

    # Work on a copy: extending ``to`` in place would modify the caller's
    # list and, for the default argument, leak recipients into later calls.
    real_to = list(to)

    msg = MIMEMultipart()
    msg["From"] = fro
    msg["Subject"] = subject
    msg["To"] = COMMASPACE.join(to)
    if cc:
        msg["Cc"] = COMMASPACE.join(cc)
        real_to += cc
    if bcc:
        # Blind-copy recipients only go into the envelope.  sendmail()
        # transmits the message headers as they are, so a "Bcc" header
        # would reveal them to every other recipient.
        real_to += bcc
    msg["Date"] = formatdate(localtime=True)

    if isinstance(text, dict):
        text = ";".join("%s:%s" % (str(k), str(v)) for k, v in text.items())
    msg.attach(MIMEText(text, "html", _charset="UTF8"))

    try:
        smtp = smtplib.SMTP_SSL(name, 465, timeout=20)
        try:
            smtp.login(account, password)
            smtp.sendmail(fro, real_to, msg.as_string())
            smtp.quit()
        except Exception:
            # Do not leave the socket open when login or delivery fails.
            smtp.close()
            raise
    except Exception:
        logger.info("this time is to send content: %s" % text)
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
def fetch_review(tag, token, page_token=None):
    """Fetch one page of reviews for a product and store them.

    Reviews and their authors are written through ``upsert_review`` and
    ``upsert_user`` on thread pools.  When the response carries a
    ``nextPageToken`` and the page was not empty, the next page is queued.

    :param tag: Joom product id
    :param token: value sent in the ``authorization`` header
    :param page_token: optional page token, sent as ``pageToken`` when truthy
    :return: the result of ``fetch_review.delay`` when a next page is
        queued, otherwise None
    """
    # "&currency=USD" had been mangled into "¤cy=USD" (the "&curren" HTML
    # entity), which glued a broken value onto the language parameter.
    # NOTE(review): the URL already carries "=all&count=1000&sort=top" while
    # ``params`` sends filter_id/count=200/sort again - confirm which is meant.
    url = "https://api.joom.com/1.1/products/%s/reviews?=all&count=1000&sort=top&language=en-US&currency=USD&_=jfs3%s" % (tag, random_key(4))
    params = {
        "filter_id": "all",
        "count": 200,
        "sort": "top"
    }
    if page_token:
        params["pageToken"] = page_token
    logger.info(u"正在第%s次抓取产品%s的评论, 参数为%s" % (1, tag, params))

    headers = {"authorization": token}
    try:
        res = requests.get(url, params=params, headers=headers, timeout=20)
    except Exception:
        # Single retry; a failure of the second attempt propagates.
        res = requests.get(url, params=params, headers=headers, timeout=20)

    if "unauthorized" in res.content:
        # Token rejected: get a fresh one and re-queue the same page.
        token = get_joom_token()
        fetch_review.delay(tag, token, page_token)
        return

    payload = res.json().get("payload")
    if not payload:
        logger.info(u"抓取产品%s的评论失败, 参数为%s" % (tag, params))
        return

    reviews = payload["items"]
    # The third value returned by retrieve_review is not needed here.
    review_datas, review_users, _ = retrieve_review(reviews)
    _upsert_in_pool(upsert_review, "review", review_datas)
    _upsert_in_pool(upsert_user, "user", review_users)

    next_token = payload.get("nextPageToken")
    if next_token and len(reviews):
        return fetch_review.delay(tag, token, page_token=next_token)


def _upsert_in_pool(upsert, arg_name, rows, max_workers=32):
    """Call ``upsert(<arg_name>=row)`` for every row on a thread pool, logging failures."""
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {
            executor.submit(upsert, **{arg_name: row}): row
            for row in rows
        }
        for future in futures.as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                logger.error("%s generated an exception: %s" % (pending[future], exc))
def aps_callback(req_type, url, data, job_id, other_kwargs=None, retry=1):
    """Scheduler callback: call the configured URL and record the outcome.

    TODO: add a call record

    The request is attempted until it succeeds or the attempt counter
    reaches 3.  The matching ``Scheduler`` row is updated with the
    resulting status and, when it is known, the job's next run time.

    :param req_type: request method, post|get
    :param url: request URL
    :param data: request data (form body for post, query string otherwise)
    :param job_id: id of the job in the scheduler
    :param other_kwargs: other arguments (unused)
    :param retry: number of the first attempt
    :return: the decoded JSON response, or ``{"status": 0, "mess": ...}``
        once every attempt has failed
    """
    with sessionCM() as session:
        sched = Scheduler.find_by_scheduler_id(session, job_id)

        # get_job() returns None for an unknown (e.g. already removed) job
        # and next_run_time can be None, so neither may be dereferenced blindly.
        job = scheduler.get_job(job_id)
        run_time = getattr(job, "next_run_time", None)
        next_run_time = (run_time.strftime("%Y-%m-%d %H:%M:%S")
                         if run_time is not None else None)

        def record(**fields):
            # Persist the outcome; leave next_run_time alone when it is unknown.
            if sched is None:
                logger.error("no scheduler record found for job %s" % job_id)
                return
            if next_run_time is not None:
                fields["next_run_time"] = next_run_time
            sched.update(session, **fields)

        # Retrying in a loop keeps everything inside one session instead of
        # opening a nested one per attempt.
        attempt = retry
        while True:
            logger.info("正在执行scheduler回调, 第%s次s请求:" % attempt)
            logger.info(url)
            logger.info(data)
            try:
                if req_type == "post":
                    res = requests.post(url=url, data=data)
                else:
                    res = requests.get(url=url, params=data)
                res = res.json()
                if res["status"]:
                    one_shot = sched is not None and sched.trigger == "date"
                    record(status=END if one_shot else ACTIVE)
                else:
                    record(status=ERROR, err_mess=res["message"])
                return res
            except Exception as e:
                if attempt < 3:
                    attempt += 1
                    continue
                logger.info("POST fail {0}".format(e.message))
                record(status=ERROR, err_mess="POST fail {0}".format(e.message))
                return {"status": 0, "mess": e.message}
def on_pong(self, data):
    """Log that the reply to our ping arrived; ``data`` is not used."""
    message = "receive a response of my ping"
    logger.info(message)
def on_request(self):
    """Log details of the incoming request, then answer with a 404 error.

    Logged, in order: the request arguments, the full URL, the
    ``X-Real-IP`` header and the ``user_id`` stored in the session.
    """
    request = self.request
    logger.info(request.arguments)
    logger.info(request.full_url())
    # The header is absent when no proxy sets it; indexing would raise
    # KeyError and the 404 below would never be written.
    logger.info(request.headers.get("X-Real-IP"))
    logger.info(self.session.get("user_id"))
    self.write_error(404)
def str_to_unicode(word):
    """Convert ``word`` with ``to_unicode``.

    If that raises, the error message is logged and the value is decoded
    with the ``unicode-escape`` codec instead.
    """
    try:
        converted = to_unicode(word)
    except Exception as e:
        logger.info(e.message)
        converted = word.decode("unicode-escape")
    return converted
def add_my_job(self, trigger, res_type, res_url, job_id, job_name, schedule_args, schedule_type, func_args, user_id=0, remark="", job_store="default"):
    """Register a job with the scheduler and mirror it in the ``Scheduler`` table.

    :param trigger: trigger name; must be one of ``TRIGGER_LIST``
    :param res_type: request method handed to the job callback
    :param res_url: URL handed to the job callback
    :param job_id: job id; a uuid1 string is generated when falsy
    :param job_name: job name, stored as ``action``
    :param schedule_args: trigger arguments, stored JSON-encoded as ``extra``
    :param schedule_type: stored as both ``mold`` and ``type``
    :param func_args: data handed to the job callback, stored JSON-encoded
    :param user_id: owner of the job
    :param remark: free-text note
    :param job_store: name of the job store to use
    :return: ``{"status": 1, "message": ...}`` on success,
        ``{"status": 0, "message": ...}`` on failure
    :raises ErrorArgumentError: if ``trigger`` is not in ``TRIGGER_LIST``
    """
    if trigger not in TRIGGER_LIST:
        raise ErrorArgumentError
    job_id = job_id or str(uuid.uuid1())
    callback_args = [res_type, res_url, func_args, job_id]
    try:
        job = self.add_job_to_scheduler(trigger, job_id, job_name,
                                        schedule_args, callback_args, job_store)
        with sessionCM() as session:
            _scheduler = Scheduler.find_by_scheduler_id(session, job.id)
            info = {
                "request_url": res_url,
                "trigger": trigger,
                "action": job_name,
                "args": json.dumps(func_args),  # JSON-serialised arguments
                "mold": schedule_type,
                "type": schedule_type,  # sys
                "next_run_time": job.next_run_time.strftime("%Y-%m-%d %H:%M:%S"),
                "user_id": user_id,
                "scheduler_id": job.id,
                "extra": json.dumps(schedule_args),  # trigger-specific arguments
                "remark": remark,
                "status": ACTIVE
            }
            # Create the record on first registration, refresh it otherwise.
            if _scheduler:
                _scheduler.update(session, **info)
            else:
                Scheduler.create(session, **info)
        mess = "job_id为%s的任务添加成功" % job_id
        logger.info(mess)
        return {"status": 1, "message": mess}
    except ConflictingIdError:
        mess = "job_id为%s的任务已经存在" % job_id
        logger.error(mess)
        return {"status": 0, "message": mess}
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object.
        logger.error(traceback.format_exc())
        return {"status": 0, "message": "添加任务失败"}
def print_datetime():
    """Write the current local time to stdout and to the log."""
    # Read the clock once so both outputs show the same instant.
    now = datetime.datetime.now()
    # The call form with a single argument behaves the same on Python 2 and 3.
    print(now)
    logger.info(now)
res = requests.post(url % random_key(4), data=json.dumps(params), headers={ "authorization": token, "content-type": 'application/json' }, timeout=15) if "unauthorized" in res.content: token = get_joom_token() fetch_cate_pro.delay(token, cate_id, pgToken, times) return content = json.loads(res.content) items = content["payload"]["items"] if len(items) == 0: logger.info(u"分类%s抓取完成!" % cate_id) else: for item in items: logger.info(u'产品id为%s' % item["id"]) fetch_review.delay(item["id"], token) with futures.ThreadPoolExecutor(max_workers=16) as executor: future_to_user = { executor.submit(fetch_pro, tag=item["id"], token=token): item["id"] for item in items } for future in futures.as_completed(future_to_user): rev_pro = future_to_user[future] try: rp = future.result() except Exception as exc:
pro_data["discount"] = parent_info["discount"] pro_data["Description"] = item["description"] pro_data["ProductSKUs"] = list() pro_data["images"] = get_images(item) pro_data["ProductSKUs"] = get_variants(item["variants"]) return pro_data, shop_info, pro_info @celery.task(ignore_result=True) def fetch_pro(tag, token): data_url = 'https://api.joom.com/1.1/products/%s?language=en-US¤cy=USD' % tag try: res = requests.get(data_url, headers={"authorization": token}, timeout=5) except Exception, e: res = requests.get(data_url, headers={"authorization": token}, timeout=5) if "unauthorized" in res.content: token = get_joom_token() fetch_pro(tag, token) return content = json.loads(res.content) pro_data, shop_info, pro_info = trans_pro(content) upsert_shop(shop_info) upsert_pro(pro_info) logger.info(u"产品%s保存成功!" % tag)