def update_taobao_volume(db, shop_id, shop_type, total_items):
    """Refresh item sales volumes for one shop from freshly crawled list data.

    Reads current (num_id -> volume) for the shop, then for Taobao shops
    (shop_type == 1) writes back any volume that changed by more than 10%
    or by more than 20 units.  Tmall (shop_type == 2) is a no-op for now.

    :param db: raw MySQLdb-style connection (store_result/fetch_row API)
    :param shop_id: numeric shop id (interpolated into SQL; trusted, from DB)
    :param shop_type: 1 = taobao, 2 = tmall (per the branches below)
    :param total_items: crawled items, dicts with 'iid' and 'sales_amount'
    """
    # fetch current volumes & price
    logger.info("fetching current volumes %s", shop_id)
    do_query(db, "select num_id, volume, price from item where shop_id=%s" % shop_id)
    results = db.store_result()
    iid_volumes = {}
    for row in results.fetch_row(maxrows=0):
        iid_volumes[row[0]] = row[1]
    # update taobao volume, not tmall
    if FLAGS.updatevolume and shop_type == 1:
        # batch all updates into one explicit transaction
        db.autocommit(False)
        db.query("set autocommit=0;")
        # update volume
        logger.info("updating item volume %s", shop_id)
        for item in total_items:
            # Fix: use `in` instead of deprecated dict.has_key(), and skip
            # unknown items *before* computing the diff (was computed first).
            if item['iid'] not in iid_volumes:
                continue
            new_value = item['sales_amount']
            old_value = iid_volumes.get(item['iid'], 0) or 0
            diff_v = abs(new_value - old_value)
            # 10% or 20 changed, update
            if new_value > 0 and new_value != old_value and (
                    old_value == 0 or diff_v > 20 or diff_v * 1.0 / old_value > 0.1):
                logger.debug("updating item %s %s %s -> %s", shop_id,
                             item['iid'], old_value, new_value)
                do_query(db,
                         "update item set volume=%s where num_id=%s and shop_id=%s"
                         % (new_value, item['iid'], shop_id))
                Statsd.increment('guang.crawl.volume_update_onlist')
        db.commit()
    # TODO: update tmall total volumes
    if FLAGS.updatevolume and shop_type == 2:
        pass
def convert_taobaoke_widget(items, fn_join_iids=lambda x:','.join(x), batch_size=40, calllimit=60, expire=600, outer_code='jcn', appkey=TAOBAOKE_APPKEY, appsec=TAOBAOKE_APPSECRET):
    """Generator: convert item ids to taobaoke (affiliate) click urls in batches.

    Calls the TOP widget REST endpoint (taobao.taobaoke.widget.items.convert)
    for chunks of `items`, throttled to `calllimit` calls per minute via
    waitlimit().  Yields (chunk, raw_jsonp_response_or_None) per batch.

    NOTE(review): the HMAC signature and timestamp are computed once, outside
    the loop, so every request reuses the same `ts`/`sign` pair — presumably
    acceptable to the API for the lifetime of one call; confirm before reuse
    in long-running loops.  `expire` appears unused here.
    """
    ts = int(time.time()*1000)
    # TOP sign: hmac(secret, 'app_key' + key + 'timestamp' + ts), hex uppercased
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent' : DEFAULT_UA, 'Referer' : "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0, chunks(items, batch_size)): # calllimit for minutes
        params = {'app_key' : appkey,
                  '_t_sys' : 'args=4',
                  'method' : 'taobao.taobaoke.widget.items.convert',
                  'sign' : sign,
                  'timestamp' : ts,
                  'fields' : "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
                  # random JSONP callback name, md5-derived
                  'callback' : 'TOP.io.jsonpCbs.t%s' % md5( str(random.random()) ).hexdigest()[:13],
                  'partner_id' : 'top-sdk-js-20120801',
                  }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
def crawl_one_shop(shop, failed):
    """Crawl one shop's item list page and commit results to the raw DB.

    :param shop: dict with 'is_commit' flag and a 'shop' row tuple
                 (indices used: 0=id, 1=url, 4=type, 5=nick)
    :param failed: list collecting {'shopid', 'err'} dicts on failure

    NOTE(review): if the exception fires before the local assignments at the
    top (e.g. a bad `shop` dict), the handlers below reference unbound names
    (`shop_nick`, `is_commit`, `shop_id`) and will raise NameError themselves.
    """
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5]
        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))
        if is_commit:
            batch_size=100
            total_items = tb.total_items
            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()
            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # double check shop status by taobao api; error 560 = user not found,
        # so mark the shop as offline (status=2) in the DB
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            do_query(db, "update shop set status=2 where id=%s" % shop_id)
            db.commit()
            db.close()
    except:
        # catch-all: record failure for the caller and continue the batch
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(), extra={'tags':['crawlShopException',]})
        failed.append({'shopid':shop_id, 'err':traceback.format_exc()})
def post_one(db, user, select_sql, update_succ_sql, update_fail_sql, table_prefix):
    """Process pending QZone posts for one user.

    Iterates rows from `select_sql`; posts due items immediately and, when
    FLAGS.timer is set, schedules future items via QZone's timer API.
    Marks rows done/failed with the provided UPDATE statements (which are
    %-formatted with the post id).

    NOTE(review): row layout assumed from the index accesses below —
    0=post_id, 1=content, 2=filename, 3=post_ts, 4=qqid, 5=cookie_file,
    6=sid; verify against the caller's SELECT.
    """
    now = datetime.datetime.now()
    # `next` tracks the earliest future schedule seen (sentinel far future)
    next = datetime.datetime(2020, 1, 1)
    total = 0
    succ = 0
    failed = 0
    skip = 0
    logger.debug("querying %s", select_sql)
    results = db.execute(select_sql)
    logger.debug("processing post for user %s, total %s, sql %s", user, results.rowcount, select_sql)
    for item in results:
        if FLAGS.postinterval:
            # throttle between posts
            time.sleep(FLAGS.postinterval)
        total += 1
        logger.debug("processing post %s/%s/%s/%s for user %s", succ, skip, total, results.rowcount, user)
        post_id = item[0]
        filename = item[2]
        post_ts = item[3]
        qqid = item[4]
        cookie_file = item[5]
        sid = item[6]
        #grplevel = item[7]
        content = preprocess_content(item[1], sid, post_ts, table_prefix, post_id)
        # reselect cookie: the account's cookie file may have been refreshed
        cookie_result = list(db.execute("select cookies from wb_qq_account where qqid=%s" % qqid))
        if cookie_result and cookie_result[0][0] != cookie_file:
            cookie_file = cookie_result[0][0]
        if post_ts <= now and cookie_file:
            # due now: post immediately
            logger.info("Preparing posting %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
            result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, post_id=post_id)
            if not FLAGS.dryrun:
                if result:
                    succ += 1
                    db.execute(update_succ_sql % post_id)
                    Statsd.increment('guang.qzonepost.succ')
                elif FLAGS.commitfail:
                    failed += 1
                    db.execute(update_fail_sql % post_id)
                    Statsd.increment('guang.qzonepost.fail')
                    log_paperboy("post timeout xks %s post_id %s" % (sid, post_id))
        else:
            if FLAGS.timer:
                # future post: hand off to QZone's scheduled-post API
                logger.info("Preparing posting timer %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
                result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, schedule_ts=int(time.mktime(post_ts.timetuple())), post_id=post_id)
                if not FLAGS.dryrun:
                    if result:
                        succ += 1
                        db.execute(update_succ_sql % post_id)
                        Statsd.increment('guang.qzonepost.succ')
                    else:
                        skip += 1
                        next = min(post_ts, next)
                        Statsd.increment('guang.qzonepost.timerfail')
            else:
                # not due and no timer support: leave for a later batch
                skip += 1
                next = min(post_ts, next)
                logger.debug("Skiping post %s %s, scheduled @ %s", content.encode('utf8'), filename.encode('utf8'), post_ts)
    if total > 0:
        logger.info("Batch result total %s skip %s succ %s failed %s next schedule %s", total, skip, succ, failed, next)
def crawl_one_shop(shop, failed):
    """Crawl one shop's item list page; variant that double-checks offline
    shops against the taobaoke API (by nick) instead of the user-info API.

    :param shop: dict with 'is_commit' flag and a 'shop' row tuple
                 (indices used: 0=id, 1=url, 4=type, 5=nick)
    :param failed: list collecting {'shopid', 'err'} dicts on failure

    NOTE(review): the handlers reference `shop_nick`/`shop_id`/`is_commit`,
    which are unbound if the exception fires before the assignments above.
    """
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5].encode('utf-8')
        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))
        if is_commit:
            batch_size = 100
            total_items = tb.total_items
            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()
            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        #double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            # DB status update intentionally disabled (kept for reference)
            """ db = get_rawdb_conn() do_query(db, "update shop set status=2 where id=%s" % shop_id) db.commit() db.close() """
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except:
        # catch-all: record failure for the caller and continue the batch
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(), extra={'tags': [ 'crawlShopException', ]})
        failed.append({'shopid': shop_id, 'err': traceback.format_exc()})
def convert_taobaoke_widget(items, fn_join_iids=lambda x: ','.join(x), batch_size=40, calllimit=60, expire=600, outer_code='jcn', appkey=TAOBAOKE_APPKEY, appsec=TAOBAOKE_APPSECRET):
    """Yield (batch, raw_jsonp_text) pairs for taobaoke click-url conversion.

    Splits `items` into batches of `batch_size`, rate-limited to `calllimit`
    TOP widget API calls per minute, and downloads the JSONP response for
    each batch.  A falsy response is yielded as-is so callers can count
    failures.
    """
    stamp = int(time.time() * 1000)
    # Signature per TOP widget spec: HMAC over 'app_key<key>timestamp<ts>'
    # wrapped in the secret, hex digest uppercased.
    sign_base = appsec + 'app_key' + str(appkey) + "timestamp" + str(stamp) + appsec
    signature = hmac.HMAC(appsec, sign_base).hexdigest().upper()
    http_headers = {
        'User-Agent': DEFAULT_UA,
        'Referer': "http://www.j.cn/",
    }
    # calllimit for minutes
    for batch in waitlimit(calllimit, 60.0, chunks(items, batch_size)):
        # Random 13-char JSONP callback token derived from md5 of a random float.
        cbs_token = md5(str(random.random())).hexdigest()[:13]
        query = {
            'app_key': appkey,
            '_t_sys': 'args=4',
            'method': 'taobao.taobaoke.widget.items.convert',
            'sign': signature,
            'timestamp': stamp,
            'fields': "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
            'callback': 'TOP.io.jsonpCbs.t%s' % cbs_token,
            'partner_id': 'top-sdk-js-20120801',
            'num_iids': fn_join_iids(batch),
        }
        if outer_code:
            query['outer_code'] = outer_code
        request_url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(query)
        payload = download(request_url, headers=http_headers)
        counter = 'guang.taobaoapi.widget_succ' if payload else 'guang.taobaoapi.widget_err'
        Statsd.increment(counter)
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (batch, payload)
def update_taobao_volume(db, shop_id, shop_type, total_items):
    """Refresh item sales volumes for one shop from freshly crawled list data.

    Reads current (num_id -> volume) for the shop, then for Taobao shops
    (shop_type == 1) writes back any volume that changed by more than 10%
    or by more than 20 units.  Tmall (shop_type == 2) is a no-op for now.

    :param db: raw MySQLdb-style connection (store_result/fetch_row API)
    :param shop_id: numeric shop id (interpolated into SQL; trusted, from DB)
    :param shop_type: 1 = taobao, 2 = tmall (per the branches below)
    :param total_items: crawled items, dicts with 'iid' and 'sales_amount'
    """
    # fetch current volumes & price
    logger.info("fetching current volumes %s", shop_id)
    do_query(
        db,
        "select num_id, volume, price from item where shop_id=%s" % shop_id)
    results = db.store_result()
    iid_volumes = {}
    for row in results.fetch_row(maxrows=0):
        iid_volumes[row[0]] = row[1]
    # update taobao volume, not tmall
    if FLAGS.updatevolume and shop_type == 1:
        # batch all updates into one explicit transaction
        db.autocommit(False)
        db.query("set autocommit=0;")
        # update volume
        logger.info("updating item volume %s", shop_id)
        for item in total_items:
            # Fix: use `in` instead of deprecated dict.has_key(), and skip
            # unknown items *before* computing the diff (was computed first).
            if item['iid'] not in iid_volumes:
                continue
            new_value = item['sales_amount']
            old_value = iid_volumes.get(item['iid'], 0) or 0
            diff_v = abs(new_value - old_value)
            # 10% or 20 changed, update
            if new_value > 0 and new_value != old_value and (
                    old_value == 0 or diff_v > 20 or diff_v * 1.0 / old_value > 0.1):
                logger.debug("updating item %s %s %s -> %s", shop_id,
                             item['iid'], old_value, new_value)
                do_query(
                    db,
                    "update item set volume=%s where num_id=%s and shop_id=%s"
                    % (new_value, item['iid'], shop_id))
                Statsd.increment('guang.crawl.volume_update_onlist')
        db.commit()
    # TODO: update tmall total volumes
    if FLAGS.updatevolume and shop_type == 2:
        pass
def try_execute(top, request, expire=600.0):
    """Execute a TOP API request with retry/backoff.

    Retries connection errors up to 3 times; on API ban errors (codes 4-8)
    waits out the ban interval parsed from the error message and retries
    until `expire` seconds have elapsed overall.

    :returns: the API result, {'error': 560} when the user does not exist,
              or None on give-up (HTTP retries exhausted, timeout, or any
              other API error code).
    """
    current = time.time()
    interval = 60.0 # wait for 1 min
    ban_retry_count = 0
    http_retry_count = 0
    http_max_retry = 3
    while True:
        try:
            if FLAGS.debug_topapi:
                import pdb
                pdb.set_trace()
            result = top.execute(request)
            Statsd.increment('guang.taobaoapi.%s.succ' % request.method_name.replace('.', '_'))
            logger.debug("calling %s(%s) --> %s", request.method_name, request.api_params, result)
            return result
        except requests.exceptions.ConnectionError, e1:
            # transient network failure: bounded retry with fixed wait
            logger.warn("Call api http failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.conn_err' % request.method_name.replace('.', '_'))
            http_retry_count += 1
            if http_retry_count > http_max_retry:
                return None
            else:
                time.sleep(interval)
        except TOPException, e:
            logger.warn("Call api top failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.api_err' % request.method_name.replace('.', '_'))
            if e.code in [4, 5, 6, 7, 8]:
                # rate-limit/ban codes. This ban will last for 71 more seconds
                # Parse the remaining seconds out of the error message and
                # wait that long plus a margin, escalating per retry.
                m = R_ERROR_MSG.match(e.message) # e.args[0]
                if m:
                    try:
                        interval = int(m.group(1)) + 10.0
                    except:
                        interval = 60.0
                if ban_retry_count > 0:
                    interval += 60.0 * ban_retry_count
                ban_retry_count += 1
                logger.info("Waiting and try after %s", interval)
                time.sleep(interval)
                if time.time() - current > expire:
                    logger.error("call %s timeout %s" % (request.method_name, time.time() - current))
                    return None
            elif e.code == 560:
                # user info not found (code=560)
                return {'error': 560}
            else:
                return None
def try_execute(top, request, expire=600.0):
    """Execute a TOP API request with retry/backoff (formatting variant).

    Retries connection errors up to 3 times; on API ban errors (codes 4-8)
    waits out the ban interval parsed from the error message and retries
    until `expire` seconds have elapsed overall.

    :returns: the API result, {'error': 560} when the user does not exist,
              or None on give-up (HTTP retries exhausted, timeout, or any
              other API error code).
    """
    current = time.time()
    interval = 60.0 # wait for 1 min
    ban_retry_count = 0
    http_retry_count = 0
    http_max_retry = 3
    while True:
        try:
            if FLAGS.debug_topapi:
                import pdb; pdb.set_trace()
            result = top.execute(request)
            Statsd.increment('guang.taobaoapi.%s.succ' % request.method_name.replace('.', '_'))
            logger.debug("calling %s(%s) --> %s", request.method_name, request.api_params, result)
            return result
        except requests.exceptions.ConnectionError, e1:
            # transient network failure: bounded retry with fixed wait
            logger.warn("Call api http failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.conn_err' % request.method_name.replace('.', '_'))
            http_retry_count += 1
            if http_retry_count > http_max_retry:
                return None
            else:
                time.sleep(interval)
        except TOPException, e:
            logger.warn("Call api top failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.api_err' % request.method_name.replace('.', '_'))
            if e.code in [4, 5, 6, 7, 8]:
                # rate-limit/ban codes. This ban will last for 71 more seconds
                # Parse the remaining seconds out of the error message and
                # wait that long plus a margin, escalating per retry.
                m = R_ERROR_MSG.match(e.message) # e.args[0]
                if m:
                    try:
                        interval = int(m.group(1)) + 10.0
                    except:
                        interval = 60.0
                if ban_retry_count > 0:
                    interval += 60.0*ban_retry_count
                ban_retry_count += 1
                logger.info("Waiting and try after %s", interval)
                time.sleep(interval)
                if time.time() - current > expire:
                    logger.error("call %s timeout %s" % (request.method_name, time.time()-current))
                    return None
            elif e.code == 560:
                # user info not found (code=560)
                return {'error':560}
            else:
                return None
def crawl_item2(kwargs):
    """Crawl one item's detail page; update price/volume, html and comments.

    :param kwargs: dict with 'item' (row: 0=item_id, 2=num_id), 'is_commit',
                   'i', 'total', 'max_comments', plus FLAGS-driven options.
    :returns: ((item_id, (data_len, promote_len, desc_len, thumb_count,
               buybutton_count, price, comment_count)),) — zeros on failure.

    NOTE(review): `is_price_update`/`is_volume_update` are assigned only
    inside `if is_commit:`; their Statsd checks appear to live in the same
    branch — confirm, otherwise a non-commit run would hit a NameError.
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    # default result tuple: all-zero metrics until a successful crawl
    crawl_result = ((item_id, (0,0,0,0,0,0.0,0)),)
    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])
    db = None
    if is_commit:
        db = get_db_engine()
    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'], kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            # status=2 marks the item offline
            db.execute("update item set status=2, modified=now() where id=%s" % item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()
            if is_commit:
                # check old price and volume
                pv = list(db.execute("select price, volume from item where id=%s", item_id))
                price = pv[0][0]
                volume = pv[0][1]
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False
                if is_price_update:
                    # track price-change history, then write price (+volume)
                    db.execute("insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()" % item_id)
                    if is_volume_update:
                        db.execute("update item set price=%s, volume=%s where id=%s", tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s", tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s", tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")
            if FLAGS.update_main:
                tb.crawl_desc()
                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    # replace the stored raw html/desc (gb18030 -> utf8)
                    db.execute("delete from crawl_html where item_id=%s" % item_id)
                    db.execute("insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)", item_id, tb.descUrl, tb.promoteUrl, tb.data.decode('gb18030').encode('utf8'), tb.descContent.decode('gb18030').encode('utf8'), tb.promoteContent.decode('gb18030').encode('utf8'), 1, "")
                    db.execute("update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")
            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l, len(tb.comments))
                #rediscli.lrange(key, 0, l)
                # wholesale replace: delete list then repush serialized protos
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total", len(tb.comments))
            is_success = True
            crawl_result = ((item_id, (len(tb.data),len(tb.promoteContent),len(tb.descContent),len(tb.thumbImages),len(tb.buyButton),tb.price,len(tb.comments))),)
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s", item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data),0,0,0,0,0.0,0)),)
    except:
        # catch-all: log and fall through with whatever crawl_result we have
        logger.error("crawling %s unknown exception %s", item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
def update_vip_shop(shop_id, db=None):
    """Convert VIP-shop item urls to taobaoke click urls, stored in item_re.

    Selects items for `shop_id` (or all FLAGS.vipshopids when falsy),
    converts them via the TOP widget API using the dedicated VIP appkey,
    and upserts the resulting click urls into item_re.  A final pass fills
    item_re with a plain spm-tagged url for anything not yet converted.
    """
    if not db:
        db = get_db_engine()
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(map(str, FLAGS.vipshopids))
    if FLAGS.interval > 0:
        # restrict to recently created/modified rows
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return
    pos = 0
    converted = 0
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_tbk_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None, appkey='21315963', appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug("Converted failed %s null %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted, pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map num_id back to our item id for the rows in this batch
            numid2id = dict([(int(num_id), id) for id, num_id, shop_type, jn_url, re_url in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # convert the spm parameter to the xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, isql)
                    if not FLAGS.dryrun:
                        # escape % so the driver doesn't treat it as a param
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
    # retry sql: fall back to a plain spm-tagged item url for unconverted rows
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (id, 'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' % num_id)
            db.execute(sql)
def update_shop(shop_id, db):
    """Convert a shop's item detail urls to taobaoke click urls in place.

    Selects unconverted items (unless FLAGS.force), converts them via the
    TOP widget API, writes the click url back to item.detail_url and clears
    the row's tbk_item_convert retry record.  Failed batches get their
    failed_count bumped in tbk_item_convert.

    NOTE(review): `tbk_pid` is computed but never used below — presumably
    leftover from an earlier per-shop pid scheme; confirm before removing.
    """
    if not db:
        db = get_db_engine()
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""
    if FLAGS.interval > 0:
        # restrict to recently created/modified rows
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    if not FLAGS.force:
        # skip items whose detail_url already points at click.taobao.com
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return
    pos = 0
    converted = 0
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_retry_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None):
        if not outputstr:
            logger.debug("Converted failed null %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            # whole batch unconverted: bump per-item failure counters
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos, total)
            for row in input:
                if not FLAGS.dryrun:
                    db.execute("insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()" % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map num_id back to our item id for the rows in this batch
            numid2id = dict([(int(num_id), id) for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # convert the spm parameter to the xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, sql)
                    if not FLAGS.dryrun:
                        # escape % so the driver doesn't treat it as a param
                        db.execute(sql.replace('%', '%%'))
                        db.execute("delete from tbk_item_convert where item_id=%s" % id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
def crawl_item2(kwargs):
    """Extract image urls from an item's stored html and crawl/mirror them.

    Reads the raw page html and description html from crawl_html, pulls
    thumbnail urls (src / style / data-src fallbacks) and description image
    urls (tables, plain imgs, lazyload attrs), then hands the positioned
    url list to ItemCrawler for download.

    :param kwargs: dict with 'item' (0=item_id, 1=num_id), 'is_commit',
                   'crawl_path', 'server_path', 'org_server_path',
                   'is_remove', 'statshost', 'statsport', plus DB params
                   consumed by get_db_engine(**kwargs).
    :returns: ((item_id, summary_dict),) — zero counts on failure;
              NOTE(review): the success path falls off the end returning
              None implicitly; only the no-thumbnails branch returns
              crawl_result explicitly — confirm callers tolerate this.

    NOTE(review): try/finally structure reconstructed from flattened source;
    if get_db_engine().connect() itself raises, `conn` is unbound when the
    finally clause runs.
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # thumbnail urls: try img/@src, then li/@style, then img/@data-src
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result
                # strip the JS wrapper around the description html
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                # strip _WxH.jpg thumbnail suffixes to recover original urls
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})
                # build (url, position, kind) list: 1=thumb, 2=desc, 3=lazy desc
                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    else:
                        if tr_new.match(url):
                            ori_url = tr_new.sub(r'\1', url)
                        else:
                            logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                # ((710,10000),) is the accepted (min_width, max_width) range
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
    finally:
        conn.close()
    Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'], host = kwargs['statshost'], port = kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
        Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
def update_shop(shop_id, db):
    """Convert a shop's item detail urls to taobaoke click urls in place
    (reformatted variant of the same routine above).

    Selects unconverted items (unless FLAGS.force), converts them via the
    TOP widget API, writes the click url back to item.detail_url and clears
    the row's tbk_item_convert retry record.  Failed batches get their
    failed_count bumped in tbk_item_convert.

    NOTE(review): `tbk_pid` is computed but never used below.
    """
    if not db:
        db = get_db_engine()
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""
    if FLAGS.interval > 0:
        # restrict to recently created/modified rows
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date,
                                                                   from_date)
    if not FLAGS.force:
        # skip items whose detail_url already points at click.taobao.com
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return
    pos = 0
    converted = 0
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(
            filter_retry_items(results)),
                                                    fn_join_iids=join_iids,
                                                    calllimit=60,
                                                    outer_code=None):
        if not outputstr:
            logger.debug(
                "Converted failed null %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            # whole batch unconverted: bump per-item failure counters
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos,
                         total)
            for row in input:
                if not FLAGS.dryrun:
                    db.execute(
                        "insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()"
                        % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map num_id back to our item id for the rows in this batch
            numid2id = dict([(int(num_id), id)
                             for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result[
                        'click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # convert the spm parameter to the xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (
                        click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total,
                                 sql)
                    if not FLAGS.dryrun:
                        # escape % so the driver doesn't treat it as a param
                        db.execute(sql.replace('%', '%%'))
                        db.execute(
                            "delete from tbk_item_convert where item_id=%s" %
                            id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
def update_vip_shop(shop_id, db=None):
    """Convert VIP-shop item URLs to taobaoke click URLs, stored in item_re.

    Like update_shop(), but uses a dedicated appkey/appsec pair, writes the
    converted URL into the item_re side table (instead of item.detail_url),
    and finishes with a retry pass that seeds item_re with a plain
    item.taobao.com URL for any row that still has no converted URL.

    NOTE(review): structure reconstructed from a collapsed one-line paste —
    nesting of try/except blocks inferred from syntax; verify against the
    original file.
    """
    if not db:
        db = get_db_engine()
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        # No explicit shop: process the configured VIP shop id list.
        shop_str = " shop.id in (%s) and " % ','.join(
            map(str, FLAGS.vipshopids))
    # Restrict to recently created/modified items when an interval is given.
    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return
    pos = 0        # number of items fed to the converter so far
    converted = 0  # number of successful conversions so far
    # Matches an existing spm=... query parameter so it can be replaced.
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    # Dedicated VIP appkey/appsec override the module defaults.
    for input, outputstr in convert_taobaoke_widget(
            list(filter_tbk_items(results)),
            fn_join_iids=join_iids, calllimit=60, outer_code=None,
            appkey='21315963', appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug(
                "Converted failed %s null %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            # No failure tracking here (unlike update_shop): the retry pass
            # at the bottom handles rows that never got converted.
            logger.debug("No output %s %s %s/%s/%s", input, shop_id,
                         converted, pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # Map numeric taobao iid back to our own item.id for this chunk.
            numid2id = dict([(int(num_id), id)
                             for id, num_id, shop_type, jn_url, re_url in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    # Append tracking markers (re_ prefix for this pipeline).
                    click_url = result[
                        'click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # conver spm to xtao: force the VIP spm campaign code.
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    # Upsert into item_re; item.detail_url is left untouched.
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (
                        id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, isql)
                    if not FLAGS.dryrun:
                        # Escape % so the driver's paramstyle interpolation
                        # does not eat URL-encoded characters.
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    # Per-item failure: log and continue with the chunk.
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            # Chunk-level failure: log and continue with the next chunk.
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
    # retry sql: re-run the same select and give any row that still lacks an
    # item_re URL a default non-affiliate taobao URL with our spm code.
    # NOTE(review): this insert has no "on duplicate key" clause and rebinds
    # the `sql` variable inside the loop — presumably intentional since
    # re_url is NULL for these rows; confirm.
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (
                id,
                'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' % num_id)
            db.execute(sql)
def crawl_item2(kwargs):
    """Crawl all images for one item from its previously-saved HTML.

    Loads the stored page + description HTML from crawl_html, extracts thumb
    image URLs (three fallbacks for taobao markup variants) and description
    image URLs (eager + lazyload), then hands the combined list to
    ItemCrawler for download.

    kwargs keys used: item (row: [item_id, num_id, ...]), is_commit,
    crawl_path, server_path, org_server_path, is_remove, statshost,
    statsport, plus whatever get_db_engine(**kwargs) consumes.

    Returns a 1-tuple ((item_id, summary_dict),) suitable for aggregation by
    the caller.

    NOTE(review): try/finally nesting reconstructed from a collapsed
    one-line paste — the only syntactically valid reading is an outer
    try/finally around an inner try/except.  If connect() itself raises,
    the finally's conn.close() would NameError; confirm against original.
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default result: zero counts, returned on any early exit or failure.
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            # Only proceed when exactly one crawl_html row exists.
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # Primary: thumbnail <img src> under the J_UlThumb list.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    # Fallback 1: URLs embedded in <li style="..."> rules.
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                        for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao @src to @data-src
                    # Fallback 2: lazy-loaded thumbs use @data-src.
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    # No thumbnails at all: report and bail with zero counts.
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id, num_id, len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    return crawl_result
                # Strip the "var desc='...'" JS wrapper around desc HTML.
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                # Strips trailing _WxH.jpg size suffix to get the original image.
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        # Images not wrapped in a link: eager and lazyload.
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id, num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})
                # Build (url, position, kind) triples; kind 1=thumb,
                # 2=desc image, 3=lazyload desc image.  Editor-asset URLs
                # (js/ckeditor) are skipped.
                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                # (710, 10000) — assumed min-width/max-height crop spec for
                # the crawler; TODO confirm against ItemCrawler.crawl.
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
    finally:
        conn.close()
    # Report download counts regardless of success/failure.
    Statsd.update_stats("guang.crawl.downimgcount",
                        crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
                        host=kwargs['statshost'],
                        port=kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id,
                    crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ',
                         host=kwargs['statshost'],
                         port=kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id,
                    crawl_result,
                    extra={'tags': [
                        'crawl_failed',
                    ]})
        Statsd.increment('guang.crawl.itemimg.failed',
                         host=kwargs['statshost'],
                         port=kwargs['statsport'])
def crawl_item2(kwargs):
    """Crawl one taobao item page and sync price/volume/HTML/comments to DB.

    Drives a TaobaoHtml crawl for the item in kwargs['item'], then (when
    is_commit) updates item price/volume, optionally re-saves the raw HTML
    (FLAGS.update_main + FLAGS.commit_html) and replaces the item's comment
    list in redis (FLAGS.update_comments).

    kwargs keys used: item (row: [item_id, _, num_id, ...]), is_commit,
    max_comments, i, total.

    Returns a 1-tuple ((item_id, stats_tuple),) where stats_tuple is
    (html_len, promo_len, desc_len, n_thumbs, n_buy_buttons, price,
    n_comments); zeros indicate the corresponding stage did not complete.

    NOTE(review): branch nesting reconstructed from a collapsed one-line
    paste (the `elif is_volume_update` fixes most of it); verify against the
    original file.
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    # Default all-zero result, returned on failure paths.
    crawl_result = ((item_id, (0, 0, 0, 0, 0, 0.0, 0)), )
    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])
    db = None
    # Only open a DB connection when we will actually write.
    if is_commit:
        db = get_db_engine()
    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'],
                    kwargs['total'], item_id, num_id)
        tb.crawl()
        # Offline items are marked status=2 immediately.
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" % item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()
            if is_commit:
                # check old price and volume
                pv = list(
                    db.execute("select price, volume from item where id=%s",
                               item_id))
                price = pv[0][0]
                volume = pv[0][1]
                # Price counts as changed only if positive (> 0.001 guards
                # against a failed/zero price parse overwriting real data).
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False
                if is_price_update:
                    # Track when the price last changed (upsert).
                    db.execute(
                        "insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()" % item_id)
                    if is_volume_update:
                        db.execute(
                            "update item set price=%s, volume=%s where id=%s",
                            tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s",
                                   tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s",
                               tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")
            if FLAGS.update_main:
                tb.crawl_desc()
                # Only persist HTML when thumbs were found, we are committing
                # and the flag allows it; page bytes are gb18030 on taobao,
                # stored as utf8.
                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    db.execute("delete from crawl_html where item_id=%s" % item_id)
                    db.execute(
                        "insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)",
                        item_id, tb.descUrl, tb.promoteUrl,
                        tb.data.decode('gb18030').encode('utf8'),
                        tb.descContent.decode('gb18030').encode('utf8'),
                        tb.promoteContent.decode('gb18030').encode('utf8'),
                        1, "")
                    db.execute("update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")
            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l,
                            len(tb.comments))
                #rediscli.lrange(key, 0, l)
                # Full replace: delete the list, then push each serialized
                # comment (protobuf SerializeToString) back in order.
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total",
                                    len(tb.comments))
            is_success = True
            crawl_result = ((item_id,
                             (len(tb.data), len(tb.promoteContent),
                              len(tb.descContent), len(tb.thumbImages),
                              len(tb.buyButton), tb.price,
                              len(tb.comments))), )
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s",
                        item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data), 0, 0, 0, 0, 0.0, 0)), )
    except:
        # Catch-all so one bad item never kills the worker; details go to
        # the log with a tag for alerting.
        logger.error("crawling %s unknown exception %s", item_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlItemException',
                     ]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
def download_image(self, url): t = time.time() data = download(url, headers=self.headers) spent = time.time() - t Statsd.timing("guang.crawl.image", spent * 1000, host=self.statshost, port=self.statsport) return data
def post_one(db, user, select_sql, update_succ_sql, update_fail_sql, table_prefix):
    """Post one batch of queued qzone ("shuoshuo") entries for a user.

    Iterates rows from select_sql; rows whose timestamp has passed are
    posted immediately, future rows are either scheduled via qzone's timer
    (FLAGS.timer) or skipped.  update_succ_sql / update_fail_sql are
    %-templates taking the post id and mark the row's outcome.

    Expected row layout (from select_sql): [post_id, content, filename,
    post_ts, qqid, cookie_file, sid, ...] — presumably fixed by the caller's
    query; verify if the select changes.
    """
    now = datetime.datetime.now()
    # Earliest future post seen; starts at a far-future sentinel.
    next = datetime.datetime(2020, 1, 1)
    total = 0
    succ = 0
    failed = 0
    skip = 0
    logger.debug("querying %s", select_sql)
    results = db.execute(select_sql)
    logger.debug("processing post for user %s, total %s, sql %s", user,
                 results.rowcount, select_sql)
    for item in results:
        # Optional throttle between consecutive posts.
        if FLAGS.postinterval:
            time.sleep(FLAGS.postinterval)
        total += 1
        logger.debug("processing post %s/%s/%s/%s for user %s", succ, skip,
                     total, results.rowcount, user)
        post_id = item[0]
        filename = item[2]
        post_ts = item[3]
        qqid = item[4]
        cookie_file = item[5]
        sid = item[6]
        #grplevel = item[7]
        content = preprocess_content(item[1], sid, post_ts, table_prefix,
                                     post_id)
        # reselect cookie: the account's cookie file may have been refreshed
        # since the batch query ran, so prefer the current one.
        cookie_result = list(
            db.execute("select cookies from wb_qq_account where qqid=%s" % qqid))
        if cookie_result and cookie_result[0][0] != cookie_file:
            cookie_file = cookie_result[0][0]
        if post_ts <= now and cookie_file:
            # Due now (and we have a login cookie): post immediately.
            logger.info("Preparing posting %s/%s %s qq %s sid %s %s %s @ %s",
                        total, results.rowcount, post_id, qqid, sid,
                        content.encode('utf8'), filename.encode('utf8'),
                        post_ts)
            result = post_shuoshuo_string(cookie_file, filename, content,
                                          sid=sid, post_id=post_id)
            if not FLAGS.dryrun:
                if result:
                    succ += 1
                    db.execute(update_succ_sql % post_id)
                    Statsd.increment('guang.qzonepost.succ')
                elif FLAGS.commitfail:
                    # Only mark failures when commitfail is set; otherwise
                    # the row stays pending for a later retry.
                    failed += 1
                    db.execute(update_fail_sql % post_id)
                    Statsd.increment('guang.qzonepost.fail')
                    log_paperboy("post timeout xks %s post_id %s" % (sid, post_id))
        else:
            if FLAGS.timer:
                # Future post: hand it to qzone's scheduled-post feature.
                logger.info(
                    "Preparing posting timer %s/%s %s qq %s sid %s %s %s @ %s",
                    total, results.rowcount, post_id, qqid, sid,
                    content.encode('utf8'), filename.encode('utf8'), post_ts)
                result = post_shuoshuo_string(
                    cookie_file, filename, content, sid=sid,
                    schedule_ts=int(time.mktime(post_ts.timetuple())),
                    post_id=post_id)
                if not FLAGS.dryrun:
                    if result:
                        succ += 1
                        db.execute(update_succ_sql % post_id)
                        Statsd.increment('guang.qzonepost.succ')
                    else:
                        # Scheduling failed: keep it pending and remember the
                        # earliest retry time.
                        skip += 1
                        next = min(post_ts, next)
                        Statsd.increment('guang.qzonepost.timerfail')
            else:
                # Not due yet and timer mode off: leave for a later batch.
                skip += 1
                next = min(post_ts, next)
                logger.debug("Skiping post %s %s, scheduled @ %s",
                             content.encode('utf8'), filename.encode('utf8'),
                             post_ts)
    if total > 0:
        logger.info(
            "Batch result total %s skip %s succ %s failed %s next schedule %s",
            total, skip, succ, failed, next)