def crawl_one_shop(shop, failed):
    """Crawl one shop's item list and, when committing, persist the result.

    Args:
        shop: mapping with an ``'is_commit'`` flag and a ``'shop'`` sequence.
            Indexes 0, 1, 4 and 5 of that sequence are read as the shop id,
            url, type and nick.
        failed: list that receives a ``{'shopid': ..., 'err': ...}`` entry
            for every unexpected failure.

    A ``ShopOfflineException`` triggers a second look-up through
    ``get_taobao_shops``; errors raised by that look-up are not caught here
    and propagate to the caller.
    """
    # Bound up front so the catch-all handler can always report it, even
    # when the failure happens while unpacking ``shop``.
    shop_id = None
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5]

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items
            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when an update fails.
                db.close()
        Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check shop status by taobao api.
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            try:
                do_query(db, "update shop set status=2 where id=%s" % shop_id)
                db.commit()
            finally:
                db.close()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, err, extra={'tags':['crawlShopException',]})
        failed.append({'shopid':shop_id, 'err':err})
def check(shop, failed):
    """Look the shop up via ``get_taobao_shops`` and record it if nothing comes back.

    Args:
        shop: sequence whose indexes 0, 1 and 5 are read as the shop id,
            url and nick (the nick is encoded to UTF-8 before the look-up).
        failed: list that receives the shop id when the look-up result
            is falsy.
    """
    sid, url = shop[0], shop[1]
    nick_bytes = shop[5].encode("utf-8")
    result = get_taobao_shops(get_rand_top(), [nick_bytes])
    if result:
        return
    failed.append(sid)
    logger.error("shop %s url %s : not is taobaoke", sid, url)
def check(shop, failed):
    """Record the shop as failed when ``get_taobao_shops`` returns a falsy result.

    Args:
        shop: sequence; index 0 is used as the shop id, index 1 as the url
            and index 5 as the nick, which is UTF-8 encoded for the query.
        failed: list to which the shop id is appended on a falsy result.
    """
    shop_key = shop[0]
    link = shop[1]
    encoded_nick = shop[5].encode('utf-8')
    found = get_taobao_shops(get_rand_top(), [encoded_nick])
    if found:
        return
    failed.append(shop_key)
    logger.error("shop %s url %s : not is taobaoke", shop_key, link)
def process_shop(db, shop, failed):
    """Refresh one shop row from the taobao API and a one-page crawl.

    Args:
        db: object exposing ``execute``; used to run the update statements.
        shop: 12-item sequence unpacked as (id, url, level, nick, sid, cid,
            taobao_created, taobao_modified, taobao_title, item_score,
            service_score, delivery_score).
        failed: list that receives the formatted traceback of any
            unexpected error.

    Returns:
        None.
    """
    (shop_id, url, level, nick, sid, cid, taobao_created, taobao_modified,
     taobao_title, item_score, service_score, delivery_score) = shop
    # Explicit snapshot of the stored values, replacing the original
    # ``locals()[key]`` look-up.
    current = {
        'level': level,
        'sid': sid,
        'cid': cid,
        'taobao_created': taobao_created,
        'taobao_modified': taobao_modified,
        'taobao_title': taobao_title,
        'item_score': item_score,
        'service_score': service_score,
        'delivery_score': delivery_score,
    }
    try:
        shopinfo = get_taobao_shops(get_rand_top(), nick)
        if shopinfo.get("error", 0) == 560:
            logger.warning("Shop nick maybe error! %s", shop_id)

        new_shop = {}
        if 'shop' in shopinfo:
            info = shopinfo['shop']
            new_shop['sid'] = info['sid']
            new_shop['cid'] = info['cid']
            # Scores are parsed as floats and stored as integers scaled by 10.
            new_shop['delivery_score'] = int(
                float(info['shop_score']['delivery_score']) * 10)
            new_shop['item_score'] = int(
                float(info['shop_score']['item_score']) * 10)
            new_shop['service_score'] = int(
                float(info['shop_score']['service_score']) * 10)
            new_shop['taobao_created'] = info['created']
            new_shop['taobao_modified'] = info['modified']
            new_shop['taobao_title'] = info['title']

        tb = TaobaoListHtml(shop_id, url)
        tb.crawl(maxpage=1)
        if url.startswith('http://shop'):
            # NOTE(review): the placeholder is wrapped in double quotes; if
            # ``db.execute`` binds parameters DB-API style the driver adds its
            # own quoting -- confirm the stored value is not double-quoted.
            db.execute('update shop set nick_url="%s" where id=%s',
                       tb.nick_url[0], shop_id)
            logger.debug("nick url is %s", tb.nick_url[0])
        new_shop['level'] = tb.get_level()

        # Only columns whose value actually changed are written back.
        update_fields = [
            (key, new_val, current[key])
            for key, new_val in new_shop.items()
            if new_val != current[key]
        ]
        if update_fields:
            update_sql = "update shop set %s where id=%s" % (",".join(
                [get_set_sql(f) for f in update_fields]), shop_id)
            logger.debug(update_sql)
            db.execute(update_sql)
    except Exception:
        # ``Exception`` lets KeyboardInterrupt (re-raised explicitly before)
        # and SystemExit propagate to the caller.
        err = traceback.format_exc()
        logger.warning("update shop(id=%s) level hash unknown exception %s",
                       shop_id, err)
        failed.append(err)
    return None
def crawl_one_shop(shop, failed):
    """Crawl one shop's item list and, when committing, persist the result.

    Args:
        shop: mapping with an ``'is_commit'`` flag and a ``'shop'`` sequence.
            Indexes 0, 1, 4 and 5 of that sequence are read as the shop id,
            url, type and nick (the nick is encoded to UTF-8).
        failed: list that receives a ``{'shopid': ..., 'err': ...}`` entry
            for every unexpected failure.

    A ``ShopOfflineException`` triggers a second look-up through
    ``get_taobao_shops`` and is only logged; errors raised by that look-up
    are not caught here and propagate to the caller.
    """
    # Bound up front so the catch-all handler can always report it, even
    # when the failure happens while unpacking ``shop``.
    shop_id = None
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5].encode('utf-8')

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count,
                     len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items
            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when an update fails.
                db.close()
        Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check shop status by taobao api.
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            # NOTE: the "update shop set status=2" write that used to sit
            # here was disabled (it was wrapped in a no-op string literal);
            # the shop is only logged.
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, err,
                     extra={'tags': ['crawlShopException', ]})
        failed.append({'shopid': shop_id, 'err': err})
def process_shop(db, shop, failed):
    """Refresh one shop row from the taobao API and a one-page crawl.

    When ``FLAGS.debug_parser`` is set the function drops into ``pdb``
    before doing any work.

    Args:
        db: object exposing ``execute``; used to run the update statements.
        shop: 12-item sequence unpacked as (id, url, level, nick, sid, cid,
            taobao_created, taobao_modified, taobao_title, item_score,
            service_score, delivery_score).
        failed: list that receives the formatted traceback of any
            unexpected error.

    Returns:
        None.
    """
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    (shop_id, url, level, nick, sid, cid, taobao_created, taobao_modified,
     taobao_title, item_score, service_score, delivery_score) = shop
    # Explicit snapshot of the stored values, replacing the original
    # ``locals()[key]`` look-up.
    stored = {
        'level': level,
        'sid': sid,
        'cid': cid,
        'taobao_created': taobao_created,
        'taobao_modified': taobao_modified,
        'taobao_title': taobao_title,
        'item_score': item_score,
        'service_score': service_score,
        'delivery_score': delivery_score,
    }
    try:
        shopinfo = get_taobao_shops(get_rand_top(), nick)
        if shopinfo.get("error", 0) == 560:
            logger.warning("Shop nick maybe error! %s", shop_id)

        new_shop = {}
        if 'shop' in shopinfo:
            remote = shopinfo['shop']
            new_shop['sid'] = remote['sid']
            new_shop['cid'] = remote['cid']
            # Scores are parsed as floats and stored as integers scaled by 10.
            new_shop['delivery_score'] = int(float(remote['shop_score']['delivery_score']) * 10)
            new_shop['item_score'] = int(float(remote['shop_score']['item_score']) * 10)
            new_shop['service_score'] = int(float(remote['shop_score']['service_score']) * 10)
            new_shop['taobao_created'] = remote['created']
            new_shop['taobao_modified'] = remote['modified']
            new_shop['taobao_title'] = remote['title']

        tb = TaobaoListHtml(shop_id, url)
        tb.crawl(maxpage=1)
        if url.startswith('http://shop'):
            # NOTE(review): the placeholder is wrapped in double quotes; if
            # ``db.execute`` binds parameters DB-API style the driver adds its
            # own quoting -- confirm the stored value is not double-quoted.
            db.execute('update shop set nick_url="%s" where id=%s', tb.nick_url[0], shop_id)
            logger.debug("nick url is %s", tb.nick_url[0])
        new_shop['level'] = tb.get_level()

        # Only columns whose value actually changed are written back.
        update_fields = []
        for key, new_val in new_shop.items():
            old_val = stored[key]
            if new_val != old_val:
                update_fields.append((key, new_val, old_val))

        if update_fields:
            update_sql = "update shop set %s where id=%s" % (",".join([get_set_sql(f) for f in update_fields]), shop_id)
            logger.debug(update_sql)
            db.execute(update_sql)
    except Exception:
        # ``Exception`` lets KeyboardInterrupt (re-raised explicitly before)
        # and SystemExit propagate to the caller.
        err = traceback.format_exc()
        logger.warning("update shop(id=%s) level hash unknown exception %s", shop_id, err)
        failed.append(err)
    return None
def check_one_shop(shop, failed):
    """Crawl the first page of a shop and compare the result with the database.

    Args:
        shop: sequence whose indexes 0, 1 and 5 are read as the shop id,
            url and nick.
        failed: accepted for signature compatibility; not used in this body.

    When the crawl raises ``ShopOfflineException`` the page length is taken
    as 0. If the earlier ``get_taobao_shops`` result carries error code 560
    the shop row is set to ``status=2``; otherwise the result is only logged.
    """
    shopid, shop_url, shop_nick = shop[0], shop[1], shop[5]
    api_result = get_taobao_shops(get_rand_top(), shop_nick)
    engine = get_db_engine()

    page_len = 0
    try:
        crawler = TaobaoListHtml(shopid, shop_url)
        crawler.crawl(maxpage=1)
        page_len = crawler.count
    except ShopOfflineException:
        confirmed_offline = api_result.get('error', 0) == 560
        if not confirmed_offline:
            logger.error("Shop %s url is error! %s --> %s", shopid, shop_url, api_result)
        else:
            logger.error("Shop %s url is offline! %s", shopid, shop_url)
            engine.execute("update shop set status=2 where id=%s", shopid)

    compare_item_indb(engine, page_len, shop_url, shopid)