def crawl_one_shop(shop, failed):
    """Crawl one shop's Taobao item list and optionally commit results.

    :param shop: dict with key 'is_commit' (bool) and key 'shop', a row
                 tuple (indexes used: 0=id, 1=url, 4=type, 5=nick).
    :param failed: list; on unexpected failure a {'shopid', 'err'} dict
                   is appended so the caller can report/retry.
    """
    # Extract fields BEFORE the try block: the except handlers reference
    # shop_id/shop_nick, which previously raised NameError if the dict
    # access itself was what failed.
    is_commit = shop['is_commit']
    shop_id = shop['shop'][0]
    shop_url = shop['shop'][1]
    shop_type = shop['shop'][4]
    shop_nick = shop['shop'][5]
    try:
        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s",
                     tb.id, tb.count, len(tb.total_items))
        if is_commit:
            batch_size = 100
            total_items = tb.total_items
            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Always release the connection, even if an update raises.
                db.close()
        Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        # 560 presumably means "shop not found / offline" — TODO confirm
        # against the Taobao API error table.
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            try:
                do_query(db, "update shop set status=2 where id=%s" % shop_id)
                db.commit()
            finally:
                db.close()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; everything else is recorded and swallowed so
        # one bad shop does not abort the whole crawl batch.
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(),
                     extra={'tags': ['crawlShopException', ]})
        failed.append({'shopid': shop_id, 'err': traceback.format_exc()})
def main():
    """Backfill item.crawl_status from crawl_html results.

    Status mapping (derived from the update statements below):
      result==1 and image crawled -> crawl_status=2
      result==1 and no image      -> crawl_status=1
      result==0                   -> crawl_status=0
    Commits every 1000 rows to keep transactions bounded.
    """
    db = get_rawdb_conn()
    logger.debug("querying")
    db.query("select item_id, result, is_image_crawled, id from crawl_html where id>3000 order by id")
    results = db.store_result()
    i = 0
    db.autocommit(False)
    db.query("set autocommit=0;")
    try:
        for row in results.fetch_row(maxrows=0):
            item_id = row[0]
            result = row[1]
            is_image_crawled = row[2]
            i += 1
            # The three cases are mutually exclusive: use elif so each row
            # evaluates at most the conditions it needs (was three `if`s).
            if result == 1 and is_image_crawled == 1:
                try_query(db, "update item set crawl_status=2 where id=%s" % item_id)
            elif result == 1 and is_image_crawled == 0:
                try_query(db, "update item set crawl_status=1 where id=%s" % item_id)
            elif result == 0:
                try_query(db, "update item set crawl_status=0 where id=%s" % item_id)
            if i % 1000 == 0:
                # NOTE(review): 1194351 looks like a snapshot of the total
                # row count, hard-coded for progress display — likely stale.
                logger.debug("processing %s %s %s/%s", row[3], item_id, i, 1194351)
                db.commit()
        db.commit()
    finally:
        # Previously the connection leaked (and pending work was lost
        # silently) if any query raised before db.close().
        db.close()
def main(): db = get_rawdb_conn() logger.debug("querying") db.query("select item_id, result, is_image_crawled, id from crawl_html where id>3000 order by id") results = db.store_result() i = 0 db.autocommit(False) db.query("set autocommit=0;") for row in results.fetch_row(maxrows=0): item_id = row[0] result = row[1] is_image_crawled = row[2] i += 1 if result == 1 and is_image_crawled == 1: try_query(db, "update item set crawl_status=2 where id=%s" % item_id) if result == 1 and is_image_crawled == 0: try_query(db, "update item set crawl_status=1 where id=%s" % item_id) if result == 0: try_query(db, "update item set crawl_status=0 where id=%s" % item_id) if i % 1000 == 0: logger.debug("processing %s %s %s/%s", row[3], item_id, i, 1194351) db.commit() db.commit() db.close()
def crawl_one_shop(shop, failed):
    """Crawl one shop's Taobao item list and optionally commit results.

    :param shop: dict with key 'is_commit' (bool) and key 'shop', a row
                 tuple (indexes used: 0=id, 1=url, 4=type, 5=nick).
    :param failed: list; on unexpected failure a {'shopid', 'err'} dict
                   is appended so the caller can report/retry.
    """
    # Extract fields BEFORE the try block: the except handlers reference
    # shop_id/shop_url/shop_nick, which previously raised NameError if
    # the dict/tuple access itself was what failed.
    is_commit = shop['is_commit']
    shop_id = shop['shop'][0]
    shop_url = shop['shop'][1]
    shop_type = shop['shop'][4]
    shop_nick = shop['shop'][5].encode('utf-8')
    try:
        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s",
                     tb.id, tb.count, len(tb.total_items))
        if is_commit:
            batch_size = 100
            total_items = tb.total_items
            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Always release the connection, even if an update raises.
                db.close()
        Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            # Marking the shop offline in the DB is intentionally disabled
            # (was a no-op triple-quoted string in the original); kept for
            # reference:
            #   db = get_rawdb_conn()
            #   do_query(db, "update shop set status=2 where id=%s" % shop_id)
            #   db.commit()
            #   db.close()
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; everything else is recorded and swallowed so
        # one bad shop does not abort the whole crawl batch.
        Statsd.increment('guang.crawl.shop_list_failed')
        failed.append({'shopid': shop_id, 'err': traceback.format_exc()})
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(),
                     extra={'tags': [
                         'crawlShopException',
                     ]})