import re
import traceback

# logger, TaobaoItem, filterNumIds, doCrawl, imgExists, quickUpdatePrice and
# the SHOP_CRAWL_STATUS_* / ITEM_STATUS_* constants are assumed to be
# provided by the surrounding project.


def crawl_one_shop(shop, tb_category, term_factory, db):
    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]

    # Used for DSP ad delivery.
    defaultCampaign = list(
        db.execute(
            "select id, default_uctrac_price from campaign "
            "where shop_id=%s and system_status = 1 and delete_flag = 0",
            shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # 1. set shop crawl_status=2
    # 2. crawl
    # 3. set shop crawl_status=0
    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num_ids of the shop, obtained from crawling the new-item list.
    # Note: there may be more than one record.
    allTbNumIds = list(
        db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s",
                   shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(','))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id,
                shop_url, len(tb_numids_set))

    # Split the crawled ids into new / back-online / offline / dead sets.
    new_numids, back_online_numids, offline_numids, db_dead_numids = \
        filterNumIds(db, shop_id, tb_numids_set)
    logger.info(
        "stat taobao shop %s: new_num:%s, back_online_num:%s, offline_num:%s",
        shop_id, len(new_numids), len(back_online_numids),
        len(offline_numids))

    pic_down_failed_num = 0
    success_num = 0
    failed_num = 0
    offline_num = 0
    dead_num = 0
    for num_id in new_numids:
        if num_id in db_dead_numids:
            dead_num += 1
            continue
        try:
            item = TaobaoItem(shop_id, 0, num_id)
            if shop_type == 2:
                item.is_tmall = True
            item.crawl_title()
            if not item.data:
                failed_num += 1
                logger.warning("crawler %s network connection failure",
                               num_id)
                continue
            if item.is_offline:
                db.execute(
                    "update item set status=2, modified=now() "
                    "where shop_id=%s and num_id=%s", shop_id, num_id)
                logger.warning("crawler %s off line", num_id)
                offline_num += 1
                continue
            item.status = ITEM_STATUS_ACTIVE
            item.setCampaign(defaultCampaign)
            if item.cid:
                item.category = tb_category.getCategoryPath(item.cid)
            item.termIds = item.matchTaobaoTerms(term_factory)
            item.setPicUrl()
            # Picture download failed; retry on the next poll.
            if not item.is_pic_download:
                pic_down_failed_num += 1
                continue
            if item.volume < 0:
                item.volume = 0
            item.db_create(db)
            success_num += 1
        except Exception:
            failed_num += 1
            logger.error("crawling %s unknown exception %s", num_id,
                         traceback.format_exc(),
                         extra={'tags': ['crawlItemException']})

    logger.info(
        "shop %s crawler: success %s, failed %s, offline %s, "
        "pic download failed %s, dead %s", shop_id, success_num, failed_num,
        offline_num, pic_down_failed_num, dead_num)

    # The IN list has to be interpolated into the SQL text; binding the
    # comma-joined string as one parameter would quote it as a single value.
    if back_online_numids:
        db.execute(
            "update item set status=1 where shop_id=%s and num_id in (%s)" %
            (shop_id,
             ', '.join("'" + str(s) + "'" for s in back_online_numids)))
        logger.info("shop %s crawler: back online %s", shop_id,
                    len(back_online_numids))
    if offline_numids:
        db.execute(
            "update item set status=2 where shop_id=%s and num_id in (%s)" %
            (shop_id, ', '.join("'" + str(s) + "'" for s in offline_numids)))
        logger.info("shop %s crawler: offline %s", shop_id,
                    len(offline_numids))

    # Too many failures: queue the shop to be crawled again.
    if failed_num > 5:
        db.execute("update shop set crawl_status=%s where id=%s",
                   SHOP_CRAWL_STATUS_WAIT, shop_id)
    else:
        db.execute("update shop set crawl_status=%s where id=%s",
                   SHOP_CRAWL_STATUS_NONE, shop_id)

    # The records below are for statistics; type: 0 = new, 1 = off shelf,
    # 2 = back on shelf.
    for num_id in new_numids:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 0)
    for num_id in back_online_numids:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 2)
    for num_id in offline_numids:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 1)
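# Illustrative sketch (not part of the original project): the contract that
# the version of crawl_one_shop above assumes for filterNumIds. It splits the
# num_ids seen on Taobao against the item table and returns
# (new, back_online, offline, dead) sets. The query, column names, and the
# status codes used here (2 = offline, 3 = dead) are assumptions.
def filter_num_ids_sketch(db, shop_id, tb_numids_set):
    rows = list(
        db.execute("select num_id, status from item where shop_id=%s",
                   shop_id))
    status_by_id = dict((str(r[0]), int(r[1])) for r in rows)
    dead = set(n for n, s in status_by_id.items() if s == 3)  # assumed code
    known = set(status_by_id) - dead
    # Ids never stored (or stored as dead) count as "new"; the caller skips
    # the dead ones via the returned dead set.
    new = tb_numids_set - known
    back_online = set(
        n for n in tb_numids_set & known if status_by_id[n] == 2)
    offline = known - tb_numids_set
    return new, back_online, offline, dead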
def crawl_one_shop(shop, tb_category, term_factory, db):
    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    shop_termLimits = shop[3]
    # Whitelist mode (shop.mode) is not in use yet.

    defaultCampaign = list(
        db.execute(
            "select id, default_uctrac_price from campaign "
            "where shop_id=%s and system_status = 1 and delete_flag = 0",
            shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # 1. set shop crawl_status=2
    # 2. crawl
    # 3. set shop crawl_status=0
    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num_ids of the shop, obtained from crawling the new-item list.
    # Note: there may be more than one record.
    allTbNumIds = list(
        db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s",
                   shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(','))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id,
                shop_url, len(tb_numids_set))

    # Split the crawled ids into new / off-shelf / unchanged sets.
    new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(
        db, shop_id, tb_numids_set)
    logger.info(
        "stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
        shop_id, len(new_numids_set), len(offShelf_numids_set),
        len(common_numids_set))

    new_items = []
    off2on_items = []
    black_num = 0
    pic_down_failed_num = 0
    if new_numids_set:
        new_item_list = doCrawl(shop_id, new_numids_set)
        for dict_item in new_item_list or []:
            num_id = str(dict_item['num_iid'])
            # n_cid = dict_item['cid']
            tb_title = dict_item['title'].encode('utf-8')
            tb_detail_url = dict_item['item_url'].encode('utf-8')
            tb_price = float(dict_item['price'])
            if dict_item['pic_url']:
                tb_pic_url = str(dict_item['pic_url'])
            else:
                logger.warning("taobao item %s has no pic_url", tb_detail_url)
                continue
            volume = dict_item.get('volume', 0)
            try:
                # Check whether the item was off the shelf and is now back.
                db_item = list(
                    db.execute(
                        "select id, title, pic_url, local_pic_url, price, "
                        "manual_set, status, category, volume from item "
                        "where shop_id=%s and num_id='%s' and status!=1" %
                        (shop_id, num_id)))
                if db_item:
                    # Update an existing (offline) item.
                    db_volume = db_item[0][8]
                    # db_category = db_item[0][7]
                    db_status = int(db_item[0][6])
                    db_manual_set = int(db_item[0][5])
                    db_price = float(db_item[0][4])
                    db_local_pic_url = db_item[0][3]
                    db_pic_url = db_item[0][2]
                    db_title = db_item[0][1].encode('utf-8')
                    db_item_id = int(db_item[0][0])
                    if db_status == ITEM_STATUS_BLACKLIST:
                        black_num += 1
                        continue
                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    # Mark it online first, then check the other attributes
                    # for changes.
                    item.status = ITEM_STATUS_ACTIVE
                    if db_volume < volume:
                        item.volume = volume
                    # if not db_category:
                    #     item.category = tb_category.getCategoryPath(n_cid)
                    if db_manual_set == 1:
                        # Picture and title were set manually: only sync the
                        # price and re-download a missing picture.
                        if tb_price != db_price and quickUpdatePrice(
                                db_item_id, db):
                            item.price = tb_price
                        if not imgExists(shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue
                    else:
                        if tb_title != db_title:
                            item.title = tb_title
                        if tb_price != db_price and quickUpdatePrice(
                                db_item_id, db):
                            item.price = tb_price
                        # Re-download when the picture URL changed or the
                        # original picture no longer exists.
                        if tb_pic_url != db_pic_url or not imgExists(
                                shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue
                    # TODO: the db item was offline and its terms may need
                    # re-matching. Going offline did not delete the item_term
                    # rows, but other code paths might; revisit when needed.
                    item.db_update(db)
                    off2on_items.append(num_id)
                else:
                    # Create a brand-new item.
                    item = TaobaoItem(shop_id, 0, num_id)
                    item.title = tb_title
                    # str.replace does not take a regex; use re.sub to
                    # normalize the spm tracking parameter.
                    item.detail_url = re.sub(r"spm=(\.|\d)*",
                                             "spm=2014.12669715.0.0",
                                             tb_detail_url)
                    item.price = tb_price
                    item.volume = volume
                    # item.category = tb_category.getCategoryPath(n_cid)
                    item.termIds = item.matchTaobaoTerms(
                        term_factory, str(shop_termLimits))
                    item.setPicUrl(tb_pic_url)
                    item.setCampaign(defaultCampaign)
                    item.status = ITEM_STATUS_ACTIVE
                    # Picture download failed; retry on the next poll.
                    if not item.is_pic_download:
                        pic_down_failed_num += 1
                        continue
                    item.db_create(db)
                    new_items.append(num_id)
            except Exception:
                logger.error("%s: %s creating failed %s", shop_id, num_id,
                             traceback.format_exc())
                continue

    logger.info(
        "shop %s crawler: new %s, back on line %s, black %s, "
        "pic download failed %s", shop_id, len(new_items), len(off2on_items),
        black_num, pic_down_failed_num)

    if offShelf_numids_set:
        # Take the items that disappeared from Taobao offline.
        db.execute(
            "update item set status=2 where num_id in (%s)" %
            ', '.join("'" + str(s) + "'" for s in offShelf_numids_set))
        logger.info("shop %s crawler: offline %s", shop_id,
                    len(offShelf_numids_set))

    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_NONE, shop_id)

    # The records below are for statistics; type: 0 = new, 1 = off shelf,
    # 2 = back on shelf.
    for num_id in new_items:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 0)
    for num_id in off2on_items:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 2)
    for num_id in offShelf_numids_set:
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) "
            "VALUES (%s, %s, now())", num_id, 1)
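# doCrawl is implemented elsewhere in the project (it fetches item details
# from Taobao in batch). The stub below only documents the return shape the
# loop above consumes: a list of dicts keyed by num_iid, title, item_url,
# price, pic_url and, optionally, volume. All sample values are fabricated.
def do_crawl_stub(shop_id, numids_set):
    return [{
        'num_iid': int(num_id),
        'title': u'sample item %s' % num_id,
        'item_url': 'http://item.taobao.com/item.htm?id=%s' % num_id,
        'price': '9.90',
        'pic_url': 'http://img.example.com/%s.jpg' % num_id,  # hypothetical
        'volume': 0,
    } for num_id in numids_set]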