示例#1
0
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl every new item of a single shop and sync item status to the DB.

    Args:
        shop: row tuple (id, type, url, ...) from the shop table.
        tb_category: category lookup helper exposing getCategoryPath().
        term_factory: term matcher passed to TaobaoItem.matchTaobaoTerms().
        db: torndb-style connection exposing execute().

    Side effects: inserts/updates rows in the item, shop and
    item_status_record tables.  Returns None.
    """
    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]

    # Default campaign is required for DSP delivery; bail out early when
    # absent.  Parameterized query instead of "%" string interpolation.
    defaultCampaign = list(
        db.execute(
            "select id, default_uctrac_price from campaign"
            " where shop_id=%s and system_status = 1 and delete_flag = 0",
            shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Workflow:
    #   1. mark shop crawl_status=2 (crawling)
    #   2. crawl
    #   3. mark shop crawl_status=0 (done) or back to "wait" on many failures
    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of this shop, collected from the new-item list.
    # Note: there may be several rows, each a comma separated id list.
    allTbNumIds = list(
        db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s",
                   shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(','))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id,
                shop_url, len(tb_numids_set))

    # Split ids into new / back-online / offline / known-dead.
    new_numids, back_online_numids, offline_numids, db_dead_numids = filterNumIds(
        db, shop_id, tb_numids_set)
    logger.info(
        "stat taobao shop %s: new_num:%s, back_online_num:%s, offline_num:%s",
        shop_id, len(new_numids), len(back_online_numids),
        len(offline_numids))

    pic_down_failed_num = 0
    success_num = 0
    failed_num = 0
    offline_num = 0
    dead_num = 0
    for num_id in new_numids:
        if num_id in db_dead_numids:
            dead_num += 1
            continue

        try:
            item = TaobaoItem(shop_id, 0, num_id)
            if shop_type == 2:
                item.is_tmall = True

            item.crawl_title()

            if not item.data:
                failed_num += 1
                logger.warning("crawler %s network connection failure",
                               num_id)
                continue
            if item.is_offline:
                db.execute(
                    "update item set status=2, modified=now() where shop_id=%s and num_id=%s",
                    shop_id, num_id)
                logger.warning("crawler %s off line", num_id)
                offline_num += 1
                continue

            item.status = ITEM_STATUS_ACTIVE
            item.setCampaign(defaultCampaign)
            if item.cid:
                item.category = tb_category.getCategoryPath(item.cid)
            item.termIds = item.matchTaobaoTerms(term_factory)
            item.setPicUrl()
            # Picture download failed: retry on the next polling round.
            if not item.is_pic_download:
                pic_down_failed_num += 1
                continue
            if item.volume < 0:
                item.volume = 0
            item.db_create(db)
            success_num += 1

        # Was a bare "except:"; narrowed so KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        except Exception:
            failed_num += 1
            logger.error("crawling %s unknown exception %s",
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
    logger.info(
        "shop %s crawler: success %s, failed %s, offline %s, pic download failed %s, dead %s",
        shop_id, success_num, failed_num, offline_num, pic_down_failed_num,
        dead_num)

    if back_online_numids:
        # The id list must be interpolated into the IN (...) clause; binding
        # the joined string as a single parameter quotes the whole list as
        # one value and the update matches nothing.  The ids are digit
        # strings taken from our own DB rows, not user input.
        db.execute(
            "update item set status=1 where shop_id=%s and num_id in ({0})".format(
                ', '.join("'" + str(s) + "'" for s in back_online_numids)),
            shop_id)
        logger.info("shop %s crawler: back online %s", shop_id,
                    len(back_online_numids))

    if offline_numids:
        db.execute(
            "update item set status=2 where shop_id=%s and num_id in ({0})".format(
                ', '.join("'" + str(s) + "'" for s in offline_numids)),
            shop_id)
        logger.info("shop %s crawler: offline %s", shop_id,
                    len(offline_numids))

    # Too many failures: schedule the shop for another crawl round.
    if failed_num > 5:
        db.execute("update shop set crawl_status=%s where id=%s",
                   SHOP_CRAWL_STATUS_WAIT, shop_id)
    else:
        db.execute("update shop set crawl_status=%s where id=%s",
                   SHOP_CRAWL_STATUS_NONE, shop_id)

    # Status change records for statistics; type=0: new, 1: off shelf,
    # 2: back online.  One loop replaces three copy-pasted ones.
    for num_id, record_type in (
            [(n, 0) for n in new_numids] +
            [(n, 2) for n in back_online_numids] +
            [(n, 1) for n in offline_numids]):
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) VALUES (%s, %s, now())",
            num_id, record_type)
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl every new item of a single shop and sync item status to the DB.

    Args:
        shop: row tuple (id, type, url, ...) from the shop table.
        tb_category: category lookup helper exposing getCategoryPath().
        term_factory: term matcher passed to TaobaoItem.matchTaobaoTerms().
        db: torndb-style connection exposing execute().

    Side effects: inserts/updates rows in the item, shop and
    item_status_record tables.  Returns None.
    """
    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]

    # Default campaign is required for DSP delivery; bail out early when
    # absent.  Parameterized query instead of "%" string interpolation.
    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign"
        " where shop_id=%s and system_status = 1 and delete_flag = 0",
        shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Workflow:
    #   1. mark shop crawl_status=2 (crawling)
    #   2. crawl
    #   3. mark shop crawl_status=0 (done) or back to "wait" on many failures
    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of this shop, collected from the new-item list.
    # Note: there may be several rows, each a comma separated id list.
    allTbNumIds = list(db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(','))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

    # Split ids into new / back-online / offline / known-dead.
    new_numids, back_online_numids, offline_numids, db_dead_numids = filterNumIds(db, shop_id, tb_numids_set)
    logger.info("stat taobao shop %s: new_num:%s, back_online_num:%s, offline_num:%s",
                shop_id, len(new_numids), len(back_online_numids), len(offline_numids))

    pic_down_failed_num = 0
    success_num = 0
    failed_num = 0
    offline_num = 0
    dead_num = 0
    for num_id in new_numids:
        if num_id in db_dead_numids:
            dead_num += 1
            continue

        try:
            item = TaobaoItem(shop_id, 0, num_id)
            if shop_type == 2:
                item.is_tmall = True

            item.crawl_title()

            if not item.data:
                failed_num += 1
                logger.warning("crawler %s network connection failure", num_id)
                continue
            if item.is_offline:
                db.execute("update item set status=2, modified=now() where shop_id=%s and num_id=%s", shop_id, num_id)
                logger.warning("crawler %s off line", num_id)
                offline_num += 1
                continue

            item.status = ITEM_STATUS_ACTIVE
            item.setCampaign(defaultCampaign)
            if item.cid:
                item.category = tb_category.getCategoryPath(item.cid)
            item.termIds = item.matchTaobaoTerms(term_factory)
            item.setPicUrl()
            # Picture download failed: retry on the next polling round.
            if not item.is_pic_download:
                pic_down_failed_num += 1
                continue
            if item.volume < 0:
                item.volume = 0
            item.db_create(db)
            success_num += 1

        # Was a bare "except:"; narrowed so KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        except Exception:
            failed_num += 1
            logger.error("crawling %s unknown exception %s", num_id, traceback.format_exc(),
                         extra={'tags': ['crawlItemException', ]})
    logger.info("shop %s crawler: success %s, failed %s, offline %s, pic download failed %s, dead %s",
                shop_id, success_num, failed_num, offline_num, pic_down_failed_num, dead_num)

    if back_online_numids:
        # The id list must be interpolated into the IN (...) clause; binding
        # the joined string as a single parameter quotes the whole list as
        # one value and the update matches nothing.  The ids are digit
        # strings taken from our own DB rows, not user input.
        db.execute("update item set status=1 where shop_id=%s and num_id in ({0})".format(
            ', '.join("'" + str(s) + "'" for s in back_online_numids)), shop_id)
        logger.info("shop %s crawler: back online %s", shop_id, len(back_online_numids))

    if offline_numids:
        db.execute("update item set status=2 where shop_id=%s and num_id in ({0})".format(
            ', '.join("'" + str(s) + "'" for s in offline_numids)), shop_id)
        logger.info("shop %s crawler: offline %s", shop_id, len(offline_numids))

    # Too many failures: schedule the shop for another crawl round.
    if failed_num > 5:
        db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_WAIT, shop_id)
    else:
        db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)

    # Status change records for statistics; type=0: new, 1: off shelf,
    # 2: back online.  One loop replaces three copy-pasted ones.
    for num_id, record_type in (
            [(n, 0) for n in new_numids] +
            [(n, 2) for n in back_online_numids] +
            [(n, 1) for n in offline_numids]):
        db.execute("INSERT INTO item_status_record (num_id,type,create_time) VALUES (%s, %s, now())",
                   num_id, record_type)
示例#3
0
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl the new items of one shop in batch and sync their DB status.

    Args:
        shop: row tuple (id, type, url, term_limits, ...) from the shop table.
        tb_category: category lookup helper (currently unused here).
        term_factory: term matcher passed to TaobaoItem.matchTaobaoTerms().
        db: torndb-style connection exposing execute().

    Side effects: inserts/updates rows in the item, shop and
    item_status_record tables.  Returns None.
    """
    import re  # local import: only needed for the spm parameter rewrite below

    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not in use yet.

    # Parameterized query instead of "%" string interpolation.
    defaultCampaign = list(
        db.execute(
            "select id, default_uctrac_price from campaign"
            " where shop_id=%s and system_status = 1 and delete_flag = 0",
            shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Workflow:
    #   1. mark shop crawl_status=2 (crawling)
    #   2. crawl
    #   3. mark shop crawl_status=0 (done)
    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of this shop, collected from the new-item list.
    # Note: there may be several rows, each a comma separated id list.
    allTbNumIds = list(
        db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s",
                   shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(","))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id,
                shop_url, len(tb_numids_set))

    # Split ids into new / off-shelf / unchanged.
    new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(
        db, shop_id, tb_numids_set)
    logger.info(
        "stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
        shop_id, len(new_numids_set), len(offShelf_numids_set),
        len(common_numids_set))

    new_items = []
    off2on_items = []
    black_num = 0
    pic_down_failed_num = 0
    if new_numids_set:
        new_item_list = doCrawl(shop_id, new_numids_set)
        for dict_item in (new_item_list or []):
            num_id = str(dict_item["num_iid"])
            tb_title = dict_item["title"].encode("utf-8")
            tb_detail_url = dict_item["item_url"].encode("utf-8")
            tb_price = float(dict_item["price"])
            if dict_item["pic_url"]:
                tb_pic_url = str(dict_item["pic_url"])
            else:
                # warn() is a deprecated alias of warning().
                logger.warning("taobao item %s not pic_url", tb_detail_url)
                continue
            # dict.get replaces has_key(), which was removed in Python 3.
            volume = dict_item.get("volume", 0)
            try:
                # Did this item come back on shelf?  num_id is bound as a
                # parameter instead of being quoted into the SQL by hand.
                db_item = list(
                    db.execute(
                        "select id, title, pic_url, local_pic_url, price,"
                        " manual_set, status, category, volume from item"
                        " where shop_id=%s and num_id=%s and status!=1",
                        shop_id, num_id))
                if db_item:
                    # Known (offline) item: bring it back, refresh changed fields.
                    db_volume = db_item[0][8]
                    db_status = int(db_item[0][6])
                    db_manual_set = int(db_item[0][5])
                    db_price = float(db_item[0][4])
                    db_local_pic_url = db_item[0][3]
                    db_pic_url = db_item[0][2]
                    db_title = db_item[0][1].encode("utf-8")
                    db_item_id = int(db_item[0][0])

                    if db_status == ITEM_STATUS_BLACKLIST:
                        black_num += 1
                        continue

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    # Mark online first, then check whether other fields changed.
                    item.status = ITEM_STATUS_ACTIVE

                    if db_volume < volume:
                        item.volume = volume

                    if db_manual_set == 1:
                        # Picture and title were set manually: only refresh
                        # the price and re-download a missing image.
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                        if not imgExists(shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue
                    else:
                        if tb_title != db_title:
                            item.title = tb_title
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                        # Re-download when the picture url changed or the
                        # local file is gone.
                        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue

                    # TODO: an offline dbItem may need its terms re-matched;
                    # item_term rows were kept on offline but other channels
                    # might have deleted them.  Handle when required.

                    item.db_update(db)
                    off2on_items.append(num_id)
                else:
                    # Brand new item.
                    item = TaobaoItem(shop_id, 0, num_id)
                    item.title = tb_title
                    # The pattern is a regex, but str.replace() only does
                    # literal substitution, so the spm tracking parameter was
                    # never actually rewritten; re.sub applies it as intended.
                    item.detail_url = re.sub(r"spm=(\.|\d)*",
                                             "spm=2014.12669715.0.0",
                                             tb_detail_url)
                    item.price = tb_price
                    item.volume = volume
                    item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                    item.setPicUrl(tb_pic_url)
                    item.setCampaign(defaultCampaign)
                    item.status = ITEM_STATUS_ACTIVE

                    # Picture download failed: retry on the next polling round.
                    if not item.is_pic_download:
                        pic_down_failed_num += 1
                        continue
                    item.db_create(db)
                    new_items.append(num_id)
            # Was a bare "except:"; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            except Exception:
                logger.error("%s: %s creating failed %s", shop_id, num_id,
                             traceback.format_exc())
                continue
    logger.info(
        "shop %s crawler: new %s, back on line %s, black %s, pic download failed %s",
        shop_id,
        len(new_items),
        len(off2on_items),
        black_num,
        pic_down_failed_num,
    )

    if offShelf_numids_set:
        # Variable-length IN list has to be interpolated; the ids are digit
        # strings coming from our own DB rows, not user input.
        db.execute(
            "update item set status=2 where num_id in (%s)" % ", ".join("'" + str(s) + "'" for s in offShelf_numids_set)
        )
    logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))

    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)

    # Status change records for statistics; type=0: new, 1: off shelf,
    # 2: back online.  One loop replaces three copy-pasted ones.
    for num_id, record_type in (
            [(n, 0) for n in new_items] +
            [(n, 2) for n in off2on_items] +
            [(n, 1) for n in offShelf_numids_set]):
        db.execute("INSERT INTO item_status_record (num_id,type,create_time) VALUES (%s, %s, now())",
                   num_id, record_type)
示例#4
0
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl the new items of one shop in batch and sync their DB status.

    Args:
        shop: row tuple (id, type, url, term_limits, ...) from the shop table.
        tb_category: category lookup helper (currently unused here).
        term_factory: term matcher passed to TaobaoItem.matchTaobaoTerms().
        db: torndb-style connection exposing execute().

    Side effects: inserts/updates rows in the item, shop and
    item_status_record tables.  Returns None.
    """
    import re  # local import: only needed for the spm parameter rewrite below

    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not in use yet.

    # Parameterized query instead of "%" string interpolation.
    defaultCampaign = list(
        db.execute(
            "select id, default_uctrac_price from campaign"
            " where shop_id=%s and system_status = 1 and delete_flag = 0",
            shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Workflow:
    #   1. mark shop crawl_status=2 (crawling)
    #   2. crawl
    #   3. mark shop crawl_status=0 (done)
    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of this shop, collected from the new-item list.
    # Note: there may be several rows, each a comma separated id list.
    allTbNumIds = list(
        db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s",
                   shop_id))
    tb_numids = []
    for ids in allTbNumIds:
        tb_numids.extend(ids[0].split(','))
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id,
                shop_url, len(tb_numids_set))

    # Split ids into new / off-shelf / unchanged.
    new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(
        db, shop_id, tb_numids_set)
    logger.info(
        "stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
        shop_id, len(new_numids_set), len(offShelf_numids_set),
        len(common_numids_set))

    new_items = []
    off2on_items = []
    black_num = 0
    pic_down_failed_num = 0
    if new_numids_set:
        new_item_list = doCrawl(shop_id, new_numids_set)
        for dict_item in (new_item_list or []):
            num_id = str(dict_item['num_iid'])
            tb_title = dict_item['title'].encode('utf-8')
            tb_detail_url = dict_item['item_url'].encode('utf-8')
            tb_price = float(dict_item['price'])
            if dict_item['pic_url']:
                tb_pic_url = str(dict_item['pic_url'])
            else:
                # warn() is a deprecated alias of warning().
                logger.warning("taobao item %s not pic_url", tb_detail_url)
                continue
            # dict.get replaces has_key(), which was removed in Python 3.
            volume = dict_item.get('volume', 0)
            try:
                # Did this item come back on shelf?  num_id is bound as a
                # parameter instead of being quoted into the SQL by hand.
                db_item = list(
                    db.execute(
                        "select id, title, pic_url, local_pic_url, price,"
                        " manual_set, status, category, volume from item"
                        " where shop_id=%s and num_id=%s and status!=1",
                        shop_id, num_id))
                if db_item:
                    # Known (offline) item: bring it back, refresh changed fields.
                    db_volume = db_item[0][8]
                    db_status = int(db_item[0][6])
                    db_manual_set = int(db_item[0][5])
                    db_price = float(db_item[0][4])
                    db_local_pic_url = db_item[0][3]
                    db_pic_url = db_item[0][2]
                    db_title = db_item[0][1].encode('utf-8')
                    db_item_id = int(db_item[0][0])

                    if db_status == ITEM_STATUS_BLACKLIST:
                        black_num += 1
                        continue

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    # Mark online first, then check whether other fields changed.
                    item.status = ITEM_STATUS_ACTIVE

                    if db_volume < volume:
                        item.volume = volume

                    if db_manual_set == 1:
                        # Picture and title were set manually: only refresh
                        # the price and re-download a missing image.
                        if tb_price != db_price and quickUpdatePrice(
                                db_item_id, db):
                            item.price = tb_price
                        if not imgExists(shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue
                    else:
                        if tb_title != db_title:
                            item.title = tb_title
                        if tb_price != db_price and quickUpdatePrice(
                                db_item_id, db):
                            item.price = tb_price
                        # Re-download when the picture url changed or the
                        # local file is gone.
                        if tb_pic_url != db_pic_url or not imgExists(
                                shop_id, db_local_pic_url):
                            item.detail_url = tb_detail_url
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                            if not item.is_pic_download:
                                pic_down_failed_num += 1
                                continue

                    # TODO: an offline dbItem may need its terms re-matched;
                    # item_term rows were kept on offline but other channels
                    # might have deleted them.  Handle when required.

                    item.db_update(db)
                    off2on_items.append(num_id)
                else:
                    # Brand new item.
                    item = TaobaoItem(shop_id, 0, num_id)
                    item.title = tb_title
                    # The pattern is a regex, but str.replace() only does
                    # literal substitution, so the spm tracking parameter was
                    # never actually rewritten; re.sub applies it as intended.
                    item.detail_url = re.sub(r"spm=(\.|\d)*",
                                             "spm=2014.12669715.0.0",
                                             tb_detail_url)
                    item.price = tb_price
                    item.volume = volume
                    item.termIds = item.matchTaobaoTerms(
                        term_factory, str(shop_termLimits))
                    item.setPicUrl(tb_pic_url)
                    item.setCampaign(defaultCampaign)
                    item.status = ITEM_STATUS_ACTIVE

                    # Picture download failed: retry on the next polling round.
                    if not item.is_pic_download:
                        pic_down_failed_num += 1
                        continue
                    item.db_create(db)
                    new_items.append(num_id)
            # Was a bare "except:"; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            except Exception:
                logger.error("%s: %s creating failed %s", shop_id, num_id,
                             traceback.format_exc())
                continue
    logger.info(
        "shop %s crawler: new %s, back on line %s, black %s, pic download failed %s",
        shop_id, len(new_items), len(off2on_items), black_num,
        pic_down_failed_num)

    if offShelf_numids_set:
        # Variable-length IN list has to be interpolated; the ids are digit
        # strings coming from our own DB rows, not user input.
        db.execute("update item set status=2 where num_id in (%s)" %
                   ', '.join("'" + str(s) + "'" for s in offShelf_numids_set))
    logger.info("shop %s crawler: offline %s", shop_id,
                len(offShelf_numids_set))

    db.execute("update shop set crawl_status=%s where id=%s",
               SHOP_CRAWL_STATUS_NONE, shop_id)

    # Status change records for statistics; type=0: new, 1: off shelf,
    # 2: back online.  One loop replaces three copy-pasted ones.
    for num_id, record_type in (
            [(n, 0) for n in new_items] +
            [(n, 2) for n in off2on_items] +
            [(n, 1) for n in offShelf_numids_set]):
        db.execute(
            "INSERT INTO item_status_record (num_id,type,create_time) VALUES (%s, %s, now())",
            num_id, record_type)