Пример #1
def crawl_one_shop(shop, tb_category, term_factory, db):
    # Synchronise one shop's items with the num ids recorded in tb_shop_item.
    #
    # Visible steps: check for a default campaign, mark the shop as crawling,
    # split the shop's num ids into new / off-shelf / common sets with
    # filterNumIds(), run doCrawl() over the new and the common ids, set the
    # off-shelf items to status=2, and finally reset the shop's crawl_status.
    #
    # Parameters:
    #   shop         -- indexable row: [0]=id, [1]=type, [2]=url, [3]=term limits
    #   tb_category  -- object providing getCategoryPath(cid)
    #   term_factory -- handed through to TaobaoItem.matchTaobaoTerms()
    #   db           -- object whose execute() runs SQL and returns iterable rows
    #
    # NOTE(review): this listing appears to have lost lines (an empty loop
    # body, missing else / try / except branches, stray non-Python text), so it
    # does not compile as shown. The comments describe only what is visible.
    # NOTE(review): dict.has_key() is used below, so this targets Python 2.
    # NOTE(review): no call that persists the TaobaoItem objects built below is
    # visible in this listing.
    shop_id = shop[0]
    shop_type = shop[1]  # read but not used anywhere in the visible code
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not used yet.

    # Look up the shop's active, non-deleted campaign. shop_id is formatted
    # straight into the SQL text here, unlike the parameterised calls below.
    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0" % shop_id))
    if not defaultCampaign:
        # NOTE(review): only logs; no return is visible, so the crawl proceeds.
        logger.error("can not get the default campaign for shop: %s", shop_id)

        # NOTE(review): the next two lines are not valid Python. They look like
        # the remains of a numbered step list (step 2 is absent) -- confirm
        # against the original source.
        1.setting shop crawl_status=2
        3.setting shop crawl_status=0
    # Flag the shop as being crawled for the duration of this function.
    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of the shop, obtained by crawling the new-item list.
    # Note: there may be more than one record.
    allTbNumIds = list(db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id))
    tb_numids = []
    # NOTE(review): the loop body is missing, so tb_numids is never filled in
    # the visible code; presumably each row's itemids was parsed and appended.
    for ids in allTbNumIds:
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

    # Filter: split the ids into new, off-shelf and already-known (common).
    new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)
    logger.info("stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s" % (shop_id, len(new_numids_set), len(offShelf_numids_set), len(common_numids_set)))

    # Counters reported in the summary log line after the new-item pass.
    new_num = 0
    off2On_num = 0
    black_num = 0
    if len(new_numids_set) > 0:
        new_item_list = doCrawl(shop_id, new_numids_set)
        if new_item_list:
            for dict_item in new_item_list:
                # Fields taken from the crawled item dict.
                num_id = str(dict_item['num_iid'])
                n_cid = dict_item['cid']
                tb_title = dict_item['title']
                tb_detail_url = str(dict_item['detail_url'])
                tb_price = float(dict_item['price'])
                tb_pic_url = str(dict_item['pic_url'])

                volume = 0
                # NOTE(review): as indented here, the DB lookup and all the
                # processing below only run when 'volume' is present. This
                # nesting looks like an artefact (a `try:` line may be
                # missing) -- confirm.
                if dict_item.has_key('volume'):
                    volume = dict_item['volume']
                    # Is this num id already stored for the shop?
                    db_item = list(db.execute(
                        "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id='%s'" % (shop_id, num_id)))
                    if db_item:
                        # Unpack the first matching row by column position.
                        db_status = int(db_item[0][6])
                        db_manual_set = int(db_item[0][5])
                        db_price = float(db_item[0][4])
                        db_local_pic_url = db_item[0][3]
                        db_pic_url = db_item[0][2]
                        db_title = db_item[0][1]
                        db_item_id = int(db_item[0][0])

                        # NOTE(review): blacklisted items are only counted; no
                        # `continue` is visible, so processing goes on.
                        if db_status == ITEM_STATUS_BLACKLIST:
                            black_num += 1

                        item = TaobaoItem(shop_id, db_item_id, num_id)
                        item.status = ITEM_STATUS_ACTIVE     # mark online first, then check whether other attributes changed

                        # Picture and title were set manually.
                        # NOTE(review): the else branches of this section seem
                        # to be missing, which leaves the price check repeated
                        # three times on one path -- confirm the structure.
                        if db_manual_set == 1:
                            if not imgExists(shop_id, db_local_pic_url):
                                # Picture is missing: download it again, and
                                # check the price as well.
                                item.local_pic_url = db_local_pic_url
                                if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                    item.price = tb_price
                            # Picture exists: only check the price.
                                if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                    item.price = tb_price
                            if tb_title != db_title:
                                item.title = tb_title
                            if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                item.price = tb_price
                                # Picture URL changed, or the original picture
                                # no longer exists: download again either way.
                            if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                                item.local_pic_url = db_local_pic_url

                        # TODO
                        # dbItem is offline, so terms may need to be re-matched.
                        # The item_term rows were not deleted when it went
                        # offline, but another channel may have removed them;
                        # handle this later if the need arises.

                        off2On_num += 1
                        # NOTE(review): a fresh item with id 0 is built right
                        # after the existing-row handling; this reads like the
                        # else branch for unknown num ids -- confirm.
                        item = TaobaoItem(shop_id, 0, num_id)
                        item.title = tb_title
                        # NOTE(review): str.replace() substitutes literal text,
                        # so this regex-looking pattern only matches if the URL
                        # contains it verbatim; re.sub may have been intended.
                        item.detail_url = tb_detail_url.replace("spm=(\\.|\\d)*", "spm=2014.12669715.0.0")
                        item.price = tb_price
                        item.pic_url = tb_pic_url
                        item.volume = volume
                        item.category = tb_category.getCategoryPath(n_cid)      # --->
                        item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))    # --->
                        item.status = ITEM_STATUS_ACTIVE

                        new_num += 1
                    # NOTE(review): traceback.format_exc() suggests this line
                    # belonged to an `except` clause that is not visible.
                    logger.error("%s: %s creating failed %s", shop_id, num_id, traceback.format_exc())
    logger.info("shop %s crawler: new %s, back on line %s, black %s", shop_id, new_num, off2On_num, black_num)

    # Set every off-shelf num id to status=2. The IN list is assembled by
    # string concatenation and the WHERE clause has no shop_id filter.
    if offShelf_numids_set:
        db.execute("update item set status=2 where num_id in (%s)" % ', '.join("'" + str(s) + "'" for s in offShelf_numids_set))
    logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))

    # In the original logic, items that had already been crawled were filtered
    # out and left unprocessed.
    # To refresh title/price/pic_url faster, this section can be enabled; it
    # guarantees that everything is refreshed at least once within 4 hours.
    update_num = 0
    if common_numids_set:
        #validate price pic_url
        common_item_list = doCrawl(shop_id, common_numids_set)
        if common_item_list:
            for dict_item in common_item_list:
                num_id = str(dict_item['num_iid'])
                tb_title = dict_item['title']
                tb_price = float(dict_item['price'])
                tb_pic_url = str(dict_item['pic_url'])
                # Only rows with status = 1 are considered. num_id is
                # interpolated unquoted here but quoted in the new-item query.
                db_item = list(db.execute("select id, title, pic_url, local_pic_url, price, manual_set, volume from item where shop_id=%s and num_id=%s and status = 1" % (shop_id, num_id)))
                if db_item:
                    # Unpack the first matching row by column position.
                    db_volume = int(db_item[0][6])
                    db_manual_set = int(db_item[0][5])
                    db_price = float(db_item[0][4])
                    db_local_pic_url = db_item[0][3]
                    db_pic_url = db_item[0][2]
                    db_title = db_item[0][1]
                    db_item_id = db_item[0][0]

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    is_update = False
                    # NOTE(review): every check is nested under manual_set == 1
                    # and the price check appears twice; an `else:` between
                    # them seems to be missing -- confirm.
                    if db_manual_set == 1:
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                            is_update = True
                        if dict_item.has_key('volume'):
                            if int(dict_item['volume']) != db_volume:
                                item.volume = int(dict_item['volume'])
                                is_update = True
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                            is_update = True
                        if tb_title != db_title:
                            item.title = tb_title
                            is_update = True
                        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            is_update = True

                    # Count items for which at least one field was changed.
                    if is_update:
                        update_num += 1

        logger.info("shop %s: common %s, update %s ", shop_id, len(common_numids_set), update_num)

    # Crawl finished: clear the shop's crawling flag.
    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)
Пример #2
def crawl_one_shop(shop, tb_category, term_factory, db):
    # Synchronise one shop's items with the num ids recorded in tb_shop_item.
    #
    # Visible steps: check for a default campaign, mark the shop as crawling,
    # split the shop's num ids into new / off-shelf / common sets with
    # filterNumIds(), run doCrawl() over the new and the common ids, set the
    # off-shelf items to status=2, and finally reset the shop's crawl_status.
    #
    # Parameters:
    #   shop         -- indexable row: [0]=id, [1]=type, [2]=url, [3]=term limits
    #   tb_category  -- object providing getCategoryPath(cid)
    #   term_factory -- handed through to TaobaoItem.matchTaobaoTerms()
    #   db           -- object whose execute() runs SQL and returns iterable rows
    #
    # NOTE(review): this listing appears to have lost lines (an empty loop
    # body, missing else / try / except branches, stray non-Python text), so it
    # does not compile as shown. The comments describe only what is visible.
    # NOTE(review): dict.has_key() is used below, so this targets Python 2.
    # NOTE(review): no call that persists the TaobaoItem objects built below is
    # visible in this listing.
    shop_id = shop[0]
    shop_type = shop[1]  # read but not used anywhere in the visible code
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not used yet.

    # Look up the shop's active, non-deleted campaign. shop_id is formatted
    # straight into the SQL text here, unlike the parameterised calls below.
    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0" % shop_id))
    if not defaultCampaign:
        # NOTE(review): only logs; no return is visible, so the crawl proceeds.
        logger.error("can not get the default campaign for shop: %s", shop_id)

        # NOTE(review): the next two lines are not valid Python. They look like
        # the remains of a numbered step list (step 2 is absent) -- confirm
        # against the original source.
        1.setting shop crawl_status=2
        3.setting shop crawl_status=0
    # Flag the shop as being crawled for the duration of this function.
    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)

    # All item num ids of the shop, obtained by crawling the new-item list.
    # Note: there may be more than one record.
    allTbNumIds = list(db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id))
    tb_numids = []
    # NOTE(review): the loop body is missing, so tb_numids is never filled in
    # the visible code; presumably each row's itemids was parsed and appended.
    for ids in allTbNumIds:
    tb_numids_set = set(tb_numids)
    logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

    # Filter: split the ids into new, off-shelf and already-known (common).
    new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)
    logger.info("stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s" % (shop_id, len(new_numids_set), len(offShelf_numids_set), len(common_numids_set)))

    # Counters reported in the summary log line after the new-item pass.
    new_num = 0
    off2On_num = 0
    black_num = 0
    if len(new_numids_set) > 0:
        new_item_list = doCrawl(shop_id, new_numids_set)
        if new_item_list:
            for dict_item in new_item_list:
                # Fields taken from the crawled item dict.
                num_id = str(dict_item['num_iid'])
                n_cid = dict_item['cid']
                tb_title = dict_item['title']
                tb_detail_url = str(dict_item['detail_url'])
                tb_price = float(dict_item['price'])
                tb_pic_url = str(dict_item['pic_url'])

                volume = 0
                # NOTE(review): as indented here, the DB lookup and all the
                # processing below only run when 'volume' is present. This
                # nesting looks like an artefact (a `try:` line may be
                # missing) -- confirm.
                if dict_item.has_key('volume'):
                    volume = dict_item['volume']
                    # Is this num id already stored for the shop?
                    db_item = list(db.execute(
                        "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id='%s'" % (shop_id, num_id)))
                    if db_item:
                        # Unpack the first matching row by column position.
                        db_status = int(db_item[0][6])
                        db_manual_set = int(db_item[0][5])
                        db_price = float(db_item[0][4])
                        db_local_pic_url = db_item[0][3]
                        db_pic_url = db_item[0][2]
                        db_title = db_item[0][1]
                        db_item_id = int(db_item[0][0])

                        # NOTE(review): blacklisted items are only counted; no
                        # `continue` is visible, so processing goes on.
                        if db_status == ITEM_STATUS_BLACKLIST:
                            black_num += 1

                        item = TaobaoItem(shop_id, db_item_id, num_id)
                        item.status = ITEM_STATUS_ACTIVE     # mark online first, then check whether other attributes changed

                        # Picture and title were set manually.
                        # NOTE(review): the else branches of this section seem
                        # to be missing, which leaves the price check repeated
                        # three times on one path -- confirm the structure.
                        if db_manual_set == 1:
                            if not imgExists(shop_id, db_local_pic_url):
                                # Picture is missing: download it again, and
                                # check the price as well.
                                item.local_pic_url = db_local_pic_url
                                if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                    item.price = tb_price
                            # Picture exists: only check the price.
                                if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                    item.price = tb_price
                            if tb_title != db_title:
                                item.title = tb_title
                            if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                                item.price = tb_price
                                # Picture URL changed, or the original picture
                                # no longer exists: download again either way.
                            if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                                item.local_pic_url = db_local_pic_url

                        # TODO
                        # dbItem is offline, so terms may need to be re-matched.
                        # The item_term rows were not deleted when it went
                        # offline, but another channel may have removed them;
                        # handle this later if the need arises.

                        off2On_num += 1
                        # NOTE(review): a fresh item with id 0 is built right
                        # after the existing-row handling; this reads like the
                        # else branch for unknown num ids -- confirm.
                        item = TaobaoItem(shop_id, 0, num_id)
                        item.title = tb_title
                        # NOTE(review): str.replace() substitutes literal text,
                        # so this regex-looking pattern only matches if the URL
                        # contains it verbatim; re.sub may have been intended.
                        item.detail_url = tb_detail_url.replace("spm=(\\.|\\d)*", "spm=2014.12669715.0.0")
                        item.price = tb_price
                        item.pic_url = tb_pic_url
                        item.volume = volume
                        item.category = tb_category.getCategoryPath(n_cid)      # --->
                        item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))    # --->
                        item.status = ITEM_STATUS_ACTIVE

                        new_num += 1
                    # NOTE(review): traceback.format_exc() suggests this line
                    # belonged to an `except` clause that is not visible.
                    logger.error("%s: %s creating failed %s", shop_id, num_id, traceback.format_exc())
    logger.info("shop %s crawler: new %s, back on line %s, black %s", shop_id, new_num, off2On_num, black_num)

    # Set every off-shelf num id to status=2. The IN list is assembled by
    # string concatenation and the WHERE clause has no shop_id filter.
    if offShelf_numids_set:
        db.execute("update item set status=2 where num_id in (%s)" % ', '.join("'" + str(s) + "'" for s in offShelf_numids_set))
    logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))

    # In the original logic, items that had already been crawled were filtered
    # out and left unprocessed.
    # To refresh title/price/pic_url faster, this section can be enabled; it
    # guarantees that everything is refreshed at least once within 4 hours.
    update_num = 0
    if common_numids_set:
        #validate price pic_url
        common_item_list = doCrawl(shop_id, common_numids_set)
        if common_item_list:
            for dict_item in common_item_list:
                num_id = str(dict_item['num_iid'])
                tb_title = dict_item['title']
                tb_price = float(dict_item['price'])
                tb_pic_url = str(dict_item['pic_url'])
                # Only rows with status = 1 are considered. num_id is
                # interpolated unquoted here but quoted in the new-item query.
                db_item = list(db.execute("select id, title, pic_url, local_pic_url, price, manual_set, volume from item where shop_id=%s and num_id=%s and status = 1" % (shop_id, num_id)))
                if db_item:
                    # Unpack the first matching row by column position.
                    db_volume = int(db_item[0][6])
                    db_manual_set = int(db_item[0][5])
                    db_price = float(db_item[0][4])
                    db_local_pic_url = db_item[0][3]
                    db_pic_url = db_item[0][2]
                    db_title = db_item[0][1]
                    db_item_id = db_item[0][0]

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    is_update = False
                    # NOTE(review): every check is nested under manual_set == 1
                    # and the price check appears twice; an `else:` between
                    # them seems to be missing -- confirm.
                    if db_manual_set == 1:
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                            is_update = True
                        if dict_item.has_key('volume'):
                            if int(dict_item['volume']) != db_volume:
                                item.volume = int(dict_item['volume'])
                                is_update = True
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                            is_update = True
                        if tb_title != db_title:
                            item.title = tb_title
                            is_update = True
                        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            is_update = True

                    # Count items for which at least one field was changed.
                    if is_update:
                        update_num += 1

        logger.info("shop %s: common %s, update %s ", shop_id, len(common_numids_set), update_num)

    # Crawl finished: clear the shop's crawling flag.
    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)
Пример #3
def crawl_shop(sql):
    # Run `sql` to obtain shop rows and process each shop in turn.
    #
    # For every shop the visible code checks for a default campaign, loads the
    # shop's num ids from tb_shop_item, splits them with filterNumIds() into
    # new / off-shelf / common sets, and builds TaobaoItem objects from the
    # doCrawl() results. All writes are still "TODO db ..." placeholders or
    # commented out, so nothing is persisted in the visible code.
    #
    # Parameters:
    #   sql -- statement passed to db.execute(); each row is indexed as
    #          [0]=id, [1]=type, [2]=url, [3]=term limits
    #
    # NOTE(review): this listing has lost lines: several calls are left with
    # unbalanced parentheses and some `for` / `if` headers have no body, so it
    # does not compile as shown. The comments describe only what is visible.
    # NOTE(review): dict.has_key() is used below, so this targets Python 2.
    db = get_db_engine()
    shops = db.execute(sql)

    # debug
    # NOTE(review): only the import is visible; no pdb call follows it here.
    if FLAGS.debug_parser:
        import pdb

    # Shared by every shop processed in the loop below.
    tb_category = TaobaoCategory()
    term_factory = TermFactory()

    for shop in shops:
        shop_id = shop[0]
        shop_type = shop[1]  # read but not used anywhere in the visible code
        shop_url = shop[2]
        shop_termLimits = shop[3]

        # NOTE(review): a `db.execute(` wrapper appears to be missing around
        # the SQL string, leaving one closing parenthesis too many.
        defaultCampaign = list(
                "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0"
                % shop_id))
        if not defaultCampaign:
            # NOTE(review): this call is never closed in the listing.
            logger.error("can not get the default campaign for shop: %s",

        #1. update shop crawl_status=2
        #3. update shop crawl_status=0
        #db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)

        # All item num ids of the shop.
        allTbNumIds = db.execute(
            "SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id)
        tb_numids = []
        # NOTE(review): the loop body is missing, so tb_numids is never filled
        # in the visible code.
        for ids in allTbNumIds:
        tb_numids_set = set(tb_numids)

        # Filter: split the ids into new, off-shelf and already-known (common).
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(
            db, shop_id, tb_numids_set)

        # NOTE(review): the new-item crawl runs only when the set is EMPTY;
        # crawl_one_shop tests `> 0` for the same step, so `== 0` looks
        # inverted -- confirm.
        if len(new_numids_set) == 0:
            logger.info("crawling shop: %s %s", shop_id, shop_url)

            new_item_list = doCrawl(shop_id, new_numids_set)
            if new_item_list:
                new_num = 0
                new_numiid = []  # never appended to in the visible code
                update_num = 0
                update_numiid = []  # never appended to in the visible code
                for dict_item in new_item_list:
                    # Fields taken from the crawled item dict.
                    num_id = str(dict_item['num_iid'])
                    n_cid = dict_item['cid']
                    tb_title = dict_item['title']
                    tb_detail_url = str(dict_item['detail_url'])
                    tb_price = float(dict_item['price'])
                    tb_pic_url = str(dict_item['pic_url'])

                    volume = 0
                    if dict_item.has_key('volume'):
                        volume = dict_item['volume']

                    # Is this num id already stored for the shop?
                    # NOTE(review): the `db.execute(` wrapper appears to be
                    # missing here as well.
                    db_item = list(
                            "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s"
                            % (shop_id, str(num_id))))
                    if db_item:
                        # Unpack the first matching row by column position.
                        db_status = db_item[0][6]
                        db_manual_set = db_item[0][5]
                        db_price = db_item[0][4]
                        db_local_pic_url = db_item[0][3]
                        db_pic_url = db_item[0][2]
                        db_title = db_item[0][1]
                        db_item_id = db_item[0][0]

                        # NOTE(review): the body of the blacklist check is
                        # missing from the listing.
                        if db_status == ITEM_STATUS_BLACKLIST:
                        if db_manual_set == 1:
                            # Picture and title were set manually.
                            if not imgExists(shop_id, db_local_pic_url):
                                # Old picture is missing: download it again.
                                # NOTE(review): the constructor call below is
                                # never closed in the listing.
                                tb_item = TaobaoItem(shop_id, db_item_id,
                                tb_item.setPicUrl(tb_pic_url, db_pic_url)
                                if tb_price != db_price:
                                    tb_item.price = tb_price

                                # TODO db update

                            # NOTE(review): the arguments before
                            # dict_item['pic_url'] look truncated, and an
                            # `else:` seems to be missing above -- confirm.
                            tb_item = TaobaoItem(shop_id, db_item_id, num_id,
                                                 dict_item['pic_url'], volume)
                            tb_item.setPicUrl(dict_item['pic_url'], db_pic_url)


                            # TODO db update
                        update_num += 1
                        # NOTE(review): a fresh item with id 0 is built right
                        # after the existing-row handling; this reads like the
                        # else branch for unknown num ids -- confirm.
                        tb_item = TaobaoItem(shop_id, 0, num_id, tb_title,
                                             tb_detail_url, tb_price,
                                             dict_item['pic_url'], volume)
                        tb_item.category = tb_category.getCategoryPath(n_cid)
                        tb_item.termIds = tb_item.matchTaobaoTerms(
                            term_factory, str(shop_termLimits))
                        tb_item.setPicUrl(dict_item['pic_url'], "")


                        # TODO db add

                        new_num += 1

                logger.info("shop %s new item num=%s,update item num=%s",
                            shop_id, new_num, update_num)

        # Off-shelf ids are only logged; the status update is commented out.
        if offShelf_numids_set:
            #db.execute("update item set status=2 where num_id in (%s)", ', '.join(offShelf_numids_set))
            # NOTE(review): this call is never closed in the listing.
            logger.info("shop %s off shelf item num=%s", shop_id,

        if common_numids_set:
            #validate price pic_url
            common_item_list = doCrawl(shop_id, common_numids_set)
            if common_item_list:
                for dict_item in common_item_list:
                    num_id = str(dict_item['num_iid'])
                    title = dict_item['title']
                    price = float(dict_item['price'])
                    pic_url = str(dict_item['pic_url'])
                    # NOTE(review): the `db.execute(` wrapper appears to be
                    # missing here as well.
                    db_item = list(
                            "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s"
                            % (shop_id, num_id)))
                    if db_item:
                        # NOTE(review): the refresh below runs only for
                        # BLACKLIST rows; `!=` (skip blacklisted items) may
                        # have been intended -- confirm.
                        if db_item[0][6] == ITEM_STATUS_BLACKLIST:
                            id = int(db_item[0][0])  # shadows the builtin id()

                            # Copy over the crawled values that differ from the
                            # stored row (columns: 1=title, 2=pic_url, 4=price).
                            tb_item = TaobaoItem(shop_id, id, num_id)
                            if dict_item.has_key('volume'):
                                tb_item.volume = int(dict_item['volume'])
                            if price != db_item[0][4]:
                                tb_item.price = price
                            if title != db_item[0][1]:
                                tb_item.title = title
                            if pic_url != db_item[0][2]:
                                tb_item.setPicUrl(pic_url, db_item[0][2])

                        # TODO db update

            # NOTE(review): this call is never closed in the listing.
            logger.info("shop %s common item num=%s ", shop_id,
Пример #4
def crawl_shop(sql):
    # Run `sql` to obtain shop rows and process each shop in turn.
    #
    # For every shop the visible code checks for a default campaign, loads the
    # shop's num ids from tb_shop_item, splits them with filterNumIds() into
    # new / off-shelf / common sets, and builds TaobaoItem objects from the
    # doCrawl() results. All writes are still "TODO db ..." placeholders or
    # commented out, so nothing is persisted in the visible code.
    #
    # Parameters:
    #   sql -- statement passed to db.execute(); each row is indexed as
    #          [0]=id, [1]=type, [2]=url, [3]=term limits
    #
    # NOTE(review): this listing has lost lines (a `for` and an `if` header
    # have no body, and some else branches seem to be missing), so it does not
    # compile as shown. The comments describe only what is visible.
    # NOTE(review): dict.has_key() is used below, so this targets Python 2.
    db = get_db_engine()
    shops = db.execute(sql)

    # debug
    # NOTE(review): only the import is visible; no pdb call follows it here.
    if FLAGS.debug_parser:
        import pdb

    # Shared by every shop processed in the loop below.
    tb_category = TaobaoCategory()
    term_factory = TermFactory()

    for shop in shops:
        shop_id = shop[0]
        shop_type = shop[1]  # read but not used anywhere in the visible code
        shop_url = shop[2]
        shop_termLimits = shop[3]

        # Look up the shop's active, non-deleted campaign. shop_id is
        # formatted straight into the SQL text rather than passed as a
        # parameter.
        defaultCampaign = list(db.execute(
            "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0" % shop_id))
        if not defaultCampaign:
            # NOTE(review): only logs; no `continue` is visible, so the shop
            # is still processed.
            logger.error("can not get the default campaign for shop: %s", shop_id)

        #1. update shop crawl_status=2
        #3. update shop crawl_status=0
        #db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)

        # All item num ids of the shop.
        allTbNumIds = db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id)
        tb_numids = []
        # NOTE(review): the loop body is missing, so tb_numids is never filled
        # in the visible code.
        for ids in allTbNumIds:
        tb_numids_set = set(tb_numids)

        # Filter: split the ids into new, off-shelf and already-known (common).
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)

        # NOTE(review): the new-item crawl runs only when the set is EMPTY;
        # crawl_one_shop tests `> 0` for the same step, so `== 0` looks
        # inverted -- confirm.
        if len(new_numids_set) == 0:
            logger.info("crawling shop: %s %s", shop_id, shop_url)

            new_item_list = doCrawl(shop_id, new_numids_set)
            if new_item_list:
                new_num = 0
                new_numiid = []  # never appended to in the visible code
                update_num = 0
                update_numiid = []  # never appended to in the visible code
                for dict_item in new_item_list:
                    # Fields taken from the crawled item dict.
                    num_id = str(dict_item['num_iid'])
                    n_cid = dict_item['cid']
                    tb_title = dict_item['title']
                    tb_detail_url = str(dict_item['detail_url'])
                    tb_price = float(dict_item['price'])
                    tb_pic_url = str(dict_item['pic_url'])

                    volume = 0
                    if dict_item.has_key('volume'):
                        volume = dict_item['volume']

                    # Is this num id already stored for the shop? Both values
                    # are formatted into the SQL text; num_id is unquoted.
                    db_item = list(db.execute(
                        "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s" % (
                        shop_id, str(num_id))))
                    if db_item:
                        # Unpack the first matching row by column position.
                        db_status = db_item[0][6]
                        db_manual_set = db_item[0][5]
                        db_price = db_item[0][4]
                        db_local_pic_url = db_item[0][3]
                        db_pic_url = db_item[0][2]
                        db_title = db_item[0][1]
                        db_item_id = db_item[0][0]

                        # NOTE(review): the body of the blacklist check is
                        # missing from the listing.
                        if db_status == ITEM_STATUS_BLACKLIST:
                        if db_manual_set == 1:
                            # Picture and title were set manually.
                            if not imgExists(shop_id, db_local_pic_url):
                                # Old picture is missing: download it again.
                                tb_item = TaobaoItem(shop_id, db_item_id, num_id)
                                tb_item.setPicUrl(tb_pic_url, db_pic_url)
                                if tb_price != db_price:
                                    tb_item.price = tb_price

                                # TODO db update

                            # NOTE(review): this rebuilds tb_item regardless of
                            # the branch above (an `else:` seems to be missing)
                            # and passes 7 arguments, with no detail URL,
                            # unlike the 8-argument call further down.
                            tb_item = TaobaoItem(shop_id, db_item_id, num_id, dict_item['title'],
                                                 dict_item['price'], dict_item['pic_url'], volume)
                            tb_item.setPicUrl(dict_item['pic_url'], db_pic_url)


                            # TODO db update
                        update_num += 1
                        # NOTE(review): a fresh item with id 0 is built right
                        # after the existing-row handling; this reads like the
                        # else branch for unknown num ids -- confirm.
                        tb_item = TaobaoItem(shop_id, 0, num_id, tb_title, tb_detail_url,
                                             tb_price, dict_item['pic_url'], volume)
                        tb_item.category = tb_category.getCategoryPath(n_cid)
                        tb_item.termIds = tb_item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                        tb_item.setPicUrl(dict_item['pic_url'], "")


                        # TODO db add

                        new_num += 1

                logger.info("shop %s new item num=%s,update item num=%s", shop_id, new_num, update_num)

        # Off-shelf ids are only logged; the status update is commented out.
        if offShelf_numids_set:
            #db.execute("update item set status=2 where num_id in (%s)", ', '.join(offShelf_numids_set))
            logger.info("shop %s off shelf item num=%s", shop_id, len(offShelf_numids_set))

        if common_numids_set:
            #validate price pic_url
            common_item_list = doCrawl(shop_id, common_numids_set)
            if common_item_list:
                for dict_item in common_item_list:
                    num_id = str(dict_item['num_iid'])
                    title = dict_item['title']
                    price = float(dict_item['price'])
                    pic_url = str(dict_item['pic_url'])
                    # Fetch the stored row for this num id.
                    db_item = list(db.execute(
                        "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s" % (
                        shop_id, num_id)))
                    if db_item:
                        # NOTE(review): the refresh below runs only for
                        # BLACKLIST rows; `!=` (skip blacklisted items) may
                        # have been intended -- confirm.
                        if db_item[0][6] == ITEM_STATUS_BLACKLIST:
                            id = int(db_item[0][0])  # shadows the builtin id()

                            # Copy over the crawled values that differ from the
                            # stored row (columns: 1=title, 2=pic_url, 4=price).
                            tb_item = TaobaoItem(shop_id, id, num_id)
                            if dict_item.has_key('volume'):
                                tb_item.volume = int(dict_item['volume'])
                            if price != db_item[0][4]:
                                tb_item.price = price
                            if title != db_item[0][1]:
                                tb_item.title = title
                            if pic_url != db_item[0][2]:
                                tb_item.setPicUrl(pic_url, db_item[0][2])

                        # TODO db update

            logger.info("shop %s common item num=%s ", shop_id, len(common_numids_set))