def update_taobao_volume(db, shop_id, shop_type, total_items):
    # fetch current volumes & price
    logger.info("fetching current volumes %s", shop_id)
    do_query(db, "select num_id, volume, price from item where shop_id=%s" % shop_id)
    results = db.store_result()
    iid_volumes = {}
    for row in results.fetch_row(maxrows=0):
        iid_volumes[row[0]] = row[1]

    # update taobao volume, not tmall
    if FLAGS.updatevolume and shop_type == 1:
        db.autocommit(False)
        db.query("set autocommit=0;")
        # update volume
        logger.info("updating item volume %s", shop_id)
        for item in total_items:
            # skip items we have never seen for this shop
            if item['iid'] not in iid_volumes:
                continue
            new_value = item['sales_amount']
            old_value = iid_volumes.get(item['iid'], 0) or 0
            diff_v = abs(new_value - old_value)
            # update only when changed by more than 10% or by more than 20 absolute
            if new_value > 0 and new_value != old_value and (
                    old_value == 0 or diff_v > 20 or diff_v * 1.0 / old_value > 0.1):
                logger.debug("updating item %s %s %s -> %s", shop_id, item['iid'], old_value, new_value)
                do_query(db,
                         "update item set volume=%s where num_id=%s and shop_id=%s" % (new_value, item['iid'], shop_id))
                Statsd.increment('guang.crawl.volume_update_onlist')
        db.commit()

    # TODO: update tmall total volumes
    if FLAGS.updatevolume and shop_type == 2:
        pass
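The threshold logic above only commits a new volume when the change is significant. A minimal standalone sketch of the same rule (helper name is hypothetical), useful for testing it in isolation:

def should_update_volume(old_value, new_value):
    # mirrors the condition in update_taobao_volume: write only when the new
    # value is positive, actually changed, and the change is either from zero,
    # more than 20 absolute, or more than 10% of the old value
    if new_value <= 0 or new_value == old_value:
        return False
    diff_v = abs(new_value - old_value)
    return old_value == 0 or diff_v > 20 or diff_v * 1.0 / old_value > 0.1

assert should_update_volume(0, 5)          # first recorded sales
assert should_update_volume(100, 130)      # +30 > 20 absolute
assert not should_update_volume(100, 105)  # +5% and <= 20: skipped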
Example #2
def convert_taobaoke_widget(items, fn_join_iids=lambda x:','.join(x), batch_size=40, calllimit=60, expire=600, outer_code='jcn', appkey=TAOBAOKE_APPKEY, appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time()*1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent' : DEFAULT_UA, 'Referer' : "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0, chunks(items, batch_size)): # at most calllimit calls per 60s window
        params = {'app_key' : appkey,
                  '_t_sys' : 'args=4',
                  'method' : 'taobao.taobaoke.widget.items.convert',
                  'sign' : sign,
                  'timestamp' : ts,
                  'fields' : "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
                  'callback' : 'TOP.io.jsonpCbs.t%s' % md5( str(random.random()) ).hexdigest()[:13],
                  'partner_id' : 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
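A minimal sketch of consuming this generator, assuming extract_json_from_jsonp (used later on this page) to unwrap the JSONP payload; the item ids are made up:

item_iids = ['12345678901', '12345678902']
for chunk, jsonp in convert_taobaoke_widget(item_iids):
    if not jsonp:
        continue  # download failed; the statsd counter was already bumped
    payload = extract_json_from_jsonp(jsonp)  # strip the TOP.io.jsonpCbs wrapper
    if payload:
        print chunk, payload.get('total_results')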
Example #3
def crawl_one_shop(shop, failed):
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5]

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))

        if is_commit:
            batch_size=100
            total_items = tb.total_items

            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        #double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            do_query(db, "update shop set status=2 where id=%s" % shop_id)
            db.commit()
            db.close()
    except:
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(), extra={'tags':['crawlShopException',]})
        failed.append({'shopid':shop_id, 'err':traceback.format_exc()})
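crawl_one_shop only touches a few positions of the wrapped row, so the expected shape of `shop` can be inferred from the indexing above. A hypothetical example (field order and values are illustrative, not the original schema):

shop = {
    'is_commit': True,
    # inferred layout: [0]=id, [1]=list url, [4]=type (1=taobao, 2=tmall), [5]=nick
    'shop': (1234, 'http://shop1234.taobao.com/search.htm', None, None, 1, u'example-nick'),
}
failed = []
crawl_one_shop(shop, failed)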
Example #4
def post_one(db, user, select_sql, update_succ_sql, update_fail_sql, table_prefix):
    now = datetime.datetime.now()
    next = datetime.datetime(2020, 1, 1)  # sentinel far in the future; tracks the earliest pending schedule
    total = 0
    succ = 0
    failed = 0
    skip = 0

    logger.debug("querying %s", select_sql)
    results = db.execute(select_sql)
    logger.debug("processing post for user %s, total %s, sql %s", user, results.rowcount, select_sql)
    for item in results:
        if FLAGS.postinterval:
            time.sleep(FLAGS.postinterval)
        total += 1
        logger.debug("processing post %s/%s/%s/%s for user %s", succ, skip, total, results.rowcount, user)
        post_id = item[0]
        filename = item[2]
        post_ts = item[3]
        qqid = item[4]
        cookie_file = item[5]
        sid = item[6]
        #grplevel = item[7]
        content = preprocess_content(item[1], sid, post_ts, table_prefix, post_id)
        # reselect cookie
        cookie_result = list(db.execute("select cookies from wb_qq_account where qqid=%s" % qqid))
        if cookie_result and cookie_result[0][0] != cookie_file:
            cookie_file = cookie_result[0][0]
        if post_ts <= now and cookie_file:
            logger.info("Preparing posting %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
            result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, post_id=post_id)
            if not FLAGS.dryrun:
                if result:
                    succ += 1
                    db.execute(update_succ_sql % post_id)
                    Statsd.increment('guang.qzonepost.succ')
                elif FLAGS.commitfail:
                    failed += 1
                    db.execute(update_fail_sql % post_id)
                    Statsd.increment('guang.qzonepost.fail')
                    log_paperboy("post timeout xks %s post_id %s" % (sid, post_id))
        else:
            if FLAGS.timer:
                logger.info("Preparing posting timer %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
                result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, schedule_ts=int(time.mktime(post_ts.timetuple())), post_id=post_id)
                if not FLAGS.dryrun:
                    if result:
                        succ += 1
                        db.execute(update_succ_sql % post_id)
                        Statsd.increment('guang.qzonepost.succ')
                    else:
                        skip += 1
                        next = min(post_ts, next)
                        Statsd.increment('guang.qzonepost.timerfail')
            else:
                skip += 1
                next = min(post_ts, next)
                logger.debug("Skiping post %s %s, scheduled @ %s", content.encode('utf8'), filename.encode('utf8'), post_ts)
    if total > 0:
        logger.info("Batch result total %s skip %s succ %s failed %s next schedule %s", total, skip, succ, failed, next)
Example #5
def crawl_one_shop(shop, failed):
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5].encode('utf-8')

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count,
                     len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        #double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            """
            db = get_rawdb_conn()
            do_query(db, "update shop set status=2 where id=%s" % shop_id)
            db.commit()
            db.close()
            """
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except:
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s",
                     shop_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlShopException',
                     ]})
        failed.append({'shopid': shop_id, 'err': traceback.format_exc()})
Example #8
def try_execute(top, request, expire=600.0):
    current = time.time()
    interval = 60.0  # wait for 1 min
    ban_retry_count = 0
    http_retry_count = 0
    http_max_retry = 3
    while True:
        try:
            if FLAGS.debug_topapi:
                import pdb
                pdb.set_trace()
            result = top.execute(request)
            Statsd.increment('guang.taobaoapi.%s.succ' %
                             request.method_name.replace('.', '_'))
            logger.debug("calling %s(%s) --> %s", request.method_name,
                         request.api_params, result)
            return result
        except requests.exceptions.ConnectionError, e1:
            logger.warn("Call api http failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.conn_err' %
                             request.method_name.replace('.', '_'))
            http_retry_count += 1
            if http_retry_count > http_max_retry:
                return None
            else:
                time.sleep(interval)
        except TOPException, e:
            logger.warn("Call api top failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.api_err' %
                             request.method_name.replace('.', '_'))
            if e.code in [4, 5, 6, 7,
                          8]:  #  This ban will last for 71 more seconds
                m = R_ERROR_MSG.match(e.message)  # e.args[0]
                if m:
                    try:
                        interval = int(m.group(1)) + 10.0
                    except:
                        interval = 60.0
                if ban_retry_count > 0:
                    interval += 60.0 * ban_retry_count
                ban_retry_count += 1
                logger.info("Waiting and try after %s", interval)
                time.sleep(interval)
                if time.time() - current > expire:
                    logger.error("call %s timeout %s" %
                                 (request.method_name, time.time() - current))
                    return None
            elif e.code == 560:  # no matching user found (code=560)
                return {'error': 560}
            else:
                return None
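The ban branch above waits for the server-reported interval plus 10 seconds, then adds another 60 seconds per retry already made. A small sketch of that schedule (helper name is hypothetical):

def ban_intervals(reported_seconds, retries):
    # mirrors try_execute: first wait is reported + 10s, then +60s per prior retry
    base = reported_seconds + 10.0
    return [base + 60.0 * n for n in range(retries)]

print ban_intervals(71, 4)  # [81.0, 141.0, 201.0, 261.0]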
Example #10
def crawl_item2(kwargs):
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
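    # crawl_result layout (filled in on the success path below):
    # (item_id, (html_len, promote_len, desc_len, thumb_count, buy_button_len, price, comment_count))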
    crawl_result = ((item_id, (0,0,0,0,0,0.0,0)),)

    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])

    db = None
    if is_commit:
        db = get_db_engine()

    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'], kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" % item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()

            if is_commit:
                # check old price and volume
                pv = list(db.execute("select price, volume from item where id=%s", item_id))
                price = pv[0][0]
                volume = pv[0][1]
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False

                if is_price_update:
                    db.execute("insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()" % item_id)
                    if is_volume_update:
                        db.execute("update item set price=%s, volume=%s where id=%s", tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s", tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s", tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")

            if FLAGS.update_main:
                tb.crawl_desc()

                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    db.execute("delete from crawl_html where item_id=%s" % item_id)
                    db.execute("insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)", item_id, tb.descUrl, tb.promoteUrl, tb.data.decode('gb18030').encode('utf8'), tb.descContent.decode('gb18030').encode('utf8'), tb.promoteContent.decode('gb18030').encode('utf8'), 1, "")
                    db.execute("update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")

            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l, len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total", len(tb.comments))

            is_success = True
            crawl_result = ((item_id, (len(tb.data),len(tb.promoteContent),len(tb.descContent),len(tb.thumbImages),len(tb.buyButton),tb.price,len(tb.comments))),)
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s", item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data),0,0,0,0,0.0,0)),)
    except:
        logger.error("crawling %s unknown exception %s", item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
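Comments end up as serialized protobuf strings in a redis list keyed by item id. A minimal sketch of reading them back, assuming the same get_redis helper and a protobuf message class for one comment (class name hypothetical):

rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
key = "guang:rate:%s" % item_id
for raw in rediscli.lrange(key, 0, -1):
    c = Comment()            # hypothetical protobuf message type
    c.ParseFromString(raw)   # counterpart of SerializeToString() above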
Example #11
def update_vip_shop(shop_id, db=None):
    if not db:
        db = get_db_engine()

    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(map(str, FLAGS.vipshopids))

    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)

    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return

    pos = 0
    converted = 0
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_tbk_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None, appkey='21315963', appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug("Converted failed %s null %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted, pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            numid2id = dict([(int(num_id), id) for id, num_id, shop_type, jn_url, re_url in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # convert spm to xtao
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, isql)
                    if not FLAGS.dryrun:
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)

    # retry sql
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (id, 'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' % num_id)
            db.execute(sql)
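The SPMRE rewrite swaps whatever spm tracking code a click url carries for this channel's own, and appends one when none is present. A quick check of both branches on a made-up url:

import re
SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
url = 'http://item.taobao.com/item.htm?id=42&spm=a1z10.1.2.3&mm=xyz'
print SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', url)[0]
# -> http://item.taobao.com/item.htm?id=42&spm=2014.21315963.1.0&mm=xyz
url = 'http://item.taobao.com/item.htm?id=42'
if url.find('spm=') > 0:
    url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', url)[0]
else:
    url += '&spm=2014.21315963.1.0'
print url  # -> http://item.taobao.com/item.htm?id=42&spm=2014.21315963.1.0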
Example #12
def update_shop(shop_id, db):
    if not db:
        db = get_db_engine()

    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid

    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""

    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)

    if not FLAGS.force:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return

    pos = 0
    converted = 0
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_retry_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None):
        if not outputstr:
            logger.debug("Converted failed null %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos, total)
            for row in input:
                if not FLAGS.dryrun:
                    db.execute("insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()" % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            numid2id = dict([(int(num_id), id) for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # convert spm to xtao
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, sql)
                    if not FLAGS.dryrun:
                        db.execute(sql.replace('%', '%%'))
                        db.execute("delete from tbk_item_convert where item_id=%s" % id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
Example #13
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1] 
                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})

                images = []
                pos = 1
                for url in thumbImages:
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    elif tr_new.match(url):
                        ori_url = tr_new.sub(r'\1', url)
                    else:
                        logger.error("crawl item %s %s thumb image url can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})
                        continue  # don't queue a None url for download
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
            host = kwargs['statshost'], port = kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
            Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
    except:
        logger.error("crawl item %s %s unknown exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
    return crawl_result
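The two patterns above strip Taobao's thumbnail suffix (an extra _WxH.jpg appended to the original file name); tr_new also covers png/gif originals. A quick check on made-up urls:

import re
tr = re.compile("(.*)_\d+x\d+\.jpg$")
tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
print tr.sub(r'\1', 'http://img.example.com/bao/T1abc.jpg_160x160.jpg')
# -> http://img.example.com/bao/T1abc.jpg
print tr_new.sub(r'\1', 'http://img.example.com/bao/T1abc.png_160x160.jpg')
# -> http://img.example.com/bao/T1abc.png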
Example #16
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id,
                        num_id,
                        len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id,
                                num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})

                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s",
                         item_id,
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount",
                            crawl_result[0][1]['suc1'] +
                            crawl_result[0][1]['suc'],
                            host=kwargs['statshost'],
                            port=kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id,
                        crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s",
                        item_id,
                        num_id,
                        crawl_result,
                        extra={'tags': [
                            'crawl_failed',
                        ]})
            Statsd.increment('guang.crawl.itemimg.failed',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
    except:
        logger.error("crawl item %s %s unknown exception %s",
                     item_id, num_id, traceback.format_exc(),
                     extra={'tags': [
                         'crawl_exception',
                     ]})
    return crawl_result
Example #18
def download_image(self, url):
    t = time.time()
    data = download(url, headers=self.headers)
    spent = time.time() - t
    Statsd.timing("guang.crawl.image", spent * 1000, host=self.statshost, port=self.statsport)
    return data
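download_image reports wall-clock download time to statsd in milliseconds. The same measurement pattern, as a generic sketch (helper name is hypothetical):

import time

def timed(fn, *args, **kwargs):
    # returns (seconds_spent, result); multiply by 1000 for statsd ms timings
    t = time.time()
    result = fn(*args, **kwargs)
    return time.time() - t, result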