Example #1
def update_taobao_volume(db, shop_id, shop_type, total_items):
    # fetch current volumes & price
    logger.info("fetching current volumes %s", shop_id)
    do_query(db, "select num_id, volume, price from item where shop_id=%s" % shop_id)
    results = db.store_result()
    iid_volumes = {}
    for row in results.fetch_row(maxrows=0):
        iid_volumes[row[0]] = row[1]

    # update taobao volume, not tmall
    if FLAGS.updatevolume and shop_type == 1:
        db.autocommit(False)
        db.query("set autocommit=0;")
        # update volume
        logger.info("updating item volume %s", shop_id)
        for item in total_items:
            new_value = item['sales_amount']
            # skip items we have never seen in the item table
            if item['iid'] not in iid_volumes:
                continue
            old_value = iid_volumes.get(item['iid'], 0) or 0
            diff_v = abs(new_value - old_value)
            # update when the volume changed by more than 20 or by more than 10%
            if new_value > 0 and new_value != old_value and (
                    old_value == 0 or diff_v > 20 or diff_v * 1.0 / old_value > 0.1):
                logger.debug("updating item %s %s %s -> %s", shop_id, item['iid'], old_value, new_value)
                do_query(db,
                         "update item set volume=%s where num_id=%s and shop_id=%s" % (new_value, item['iid'], shop_id))
                Statsd.increment('guang.crawl.volume_update_onlist')
        db.commit()

    # TODO: update tmall total volumes
    if FLAGS.updatevolume and shop_type == 2:
        pass
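
A minimal usage sketch for the function above. The connection helper and the shape of `total_items` are assumptions inferred from how the function reads them; `FLAGS.updatevolume` must be set for any update to happen.

# Hypothetical wiring: get_rawdb_conn() comes from the surrounding project and
# each crawled item dict only needs the 'iid' and 'sales_amount' keys used above.
db = get_rawdb_conn()
total_items = [{'iid': 123456789, 'sales_amount': 42}]
update_taobao_volume(db, 1001, 1, total_items)  # shop_type 1 = taobao, not tmall
db.close()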
Example #2
def convert_taobaoke_widget(items, fn_join_iids=lambda x:','.join(x), batch_size=40, calllimit=60, expire=600, outer_code='jcn', appkey=TAOBAOKE_APPKEY, appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time()*1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent' : DEFAULT_UA, 'Referer' : "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0, chunks(items, batch_size)):  # throttle: at most calllimit chunks per 60s window
        params = {'app_key' : appkey,
                  '_t_sys' : 'args=4',
                  'method' : 'taobao.taobaoke.widget.items.convert',
                  'sign' : sign,
                  'timestamp' : ts,
                  'fields' : "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
                  'callback' : 'TOP.io.jsonpCbs.t%s' % md5( str(random.random()) ).hexdigest()[:13],
                  'partner_id' : 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
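
An illustrative caller for the generator above. Only the `(chunk, results)` yield contract comes from the code; the iid strings and the downstream handling are assumptions.

# Hypothetical input: a list of num_iid strings, joined per chunk by fn_join_iids.
num_iids = ['12345678', '23456789', '34567890']
for chunk, results in convert_taobaoke_widget(num_iids, batch_size=2):
    if not results:
        continue  # download failed for this chunk; already counted in statsd above
    handle_jsonp_response(chunk, results)  # hypothetical parser of the raw JSONP body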
Example #3
def crawl_one_shop(shop, failed):
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5]

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))

        if is_commit:
            batch_size=100
            total_items = tb.total_items

            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        #double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            do_query(db, "update shop set status=2 where id=%s" % shop_id)
            db.commit()
            db.close()
    except:
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, traceback.format_exc(), extra={'tags':['crawlShopException',]})
        failed.append({'shopid':shop_id, 'err':traceback.format_exc()})
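
A usage sketch; the tuple layout (id, url, _, _, type, nick) is inferred from the indexes read at the top of the function, and the concrete values are placeholders.

failed = []
shop = {'is_commit': True,
        'shop': (1001, 'http://shop-example.taobao.com/', None, None, 1, u'somenick')}
crawl_one_shop(shop, failed)
if failed:
    logger.warn("failed shops: %s", failed)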
Example #4
def post_one(db, user, select_sql, update_succ_sql, update_fail_sql, table_prefix):
    now = datetime.datetime.now()
    next = datetime.datetime(2020, 1, 1)
    total = 0
    succ = 0
    failed = 0
    skip = 0

    logger.debug("querying %s", select_sql)
    results = db.execute(select_sql)
    logger.debug("processing post for user %s, total %s, sql %s", user, results.rowcount, select_sql)
    for item in results:
        if FLAGS.postinterval:
            time.sleep(FLAGS.postinterval)
        total += 1
        logger.debug("processing post %s/%s/%s/%s for user %s", succ, skip, total, results.rowcount, user)
        post_id = item[0]
        filename = item[2]
        post_ts = item[3]
        qqid = item[4]
        cookie_file = item[5]
        sid = item[6]
        #grplevel = item[7]
        content = preprocess_content(item[1], sid, post_ts, table_prefix, post_id)
        # reselect cookie
        cookie_result = list(db.execute("select cookies from wb_qq_account where qqid=%s" % qqid))
        if cookie_result and cookie_result[0][0] != cookie_file:
            cookie_file = cookie_result[0][0]
        if post_ts <= now and cookie_file:
            logger.info("Preparing posting %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
            result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, post_id=post_id)
            if not FLAGS.dryrun:
                if result:
                    succ += 1
                    db.execute(update_succ_sql % post_id)
                    Statsd.increment('guang.qzonepost.succ')
                elif FLAGS.commitfail:
                    failed += 1
                    db.execute(update_fail_sql % post_id)
                    Statsd.increment('guang.qzonepost.fail')
                    log_paperboy("post timeout xks %s post_id %s" % (sid, post_id))
        else:
            if FLAGS.timer:
                logger.info("Preparing posting timer %s/%s %s qq %s sid %s %s %s @ %s", total, results.rowcount, post_id, qqid, sid, content.encode('utf8'), filename.encode('utf8'), post_ts)
                result = post_shuoshuo_string(cookie_file, filename, content, sid=sid, schedule_ts=int(time.mktime(post_ts.timetuple())), post_id=post_id)
                if not FLAGS.dryrun:
                    if result:
                        succ += 1
                        db.execute(update_succ_sql % post_id)
                        Statsd.increment('guang.qzonepost.succ')
                    else:
                        skip += 1
                        next = min(post_ts, next)
                        Statsd.increment('guang.qzonepost.timerfail')
            else:
                skip += 1
                next = min(post_ts, next)
                logger.debug("Skiping post %s %s, scheduled @ %s", content.encode('utf8'), filename.encode('utf8'), post_ts)
    if total > 0:
        logger.info("Batch result total %s skip %s succ %s failed %s next schedule %s", total, skip, succ, failed, next)
Example #5
def crawl_one_shop(shop, failed):
    try:
        is_commit = shop['is_commit']
        shop_id = shop['shop'][0]
        shop_url = shop['shop'][1]
        shop_type = shop['shop'][4]
        shop_nick = shop['shop'][5].encode('utf-8')

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count,
                     len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            update_shop_items(batch_size, db, shop_id, total_items)
            update_taobao_volume(db, shop_id, shop_type, total_items)
            db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        #double check shop status by taobao api
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            """
            db = get_rawdb_conn()
            do_query(db, "update shop set status=2 where id=%s" % shop_id)
            db.commit()
            db.close()
            """
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except:
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s",
                     shop_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlShopException',
                     ]})
        failed.append({'shopid': shop_id, 'err': traceback.format_exc()})
Example #6
def convert_taobaoke_widget(items,
                            fn_join_iids=lambda x: ','.join(x),
                            batch_size=40,
                            calllimit=60,
                            expire=600,
                            outer_code='jcn',
                            appkey=TAOBAOKE_APPKEY,
                            appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time() * 1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent': DEFAULT_UA, 'Referer': "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0,
                           chunks(items, batch_size)):  # throttle: at most calllimit chunks per 60s window
        params = {
            'app_key': appkey,
            '_t_sys': 'args=4',
            'method': 'taobao.taobaoke.widget.items.convert',
            'sign': sign,
            'timestamp': ts,
            'fields': "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
            'callback': 'TOP.io.jsonpCbs.t%s' % md5(str(random.random())).hexdigest()[:13],
            'partner_id': 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(
            params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
Example #7
def update_taobao_volume(db, shop_id, shop_type, total_items):
    # fetch current volumes & price
    logger.info("fetching current volumes %s", shop_id)
    do_query(
        db,
        "select num_id, volume, price from item where shop_id=%s" % shop_id)
    results = db.store_result()
    iid_volumes = {}
    for row in results.fetch_row(maxrows=0):
        iid_volumes[row[0]] = row[1]

    # update taobao volume, not tmall
    if FLAGS.updatevolume and shop_type == 1:
        db.autocommit(False)
        db.query("set autocommit=0;")
        # update volume
        logger.info("updating item volume %s", shop_id)
        for item in total_items:
            new_value = item['sales_amount']
            # skip items we have never seen in the item table
            if item['iid'] not in iid_volumes:
                continue
            old_value = iid_volumes.get(item['iid'], 0) or 0
            diff_v = abs(new_value - old_value)
            # update when the volume changed by more than 20 or by more than 10%
            if new_value > 0 and new_value != old_value and (
                    old_value == 0 or diff_v > 20
                    or diff_v * 1.0 / old_value > 0.1):
                logger.debug("updating item %s %s %s -> %s", shop_id,
                             item['iid'], old_value, new_value)
                do_query(
                    db,
                    "update item set volume=%s where num_id=%s and shop_id=%s"
                    % (new_value, item['iid'], shop_id))
                Statsd.increment('guang.crawl.volume_update_onlist')
        db.commit()

    # TODO: update tmall total volumes
    if FLAGS.updatevolume and shop_type == 2:
        pass
Example #8
def try_execute(top, request, expire=600.0):
    current = time.time()
    interval = 60.0  # wait for 1 min
    ban_retry_count = 0
    http_retry_count = 0
    http_max_retry = 3
    while True:
        try:
            if FLAGS.debug_topapi:
                import pdb
                pdb.set_trace()
            result = top.execute(request)
            Statsd.increment('guang.taobaoapi.%s.succ' %
                             request.method_name.replace('.', '_'))
            logger.debug("calling %s(%s) --> %s", request.method_name,
                         request.api_params, result)
            return result
        except requests.exceptions.ConnectionError, e1:
            logger.warn("Call api http failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.conn_err' %
                             request.method_name.replace('.', '_'))
            http_retry_count += 1
            if http_retry_count > http_max_retry:
                return None
            else:
                time.sleep(interval)
        except TOPException, e:
            logger.warn("Call api top failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.api_err' %
                             request.method_name.replace('.', '_'))
            if e.code in [4, 5, 6, 7,
                          8]:  #  This ban will last for 71 more seconds
                m = R_ERROR_MSG.match(e.message)  # e.args[0]
                if m:
                    try:
                        interval = int(m.group(1)) + 10.0
                    except:
                        interval = 60.0
                if ban_retry_count > 0:
                    interval += 60.0 * ban_retry_count
                ban_retry_count += 1
                logger.info("Waiting and try after %s", interval)
                time.sleep(interval)
                if time.time() - current > expire:
                    logger.error("call %s timeout %s" %
                                 (request.method_name, time.time() - current))
                    return None
            elif e.code == 560:  # user info not found (code=560)
                return {'error': 560}
            else:
                return None
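
A hedged usage sketch. The TOP client and request object are assumptions based only on what the function touches (top.execute(), request.method_name, request.api_params); the return contract (result object, None on give-up, or {'error': 560}) comes from the code above.

req = ShopGetRequest()   # hypothetical taobao TOP SDK request object
req.fields = 'sid,title,nick'
req.nick = shop_nick     # assumed to be set by the caller
result = try_execute(top, req, expire=300.0)  # `top` is an authenticated TOP client from the project
if result is None:
    logger.warn("call %s gave up after retries", req.method_name)
elif isinstance(result, dict) and result.get('error') == 560:
    pass  # user info not found on taobao; caller decides how to handle it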
Example #9
def try_execute(top, request, expire=600.0):
    current = time.time()
    interval = 60.0 # wait for 1 min
    ban_retry_count = 0
    http_retry_count = 0
    http_max_retry = 3
    while True:
        try:
            if FLAGS.debug_topapi:
                import pdb; pdb.set_trace()
            result = top.execute(request)
            Statsd.increment('guang.taobaoapi.%s.succ' % request.method_name.replace('.', '_'))
            logger.debug("calling %s(%s) --> %s", request.method_name, request.api_params, result)
            return result
        except requests.exceptions.ConnectionError, e1:
            logger.warn("Call api http failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.conn_err' % request.method_name.replace('.', '_'))
            http_retry_count += 1
            if http_retry_count > http_max_retry:
                return None
            else:
                time.sleep(interval)
        except TOPException, e:
            logger.warn("Call api top failed %s", traceback.format_exc())
            Statsd.increment('guang.taobaoapi.%s.api_err' % request.method_name.replace('.', '_'))
            if e.code in [4, 5, 6, 7, 8]: #  This ban will last for 71 more seconds
                m = R_ERROR_MSG.match(e.message) # e.args[0]
                if m:
                    try:
                        interval = int(m.group(1)) + 10.0
                    except:
                        interval = 60.0
                if ban_retry_count > 0:
                    interval += 60.0*ban_retry_count
                ban_retry_count += 1
                logger.info("Waiting and try after %s", interval)
                time.sleep(interval)
                if time.time() - current > expire:
                    logger.error("call %s timeout %s" % (request.method_name, time.time()-current))
                    return None
            elif e.code == 560: # user info not found (code=560)
                return {'error':560}
            else:
                return None
Example #10
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1] 
                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})

                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    elif tr_new.match(url):
                        ori_url = tr_new.sub(r'\1', url)
                    else:
                        logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})

                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
            host = kwargs['statshost'], port = kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
            Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
Example #11
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s: no thumb images found, html size %s",
                        item_id, num_id, len(html),
                        extra={'tags': ['crawl_thumb_empty']})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id,
                                num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})

                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s",
                         item_id,
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount",
                            crawl_result[0][1]['suc1'] +
                            crawl_result[0][1]['suc'],
                            host=kwargs['statshost'],
                            port=kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id,
                        crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s",
                        item_id,
                        num_id,
                        crawl_result,
                        extra={'tags': [
                            'crawl_failed',
                        ]})
            Statsd.increment('guang.crawl.itemimg.failed',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
Example #12
def crawl_item2(kwargs):
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    crawl_result = ((item_id, (0, 0, 0, 0, 0, 0.0, 0)), )

    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])

    db = None
    if is_commit:
        db = get_db_engine()

    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'],
                    kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" %
                       item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()

            if is_commit:
                # check old price and volume
                pv = list(
                    db.execute("select price, volume from item where id=%s",
                               item_id))
                price = pv[0][0]
                volume = pv[0][1]
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False

                if is_price_update:
                    db.execute(
                        "insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()"
                        % item_id)
                    if is_volume_update:
                        db.execute(
                            "update item set price=%s, volume=%s where id=%s",
                            tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s",
                                   tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s",
                               tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")

            if FLAGS.update_main:
                tb.crawl_desc()

                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    db.execute("delete from crawl_html where item_id=%s" %
                               item_id)
                    db.execute(
                        "insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)",
                        item_id, tb.descUrl, tb.promoteUrl,
                        tb.data.decode('gb18030').encode('utf8'),
                        tb.descContent.decode('gb18030').encode('utf8'),
                        tb.promoteContent.decode('gb18030').encode('utf8'), 1,
                        "")
                    db.execute("update item set crawl_status=1 where id=%s" %
                               item_id)
                    Statsd.increment("taobao.crawl.html_update")

            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l,
                            len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total",
                                    len(tb.comments))

            is_success = True
            crawl_result = ((item_id, (len(tb.data), len(tb.promoteContent),
                                       len(tb.descContent),
                                       len(tb.thumbImages), len(tb.buyButton),
                                       tb.price, len(tb.comments))), )
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s",
                        item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data), 0, 0, 0, 0, 0.0, 0)), )
    except:
        logger.error("crawling %s unknown exception %s",
                     item_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlItemException',
                     ]})
    logger.info("crawling %s result %s - %s", item_id, is_success,
                crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
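
A minimal driver sketch; the kwargs keys mirror exactly what crawl_item2 reads above, and the concrete values are placeholders.

kwargs = {
    'item': (5001, None, '12345678901'),  # only indexes 0 (item id) and 2 (num id) are read
    'is_commit': False,                   # skip all database writes in this sketch
    'max_comments': 20,
    'i': 1,
    'total': 1,
}
crawl_result = crawl_item2(kwargs)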
Example #13
def post_one(db, user, select_sql, update_succ_sql, update_fail_sql,
             table_prefix):
    now = datetime.datetime.now()
    next = datetime.datetime(2020, 1, 1)
    total = 0
    succ = 0
    failed = 0
    skip = 0

    logger.debug("querying %s", select_sql)
    results = db.execute(select_sql)
    logger.debug("processing post for user %s, total %s, sql %s", user,
                 results.rowcount, select_sql)
    for item in results:
        if FLAGS.postinterval:
            time.sleep(FLAGS.postinterval)
        total += 1
        logger.debug("processing post %s/%s/%s/%s for user %s", succ, skip,
                     total, results.rowcount, user)
        post_id = item[0]
        filename = item[2]
        post_ts = item[3]
        qqid = item[4]
        cookie_file = item[5]
        sid = item[6]
        #grplevel = item[7]
        content = preprocess_content(item[1], sid, post_ts, table_prefix,
                                     post_id)
        # reselect cookie
        cookie_result = list(
            db.execute("select cookies from wb_qq_account where qqid=%s" %
                       qqid))
        if cookie_result and cookie_result[0][0] != cookie_file:
            cookie_file = cookie_result[0][0]
        if post_ts <= now and cookie_file:
            logger.info("Preparing posting %s/%s %s qq %s sid %s %s %s @ %s",
                        total, results.rowcount, post_id, qqid, sid,
                        content.encode('utf8'), filename.encode('utf8'),
                        post_ts)
            result = post_shuoshuo_string(cookie_file,
                                          filename,
                                          content,
                                          sid=sid,
                                          post_id=post_id)
            if not FLAGS.dryrun:
                if result:
                    succ += 1
                    db.execute(update_succ_sql % post_id)
                    Statsd.increment('guang.qzonepost.succ')
                elif FLAGS.commitfail:
                    failed += 1
                    db.execute(update_fail_sql % post_id)
                    Statsd.increment('guang.qzonepost.fail')
                    log_paperboy("post timeout xks %s post_id %s" %
                                 (sid, post_id))
        else:
            if FLAGS.timer:
                logger.info(
                    "Preparing posting timer %s/%s %s qq %s sid %s %s %s @ %s",
                    total, results.rowcount, post_id, qqid, sid,
                    content.encode('utf8'), filename.encode('utf8'), post_ts)
                result = post_shuoshuo_string(
                    cookie_file,
                    filename,
                    content,
                    sid=sid,
                    schedule_ts=int(time.mktime(post_ts.timetuple())),
                    post_id=post_id)
                if not FLAGS.dryrun:
                    if result:
                        succ += 1
                        db.execute(update_succ_sql % post_id)
                        Statsd.increment('guang.qzonepost.succ')
                    else:
                        skip += 1
                        next = min(post_ts, next)
                        Statsd.increment('guang.qzonepost.timerfail')
            else:
                skip += 1
                next = min(post_ts, next)
                logger.debug("Skiping post %s %s, scheduled @ %s",
                             content.encode('utf8'), filename.encode('utf8'),
                             post_ts)
    if total > 0:
        logger.info(
            "Batch result total %s skip %s succ %s failed %s next schedule %s",
            total, skip, succ, failed, next)
Example #14
def crawl_item2(kwargs):
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    crawl_result = ((item_id, (0,0,0,0,0,0.0,0)),)

    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])

    db = None
    if is_commit:
        db = get_db_engine()

    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'], kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" % item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()

            if is_commit:
                # check old price and volume
                pv = list(db.execute("select price, volume from item where id=%s", item_id))
                price = pv[0][0]
                volume = pv[0][1]
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False

                if is_price_update:
                    db.execute("insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()" % item_id)
                    if is_volume_update:
                        db.execute("update item set price=%s, volume=%s where id=%s", tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s", tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s", tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")

            if FLAGS.update_main:
                tb.crawl_desc()

                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    db.execute("delete from crawl_html where item_id=%s" % item_id)
                    db.execute("insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)", item_id, tb.descUrl, tb.promoteUrl, tb.data.decode('gb18030').encode('utf8'), tb.descContent.decode('gb18030').encode('utf8'), tb.promoteContent.decode('gb18030').encode('utf8'), 1, "")
                    db.execute("update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")

            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l, len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total", len(tb.comments))

            is_success = True
            crawl_result = ((item_id, (len(tb.data),len(tb.promoteContent),len(tb.descContent),len(tb.thumbImages),len(tb.buyButton),tb.price,len(tb.comments))),)
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s", item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data),0,0,0,0,0.0,0)),)
    except:
        logger.error("crawling %s unknown exception %s", item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result