def crawl_all():
    login_params = {'emailaddress': '*****@*****.**',
                    'password': '******',
                    'type': 'undefined',
                    'wbid': '0',
                    'savestat': 'true',
                    # 'checkcode': '',
                    }
    req = urllib2.Request('http://www.meilishuo.com/users/ajax_logon?frm=undefined', urllib.urlencode(login_params), headers)
    handle = urllib2.urlopen(req)
    logger.info("logged result %s", handle.read())

    if FLAGS.itemid:
        crawl_item(FLAGS.itemid)
    else:
        if FLAGS.group:
            start = FLAGS.group*1000000
            end = (FLAGS.group+1)*1000000
        else:
            start = FLAGS.start
            end = FLAGS.end
        db = get_db_engine()
        for item_id in xrange(start, end, 1):
            try:
                results = db.execute("select item_id from crawl_html where item_id=%s" % item_id)
                if results.rowcount > 0:
                    continue
            except:
                db = get_db_engine()
            crawl_item(item_id)
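All of these snippets lean on project-level helpers (get_db_engine, crawl_item, logger, headers, ...) and on a gflags-style FLAGS object that is defined elsewhere and not shown here. Purely as an assumed sketch of that missing piece, the flags read by the example above might be declared with python-gflags along these lines (flag names come from the code; defaults and help strings are guesses):

# Hypothetical flag declarations for illustration only; not part of the original source.
import gflags

FLAGS = gflags.FLAGS
gflags.DEFINE_integer('itemid', 0, 'crawl a single item id instead of a range')
gflags.DEFINE_integer('group', 0, 'crawl the id range [group*1000000, (group+1)*1000000)')
gflags.DEFINE_integer('start', 0, 'first item id to crawl when no group is given')
gflags.DEFINE_integer('end', 0, 'item id to stop at (exclusive) when no group is given')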
Example #2
def crawl_all():
    login_params = {
        'emailaddress': '*****@*****.**',
        'password': '******',
        'type': 'undefined',
        'wbid': '0',
        'savestat': 'true'
        #    'checkcode':'',
    }
    req = urllib2.Request(
        'http://www.meilishuo.com/users/ajax_logon?frm=undefined',
        urllib.urlencode(login_params), headers)
    handle = urllib2.urlopen(req)
    logger.info("logged result %s", handle.read())

    if FLAGS.itemid:
        crawl_item(FLAGS.itemid)
    else:
        if FLAGS.group:
            start = FLAGS.group * 1000000
            end = (FLAGS.group + 1) * 1000000
        else:
            start = FLAGS.start
            end = FLAGS.end
        db = get_db_engine()
        for item_id in xrange(start, end, 1):
            try:
                results = db.execute(
                    "select item_id from crawl_html where item_id=%s" %
                    item_id)
                if results.rowcount > 0:
                    continue
            except:
                db = get_db_engine()
            crawl_item(item_id)
def convert_main():
    db = get_db_engine()
    db_production = get_db_engine(connstr=FLAGS.production_connstr)
    all_nicks = db_production.execute("select nick from shop")
    all_nick_set = set([row[0] for row in all_nicks])
    result = db.execute("select url, name from shop_shop where is_voted=1 and is_cloth=1 and is_delete=0;")
    for row in result:
        if row[0].find("tmall.com") > 0:
            shop_type = 2
        else:
            shop_type = 1
        if row[1] not in all_nick_set:
            db_production.execute("insert into shop(nick, url, type, status) values(%s, %s ,%s, 2)", row[1], row[0], shop_type)
        else:
            print row[1].encode('utf8'), " exists"
Example #4
def crawl_shops(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    if not shops:
        logger.info("not shop crawler.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # global, all shop use
    tb_category = TaobaoCategory(db)
    term_factory = TermFactory(db)
    logger.info("init category %s and term factory %s.",
                len(tb_category.categories_dict), len(term_factory.sub_terms))

    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, tb_category, term_factory, db)
def check_image():
    # The item table is large, so scan it in batches: each key is a LIMIT
    # offset and each value the number of rows to read from that offset.
    db_limit = {
        0: 100000,
        100000: 100000,
        200000: 100000,
        300000: 100000,
        400000: 100000
    }
    n = 0
    for s, e in db_limit.items():
        sql = "select id, num_id, shop_id, pic_url, local_pic_url from item where status=1 limit %s,%s" % (s, e)
        db = get_db_engine()
        items = list(db.execute(sql))
        for item in items:
            item_id = item[0]
            item_iid = str(item[1])
            shop_id = item[2]
            pic_url = str(item[3])
            local_pic_url = str(item[4])
            validate_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/" + str(shop_id) + "/big/" + local_pic_url
            if not os.path.exists(validate_path):
                n += 1
                logger.error("item %s not pic %s", item_id, validate_path)
                try:
                    download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': pic_url,
                                    'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                except:
                    logger.error("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
                    continue
    logger.info("stat item not image number=%s", n)
Example #6
def load_click_items(numid2volumeprice):
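    # Join click_item_log with item and shop over the [FLAGS.start, FLAGS.end] window,
    # clamp abnormal volumes and prices, fill numid2volumeprice keyed by num_id,
    # and collect the taobao_report_id of every paid click.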
    logger.info("Loading click items")
    click_items = []
    paid_items = []
    click_item_type = namedtuple(
        "ClickItemType",
        'click_hash item_id click_time click_ip area_code click_price click_volume item_price item_volume shop_nick taobao_report_id num_id'
    )

    db = get_db_engine()
    where = "click_time>='%s' and click_time<='%s'" % (datestr(
        FLAGS.start), datestr(FLAGS.end))
    if FLAGS.limit > 0:
        where += " limit %s" % FLAGS.limit
    sql = "select outer_code,item_id,click_time,click_ip,click_area,click_price,click_volume,item.price,item.volume,shop.nick,click_item_log.taobao_report_id,item.num_id from click_item_log left join item on click_item_log.item_id=item.id left join shop on shop.id=item.shop_id where %s" % where
    logger.debug("fetching %s", sql)
    results = db.execute(sql)
    progress = 0
    item_matched = 0
    logger.info("Processing click items %s", results.rowcount)
    price_diffs = 0
    for line in results:
        progress += 1
        click_item = click_item_type(*line)
        if not click_item.num_id:
            logger.warn("no numid %s", click_item)
            continue
        click_items.append(click_item)
        if click_item.item_id > 0:
            item_matched += 1
        volume = click_item.item_volume
        if not volume or volume == 0:
            logger.warn("item %s abnormal %s", click_item.item_id, volume)
            volume = 0.2
        elif volume > 800:
            volume = 800

        price = click_item.click_price
        if click_item.item_price and price > click_item.item_price * 1.5:
            price = click_item.item_price
            price_diffs += 1
            logger.warn("Price diff paid? %s %s/%s too much %s - %s",
                        click_item.taobao_report_id, price_diffs,
                        results.rowcount, click_item.click_price,
                        click_item.item_price)
        if price > 500.0:
            price = 500.0
        if not price or price < 0.5:
            logger.warn("price %s abnormal %s", click_item.item_id, price)
            price = 1.0

        numid2volumeprice[long(click_item.num_id)] = {
            'volume': volume,
            'price': price
        }
        if click_item.taobao_report_id:
            paid_items.append(click_item.taobao_report_id)
    logger.info("Total click %s item matched %s", len(click_items),
                item_matched)
    return click_items, paid_items
def load_click_items(numid2volume):
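    # Variant that reads clicks from a JSON dump (FLAGS.click_input), resolves each
    # creative_id to (num_id, shop nick) through the DB with a small cache, records
    # item volumes in numid2volume, and counts click hashes already in taobao_report.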
    logger.info("Loading click items")
    db = get_db_engine()
    json_file = open(FLAGS.click_input)
    click_json = simplejson.load(json_file)
    click_item_type = namedtuple("ClickItemType", 'click_hash source media_id holder_id site admember_id campaign_id adgroup_id creative_id click_time click_ip area_code lpid price pubcat_list user_attr_list score item_price item_volume')
    click_items = []
    creative_matched = 0
    outercode_matched = 0
    progress = 0
    creative2item_cache = {}
    logger.info("Processing click items")
    for line in click_json:
        progress += 1
        click_item = click_item_type(*line)
        click_items.append(click_item)
        if creative2item_cache.has_key(click_item.creative_id):
            rr = creative2item_cache[click_item.creative_id]
        else:
            # creative_id --> (num_id, shop_name) item_price, item_volume
            r = db.execute("select num_id, shop.nick from item,shop where item.shop_id=shop.id and item.uctrac_creative_id=%s" % click_item.creative_id)
            if not r.rowcount:
                logger.warn("creative not matched %s %s/%s", click_item.creative_id, progress, len(click_json))
                continue
            rr = creative2item_cache[click_item.creative_id] = list(r)
        creative_matched += 1
        num_id, seller_nick = rr[0]
        #import pdb; pdb.set_trace()
        numid2volume[long(num_id)] = click_item.item_volume
        click_hash = 'jn%s' % click_item.click_hash
        r2 = db.execute('select 1 from taobao_report where outer_code="%s"' % click_hash)
        if r2.rowcount:
            outercode_matched += 1
    logger.info("Total click %s creative matched %s outercode matched %s", len(click_items), creative_matched, outercode_matched)
    return click_items
Example #8
def crawl_items(sql):
    db = get_db_engine()

    items = db.execute(sql)
    logger.info("crawling image total %s", items.rowcount)
    if not items.rowcount:
        return
    if FLAGS.parallel:
        mapper = SimpleMapReduce(crawl_item2, identity)
        results = mapper(transform_args(items))
        logger.info("crawl finished %s", len(results))
    else:
        for item in items:
            crawl_item2({
                'item': item,
                'is_commit': FLAGS.commit,
                'crawl_path': FLAGS.crawl_path,
                'server_path': FLAGS.path,
                'is_remove': FLAGS.removetmp,
                'org_server_path': FLAGS.org_path,
                'dbuser': FLAGS.dbuser,
                'dbpasswd': FLAGS.dbpasswd,
                'dbhost': FLAGS.dbhost,
                'dbport': FLAGS.dbport,
                'db': FLAGS.db,
                'echosql': FLAGS.echosql,
                'statshost': FLAGS.statshost,
                'statsport': FLAGS.statsport
            })
def process_item(item, total, cur):
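    # Record the big image's width/height when missing, skip items with status 2 or 3
    # unless FLAGS.force is set, download the big image if it is absent, then generate
    # 300x300 / 210x210 / 60x60 thumbnails with ImageMagick.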
    try:
        id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width = item
        big_path = "%s%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid2_path = "%s%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid_path = "%s%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
        sma_path = "%s%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
        if os.path.exists(big_path) and pic_width == 0:
            size = get_image_size(big_path)
            logger.debug("update %s size %s" % (id, size))
            db = get_db_engine()
            db.execute("update item set pic_width=%s,pic_height=%s where id=%s" % (size[0], size[1], id))

        if status in (2,3) and not FLAGS.force:
            return
        if not os.path.exists(big_path):
            headers = {'Referer' : "http://item.taobao.com/item.htm?id=%s" % id, 'User-Agent' : DEFAULT_UA}
            data = crawl_page(num_id, pic_url, headers)
            # save to path
            logger.debug("crawling %s %s %s %s", cur, total, big_path, item)
            save_image(big_path, data)
        if not os.path.exists(mid2_path):
            logger.debug("thumbing %s %s %s %s", cur, total, mid2_path, item)
            imagemagick_resize(300, 300, big_path, mid2_path)
        if not os.path.exists(mid_path):
            logger.debug("thumbing %s %s", mid_path, item)
            imagemagick_resize(210, 210, big_path, mid_path)
        if not os.path.exists(sma_path):
            logger.debug("thumbing %s %s", sma_path, item)
            imagemagick_resize(60, 60, big_path, sma_path)
    except:
        logger.error("unknown error %s, %s", item, traceback.format_exc())
Example #10
def update_item(sql):
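    # Fetch the latest title/pic_url for each item from the Taobao API and, when they
    # differ from the local values (or FLAGS.forcibly is set), re-download the image
    # and update title, pic_url and image size in the item table.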
    t = time.time()
    db = get_db_engine()
    item = db.execute(sql)

    results = get_taobao_items(get_top(), item, fn_join_iids=lambda
            x:','.join([str(i[1]) for i in x]), calllimit=60)

    for batch_item in results:
        for iid, item in batch_item.items.iteritems():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                local_pic_url = item['req'][5]  # keep the file name already stored in the DB, e.g. "18142957186_28924096.jpg"; it is not updated

                if item['resp']:
                    taobao_title = item['resp']['title']
                    taobao_picurl = item['resp']['pic_url']
                    # if item_picurl != taobao_picurl, re-fetch the image, store it in DFS, then update the item
                    #title, pic_url, pic_width, pic_height, modified

                    if FLAGS.forcibly:
                        # force update of both title and pic_url
                        is_title_update = True
                        is_picurl_update = True
                        # local_pic_url = "%s_%s.%s" % (item_iid, str(id(item)), item_picurl.split('.')[-1].split('?')[0].split('/')[-1])
                    else:
                        if cmp(item_title, taobao_title):
                            is_title_update = True
                        else:
                            is_title_update = False

                        if cmp(item_picurl, taobao_picurl):
                            is_picurl_update = True
                        else:
                            is_picurl_update = False

                    if is_title_update:
                        if is_picurl_update:
                            width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                            db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)

                            logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                        else:
                            db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)

                            logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                    elif is_picurl_update:
                        width, height = download_image({'item_id':item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)

                        logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)

            except:
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
Example #11
def crawl_shops(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    if not shops:
        logger.info("not shop crawler.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    # global, all shop use
    tb_category = TaobaoCategory(db)
    term_factory = TermFactory(db)
    logger.info("init category %s and term factory %s.", len(tb_category.categories_dict), len(term_factory.sub_terms))

    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, tb_category, term_factory, db)
Example #12
def mig_main():
    db = get_db_engine()
    result = db.execute("select id,name,status from wb_account;")
    for row in result:
        sql = "update wb_qq_account set qqid=%s where name='%s'" % (QQIDS[row[1]], row[1])
        print sql
        db.execute(sql)
def update_item(sql):
    t = time.time()
    db = get_db_engine()
    item = db.execute(sql)

    results = get_taobao_items(get_top(), item, fn_join_iids=lambda
            x:','.join([str(i[1]) for i in x]), calllimit=60)

    for batch_item in results:
        for iid, item in batch_item.items.iteritems():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                local_pic_url = item['req'][5]  # keep the file name already stored in the DB, e.g. "18142957186_28924096.jpg"; it is not updated

                if item['resp']:
                    taobao_title = item['resp']['title']
                    taobao_picurl = item['resp']['pic_url']
                    # if item_picurl != taobao_picurl, re-fetch the image, store it in DFS, then update the item
                    #title, pic_url, pic_width, pic_height, modified

                    if FLAGS.forcibly:
                        # force update of both title and pic_url
                        is_title_update = True
                        is_picurl_update = True
                        # local_pic_url = "%s_%s.%s" % (item_iid, str(id(item)), item_picurl.split('.')[-1].split('?')[0].split('/')[-1])
                    else:
                        if cmp(item_title, taobao_title):
                            is_title_update = True
                        else:
                            is_title_update = False

                        if cmp(item_picurl, taobao_picurl):
                            is_picurl_update = True
                        else:
                            is_picurl_update = False

                    if is_title_update:
                        if is_picurl_update:
                            width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                            db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)

                            logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                        else:
                            db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)

                            logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                    elif is_picurl_update:
                        width, height = download_image({'item_id':item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})
                        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)

                        logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)

            except:
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
Example #14
def crawl_hotest():
    # Read all item_id values from the item_hotest table on bi-db1 (refreshed roughly every hour),
    # write them into the temporary table temp_item_hotest after truncating the old data,
    # then join item against temp_item_hotest and crawl the comments, at most 20 pages per item.
    bi_db = get_db_engine(dbhost=FLAGS.bihost) 
    itemid_list = list(bi_db.execute("select item_id from item_hotest"))

    db = get_db_engine()
    db.execute("TRUNCATE table temp_item_hotest")
    logger.debug("TRUNCATE table temp_item_hotest")
    db.execute("insert into temp_item_hotest values (%s)", itemid_list)
    logger.debug("insert temp_item_hotest")
    if FLAGS.force:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.id=temp_item_hotest.item_id")
    else:
        return crawl_items("select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.status=1 and item.id=temp_item_hotest.item_id order by item.id desc")
Example #15
def get_data():
    sql = "select shop_id,local_pic_url from item where modified>'2013-12-09 09' order by shop_id desc"
    db = get_db_engine()
    items = list(db.execute(sql))
    for item in items:
        refreshCdnCache(item[0], item[1])
        time.sleep(1)
Example #17
def clicklog_main():
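    # Walk the filtered click logs for every day in [FLAGS.start, FLAGS.end], turn each
    # matching click into a record and either insert it into the DB (FLAGS.commit) or
    # collect it and dump everything to FLAGS.out_file as JSON.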
    click_file_list = []
    for d in eachday(FLAGS.start, FLAGS.end):
        click_file_list.extend(glob("/space/log/filtered/click*/click-" + datestr(d) + "_00???"))
    # TODO: load from conversion db?
    ret = []
    if FLAGS.commit:
        db = get_db_engine()
    for fn in click_file_list:
        logger.debug("processing %s", fn)
        for line in open(fn, "r"):
            click = get_click(line)
            if not click:
                continue
            click_obj, click_ex_obj, score, why = click
            rec   = get_record(click)
            #if rec[0] in written:
            #    continue #already written in db.
            if rec:
                if FLAGS.commit:
                    insert_match(db, rec)
                else:
                    ret.append(rec)
    simplejson.dump(ret, open(FLAGS.out_file, "w"))
    return ret
Example #18
    def GET(self, id):
        db = get_db_engine()
        results = db.execute("select crawl_item_images.url, crawl_item_images.pos, crawl_item_images.type from crawl_html, crawl_item_images where crawl_item_images.item_id=crawl_html.item_id and crawl_html.item_id=%s;" % id)
        item_crawler = ItemCrawler(id, FLAGS.crawl_path)
        item_crawler.crawl(results, ((94,94), (350,350)), False)

        return render.crawlitem(id, item_crawler.results)
Example #19
def db_validate():
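    # For items crawled successfully in the last 7 days, regenerate any missing small4
    # thumbnail by converting the corresponding big image.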
    db = get_db_engine()
    items = list(
        db.execute(
            "select shop_id,local_pic_url from item where status=1 and crawl_status=2 and created between CURDATE()-interval 7 day and CURDATE()"
        ))
    i = 0
    for item in items:
        shop_id = item[0]
        local_pic_url = item[1]
        if shop_id == 5:
            small4_path = "/space/wwwroot/image.guang.j.cn/ROOT/images_1/%s/small4/%s" % (
                shop_id, local_pic_url)
        else:
            small4_path = "/space/wwwroot/image.guang.j.cn/ROOT/images/%s/small4/%s" % (
                shop_id, local_pic_url)
        if not os.path.isfile(small4_path):
            i += 1
            try:
                big_path = small4_path.replace("/small4/", "/big/")
                image = Image.open(big_path)
                width, height = image.size
                convert_img(big_path, small4_path, width, height)

                logger.info("%s:%s", i, small4_path)
            except IOError, e:
                logger.error("Open image failed %s:%s %s", i, big_path,
                             e.message)
                continue
def crawl_main():
    hosts = set()
    hosts_in_db = set()
    hosts_attr = {}

    db = get_db_engine()
    result = db.execute("select url from shop")

    for row in result:
        hosts_in_db.add(str(urlparse.urlparse(row[0]).netloc))

    #print hosts_in_db
    for line in open(FLAGS.path):
        url = line.split()[0]
        host = str(urlparse.urlparse(url).netloc)
        hosts.add(host)
        if url.find('tmall.com') > 0:
            shopname = " ".join(line.split()[1:])
        else:
            shopname = " ".join(line.split()[1:-1])
        hosts_attr[host] = shopname

    hosts_not_in_db = hosts - hosts_in_db
    print "hosts %s indb %s notindb %s" % (len(hosts), len(hosts_in_db), len(hosts_not_in_db))
    for host in hosts_not_in_db:
        print "http://%s/ %s" % (host, hosts_attr[host])
Example #21
def img_update():
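    # Find items whose pic_url still points at a q90-compressed copy, strip the q90
    # suffix, re-download the original image and update pic_url and image size.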
    sql = "select id,num_id,shop_id,pic_url,local_pic_url from item where pic_url like '%%q90.%%'"
    db = get_db_engine()
    items = db.execute(sql)
    tr = re.compile("(.+\.(jpg|png))[^$]*.jpg$")
    for item in items:
        taobao_picurl = item[3]
        taobao_picurl = tr.sub(r'\1', taobao_picurl)
        try:
            width, height = download_image({
                'item_id': item[0],
                'num_id': item[1],
                'shop_id': item[2],
                'pic_url': taobao_picurl,
                'image_name': item[4],
                'crawl_path': FLAGS.crawl_path
            })
            db.execute(
                "update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                taobao_picurl, width, height, item[0])
            logger.info("item %s update image ok", item[0])
        except:
            logger.error("download %s:%s failed reason %s", item[0],
                         taobao_picurl, traceback.format_exc())
            continue
Example #22
def crawl_main():
    hosts = set()
    hosts_in_db = set()
    hosts_attr = {}

    db = get_db_engine()
    result = db.execute("select url from shop")

    for row in result:
        hosts_in_db.add(str(urlparse.urlparse(row[0]).netloc))

    #print hosts_in_db
    for line in open(FLAGS.path):
        url = line.split()[0]
        host = str(urlparse.urlparse(url).netloc)
        hosts.add(host)
        if url.find('tmall.com') > 0:
            shopname = " ".join(line.split()[1:])
        else:
            shopname = " ".join(line.split()[1:-1])
        hosts_attr[host] = shopname

    hosts_not_in_db = hosts - hosts_in_db
    print "hosts %s indb %s notindb %s" % (len(hosts), len(hosts_in_db),
                                           len(hosts_not_in_db))
    for host in hosts_not_in_db:
        print "http://%s/ %s" % (host, hosts_attr[host])
Example #23
def get_xks_tagmatch(xks):
    tagmatch = ''
    if xks:
        db = get_db_engine()
        rows = db.execute("SELECT tag_match FROM recommend_subscriber WHERE id = %s" % xks)
        if rows.rowcount > 0:
            tagmatch = convert_tagmatch(list(rows)[0][0])
    return tagmatch
Example #24
def crawler(sql):
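    # For each (shop, item) row, fetch the item page and extract the shop's current
    # promotion: Taobao shops via the #promote discount page, Tmall shops via the
    # initApi JSON; the result is upserted into shop_discount.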
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')

                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime'])/1000
                        et = int(shop_prom['endTime'])/1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                            shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
Example #25
def do_all(fn):
    db = get_db_engine()

    where_sql = " %s" % (FLAGS.where)
    results = db.execute("select id from shop where type < 3 and %s" % where_sql)

    for result in results:
        fn(result[0], db)
        time.sleep(1.0)
Example #26
    def GET(self, id):
        db = get_db_engine()
        results = db.execute(
            "select crawl_item_images.url, crawl_item_images.pos, crawl_item_images.type from crawl_html, crawl_item_images where crawl_item_images.item_id=crawl_html.item_id and crawl_html.item_id=%s;"
            % id)
        item_crawler = ItemCrawler(id, FLAGS.crawl_path)
        item_crawler.crawl(results, ((94, 94), (350, 350)), False)

        return render.crawlitem(id, item_crawler.results)
Example #27
def convert_main():
    db = get_db_engine()
    db_production = get_db_engine(connstr=FLAGS.production_connstr)
    all_nicks = db_production.execute("select nick from shop")
    all_nick_set = set([row[0] for row in all_nicks])
    result = db.execute(
        "select url, name from shop_shop where is_voted=1 and is_cloth=1 and is_delete=0;"
    )
    for row in result:
        if row[0].find("tmall.com") > 0:
            shop_type = 2
        else:
            shop_type = 1
        if row[1] not in all_nick_set:
            db_production.execute(
                "insert into shop(nick, url, type, status) values(%s, %s ,%s, 2)",
                row[1], row[0], shop_type)
        else:
            print row[1].encode('utf8'), " exists"
def process_all_items():
    db = get_db_engine()

    last_time = 0
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)
Example #29
def do_all(fn):
    db = get_db_engine()

    where_sql = " %s" % (FLAGS.where)
    results = db.execute("select id from shop where type < 3 and %s" %
                         where_sql)

    for result in results:
        fn(result[0], db)
        time.sleep(1.0)
Example #30
def crawl_num():
    db = get_db_engine()
    if FLAGS.force:
        return crawl_items(
            "select item.id,item.detail_url,item.num_id from item,shop where item.shop_id=shop.id and shop.type!=3 and item.num_id=%s"
            % FLAGS.numid)
    else:
        return crawl_items(
            "select item.id,item.detail_url,item.num_id from item,shop where item.shop_id=shop.id and shop.type!=3 and item.status=1 and item.num_id=%s"
            % FLAGS.numid)
Example #31
def process_all_items():
    db = get_db_engine()

    last_time = 0
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)
def crawl_shops(sql_filter):

    sql_template = '''
select s.id as shop_id
, s.type as shop_type
, s.url as shop_url
, i.id as first_item_id
, h.id as item_html_id
, h.html as item_html
from
(
    select max(i.id) as item_id , i.shop_id FROM item i
    inner join crawl_html h on i.status = 1 and i.crawl_status = 2 and i.id = h.item_id
    group by i.shop_id
) sni
inner join item i on sni.item_id = i.id
inner join crawl_html h on h.item_id = i.id
inner join shop s on i.shop_id = s.id
where
'''
    sql = sql_template + sql_filter + ';'

    db_shop = get_db_engine()
    shops = db_shop.execute(sql)

    if not shops.returns_rows:
        logger.info("no shops to be crawled.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    db = get_db_engine()
    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, db)
Example #33
def crawl_shops(sql_filter):

    sql_template = '''
select s.id as shop_id
, s.type as shop_type
, s.url as shop_url
, i.id as first_item_id
, h.id as item_html_id
, h.html as item_html
from
(
    select max(i.id) as item_id , i.shop_id FROM item i
    inner join crawl_html h on i.status = 1 and i.crawl_status = 2 and i.id = h.item_id
    group by i.shop_id
) sni
inner join item i on sni.item_id = i.id
inner join crawl_html h on h.item_id = i.id
inner join shop s on i.shop_id = s.id
where
'''
    sql = sql_template + sql_filter + ';'

    db_shop = get_db_engine()
    shops = db_shop.execute(sql)

    if not shops.returns_rows:
        logger.info("no shops to be crawled.")
        return

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    db = get_db_engine()
    last_time = 0
    for shop in shops:
        cur = time.time() * 1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval - (cur - last_time)) / 1000.0)
        last_time = time.time() * 1000
        crawl_one_shop(shop, db)
Example #34
def update_shop_level(sql):
    db = get_db_engine()
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        process_shop(db, shop, failed)
    results = "Update shop's level, Checked result, total %s failed %s, detailed %s" % (shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
Example #35
def crawl_hotest():
    # Read all item_id values from the item_hotest table on bi-db1 (refreshed roughly every hour),
    # write them into the temporary table temp_item_hotest after truncating the old data,
    # then join item against temp_item_hotest and crawl the comments, at most 20 pages per item.
    bi_db = get_db_engine(dbhost=FLAGS.bihost)
    itemid_list = list(bi_db.execute("select item_id from item_hotest"))

    db = get_db_engine()
    db.execute("TRUNCATE table temp_item_hotest")
    logger.debug("TRUNCATE table temp_item_hotest")
    db.execute("insert into temp_item_hotest values (%s)", itemid_list)
    logger.debug("insert temp_item_hotest")
    if FLAGS.force:
        return crawl_items(
            "select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.id=temp_item_hotest.item_id"
        )
    else:
        return crawl_items(
            "select item.id,item.detail_url,item.num_id from item,temp_item_hotest where item.status=1 and item.id=temp_item_hotest.item_id order by item.id desc"
        )
Example #36
def replace_main():
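    # Replace an item's main image with the Nth detail-page image (FLAGS.number, or a
    # per-shop value from SHOP_NUM), downloading the chosen image via FastDFS and
    # refreshing the CDN cache afterwards.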
    now_time = datetime.datetime.now()
    front_time = now_time - datetime.timedelta(hours=1)
    number = FLAGS.number  # by default use the first image from the detail page

    fdfs_client = Fdfs_client('/etc/fdfs/client.conf')
    db = get_db_engine()

    if FLAGS.itemid > 0:
        item_sql = "select id,shop_id,local_pic_url from item where id=%s and status=1" % FLAGS.itemid
    elif FLAGS.shopid > 0:
        item_sql = "select id,shop_id,local_pic_url from item where shop_id=%s and crawl_status=2 and status=1" % FLAGS.shopid
    elif FLAGS.all:
        for shop, num in SHOP_NUM.items():
            number = num
            item_sql = "select id,shop_id,local_pic_url from item where shop_id=%s and crawl_status=2 and status=1 and created>'%s'" % (
                shop, front_time)

    items = list(db.execute(item_sql))
    logger.info("replace main image total %s", len(items))
    i = 1
    for item in items:
        item_id = item[0]
        shop_id = item[1]
        local_pic_url = item[2]

        # the query must order by pos
        image_sql = "select item_id,fastdfs_filename,pos from item_images where type=2 and item_id=%s order by pos limit %s,1" % (
            item_id, number - 1)
        image_item = list(db.execute(image_sql))
        try:
            if len(image_item) > 0 and image_item[0][0] is not None:
                fastdfs_filename = str(image_item[0][1])
            else:
                fastdfs_filename = "http://image2.guang.j.cn/images/%s/big/%s" % (
                    shop_id, local_pic_url)
            download_image(fastdfs_filename, shop_id, item_id, local_pic_url,
                           fdfs_client)
        except:
            logger.error("download %s:%s failed reason: %s", item_id,
                         fastdfs_filename, traceback.format_exc())
            continue

        try:
            refreshCdnCache(shop_id, local_pic_url)
        except:
            logger.error("refreshCdnCache %s:%s failed: %s", item_id,
                         local_pic_url, traceback.format_exc())
            continue
        logger.info("%s/%s replace item %s main image success %s", i,
                    len(items), item_id, local_pic_url)
        i += 1
Example #37
def crawl_item(item_id):
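    # Fetch the meilishuo share page for item_id, parse it (XML first, then soupparser,
    # then plain HTML as fallbacks), extract user/item/group fields via XPath and store
    # the result as JSON in crawl_html.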
    try:
        url = "http://www.meilishuo.com/share/%s" % item_id
        data = crawl_page(item_id, url, headers)
        if not data:
            return
        try:
            html_obj = etree.XML(data)
        except:
            try:
                html_obj = soupparser.fromstring(data.decode('utf8'))
            except:
                try:
                    html_obj = etree.HTML(data)
                except:
                    logger.warn("crawling %s len %s parse failed %s", item_id, len(data), traceback.format_exc(), extra={'tags':['crawlItemParseException',]})

        #saved_data = etree.tostring(html_obj.xpath("//div[@id='main']/div/div/div")[0])
        detail_path = html_obj.xpath("//div[@id='main']/div/div/div")
        if not detail_path:
            logger.info("err parse %s len %s", item_id, len(data))
            return
        detail_obj = detail_path[0]

        results = {}
        results['user_url'] = get_obj(detail_obj, "div/dl/dt/a/@href")
        results['user_name'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['obj_date'] = get_obj(detail_obj, "div/dl/dd/span/text()")

        results['obj_url'] = get_obj(detail_obj, "div/div/div/p[1]/a/@href")
        results['obj_title'] = get_obj(detail_obj, "div/div/div/p[1]/a/text()")
        results['obj_img'] = get_obj(detail_obj, "div/div/a/img/@src")
        results['obj_fav_count'] = get_obj(detail_obj, "div/div/div/p[2]/a/b/text()")
        results['obj_org_img'] = get_obj(detail_obj, "div/div[@class='original_pic_ioc']/a/@href")
        results['obj_comment_count'] = get_obj(detail_obj, "div/div/div/a/b/text()")
        results['obj_price'] = get_obj(detail_obj, "div/div/div/div/p/text()")

        results['group_title'] = get_obj(detail_obj, "div/dl/dd[1]/a/text()")
        results['group_url'] = get_obj(detail_obj, "div/dl/dd[1]/a/@href")
        results['group_desc'] = get_obj(detail_obj, "div/dl/dd[1]/text()")

        logger.debug("results %s", results)
        #import pdb; pdb.set_trace()

        db = get_db_engine()
        db.execute("delete from crawl_html where item_id=%s" % item_id)
        db.execute("insert into crawl_html (item_id,html) values (%s, %s)", item_id, simplejson.dumps(results))
        logger.info("crawled %s len %s", url, len(data))
    except KeyboardInterrupt:
        raise
    except:
        logger.warn("crawl failed %s exception %s", url, traceback.format_exc())
Example #38
def crawl():
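    # Load the original_id of existing loans for company_id 19, then POST to the CMB
    # e-finance endpoint and parse the returned project-list JSON.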
    company_id = 19
    url = "https://efinance.cmbchinaucs.com/Handler/ActionPage.aspx?targetAction=GetProjectList_Index"
    headers = {
        'Host': "efinance.cmbchinaucs.com",
        'Connection': "keep-alive",
        'Content-Length': "33",
        'Cache-Control': "max-age=0",
        'Accept': "text/plain, */*",
        'Origin': "https://efinance.cmbchinaucs.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded",
        'Referer': "https://efinance.cmbchinaucs.com/",
        'Accept-Encoding': "gzip,deflate",
        'Accept-Language': "zh-CN,zh;q=0.8,en;q=0.6",
        'Cookie': "ASP.NET_SessionId=woqbxpemqp3kk4syvfbkxtzw"
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = post(url,
                        data={"targetAction": "GetProjectList_Index"},
                        headers=headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        print loans_json

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def update_shop_level(sql):
    db = get_db_engine()
    update_sql = "update shop set level = '%s' where id = %s"
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        #import pdb; pdb.set_trace()
        process_shop(db, shop, failed)
    results = "Update shop's level, Checked result, total %s failed %s, detailed %s" % (shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
Example #40
def update_shop_level(sql):
    db = get_db_engine()
    update_sql = "update shop set level = '%s' where id = %s"
    shops = db.execute(sql)
    failed = []
    for shop in shops:
        #import pdb; pdb.set_trace()
        process_shop(db, shop, failed)
    results = "Update shop's level, Checked result, total %s failed %s, detailed %s" % (
        shops.rowcount, len(failed), ",".join(map(str, failed)))
    if len(failed):
        logger.warn(results)
    else:
        logger.info(results)
Example #41
def main():
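    # Compute a pHash digest of each item's mid2 image and insert it into
    # item_image_digest, skipping missing files and PNGs.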
    db = get_db_engine()
    items = db.execute("select id, shop_id, local_pic_url, concat('/space/wwwroot/image.guang.j.cn/ROOT/images/', shop_id, '/mid2/', local_pic_url) as img_path from item where status=1 and %s order by id" % FLAGS.where)
    for item in items:
        img_path = item[3]
        if not os.path.exists(img_path) or img_path.endswith('.png'):
            logger.warn('skipping %s %s', item[0], item[3])
            continue
        try:
            logger.debug('processing %s %s', item[0], item[3])
            d = ','.join(map(str, pHash.image_digest(img_path, 1.0, 1.0, 180).coeffs))
            db.execute("insert ignore into item_image_digest (item_id, digest) values (%s, '%s')" % (item[0], d))
        except:
            pass
Example #42
def fix_items(sql):
    db = get_db_engine()

    items = db.execute(sql)
    logger.info("Fixing image total %s", items.rowcount)
    if not items.rowcount:
        return
    if FLAGS.parallel:
        mapper = SimpleMapReduce(fix_item2, identity)
        results = mapper(transform_args(items))
        logger.info("fix finished %s", len(results))
    else:
        for item in items:
            fix_item2({'item':item, 'crawl_path':FLAGS.crawl_path, 'server_path':FLAGS.path, 'is_remove':FLAGS.removetmp, 'org_server_path':FLAGS.org_path})
Example #43
def rollback_shop(shop_id, db):
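    # Rewrite a shop's click.taobao.com detail URLs back to plain
    # item.taobao.com/item.htm?id=<num_id> URLs; honours FLAGS.dryrun and FLAGS.limit.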
    if not db:
        db = get_db_engine()

    sql = "select id,num_id,null,null from item where shop_id = %s and detail_url like '%%%%s.click.taobao.com%%%%'" % shop_id
    if FLAGS.limit:
        sql += " limit " + str(FLAGS.limit)
    results = db.connect().execute(sql)
    for i, result in enumerate(results):
        new_url = "http://item.taobao.com/item.htm?id=%s" % result[1]
        sql = "update item set detail_url=\'%s\' where id = %s" % (new_url, result[0])
        logger.debug("Run sql %s/%s: %s" % (i, results.rowcount, sql))
        if not FLAGS.dryrun:
            db.execute(sql)
Example #44
def remove():
    db = get_db_engine()
    key = db.execute("select uniq_url from item_images where item_id=%s and fastdfs_filename=%s", FLAGS.itemid, FLAGS.dfsimg)
    if not key.rowcount > 0:
        return
    else:
        key = list(key)
    result = db.execute("select id from item_images where uniq_url=%s and disabled=0", key[0])
    i = 0
    for r in result:
        sql = "update item_images set disabled=1 where id=%s" % r[0]
        print("deleting %s/%s %s", i, result.rowcount, sql)
        db.execute(sql)
        i+=1
def load_click_items(numid2volumeprice):
    logger.info("Loading click items")
    click_items = []
    paid_items = []
    click_item_type = namedtuple("ClickItemType", 'click_hash item_id click_time click_ip area_code click_price click_volume item_price item_volume shop_nick taobao_report_id num_id')

    db = get_db_engine()
    where = "click_time>='%s' and click_time<='%s'" % (datestr(FLAGS.start), datestr(FLAGS.end))
    if FLAGS.limit > 0:
        where += " limit %s" % FLAGS.limit
    sql = "select outer_code,item_id,click_time,click_ip,click_area,click_price,click_volume,item.price,item.volume,shop.nick,click_item_log.taobao_report_id,item.num_id from click_item_log left join item on click_item_log.item_id=item.id left join shop on shop.id=item.shop_id where %s" % where
    logger.debug("fetching %s", sql)
    results = db.execute(sql)
    progress = 0
    item_matched = 0
    logger.info("Processing click items %s", results.rowcount)
    price_diffs = 0
    for line in results:
        progress += 1
        click_item = click_item_type(*line)
        if not click_item.num_id:
            logger.warn("no numid %s", click_item)
            continue
        click_items.append(click_item)
        if click_item.item_id > 0:
            item_matched += 1
        volume = click_item.item_volume
        if not volume or volume == 0:
            logger.warn("item %s abnormal %s", click_item.item_id, volume)
            volume = 0.2
        elif volume > 800:
            volume = 800

        price = click_item.click_price
        if click_item.item_price and price > click_item.item_price * 1.5:
            price = click_item.item_price
            price_diffs += 1
            logger.warn("Price diff paid? %s %s/%s too much %s - %s", click_item.taobao_report_id, price_diffs, results.rowcount, click_item.click_price, click_item.item_price)
        if price > 500.0:
            price = 500.0
        if not price or price < 0.5:
            logger.warn("price %s abnormal %s", click_item.item_id, price)
            price = 1.0

        numid2volumeprice[long(click_item.num_id)] = {'volume' : volume, 'price' : price}
        if click_item.taobao_report_id:
            paid_items.append(click_item.taobao_report_id)
    logger.info("Total click %s item matched %s", len(click_items), item_matched)
    return click_items, paid_items
def img_update():
    sql = "select id,num_id,shop_id,pic_url,local_pic_url from item where pic_url like '%%q90.%%'"
    db = get_db_engine()
    items = db.execute(sql)
    tr = re.compile("(.+\.(jpg|png))[^$]*.jpg$")
    for item in items:
        taobao_picurl = item[3]
        taobao_picurl = tr.sub(r'\1', taobao_picurl)
        try:
            width, height = download_image({'item_id':item[0], 'num_id': item[1], 'shop_id': item[2], 'pic_url': taobao_picurl, 'image_name': item[4], 'crawl_path': FLAGS.crawl_path})
            db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item[0])
            logger.info("item %s update image ok", item[0])
        except:
            logger.error("download %s:%s failed reason %s", item[0], taobao_picurl, traceback.format_exc())
            continue
Example #47
def remove():
    db = get_db_engine()
    content = open(FLAGS.file)
    j = 0
    for l in content:
        key = l.split('\t')
        print("querying %s", key[0])
        result = db.execute("select id from item_images where uniq_url=%s and disabled=0", key[0])
        i = 0
        j+=1
        for r in result:
            sql = "update item_images set disabled=1 where id=%s" % r[0]
            print("deleting %s %s/%s %s", j, i, result.rowcount, sql)
            db.execute(sql)
            i+=1
Example #48
def save():
    db = get_db_engine()
    content = open(FLAGS.file)
    html = '<html><body>'
    for l in content:
        key = l.split('\t')
        result = db.execute("select fastdfs_filename from item_images where uniq_url=%s limit 10", key[0])
        if result.rowcount > 0:
            html += '<div>'
            #for r in result:
            #    html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1,5), r[0])
            html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1,5), list(result)[0][0])
            html += '</div>'
    html += '</body></html>'
    print html
Example #49
def rollback_shop(shop_id, db):
    if not db:
        db = get_db_engine()

    sql = "select id,num_id,null,null from item where shop_id = %s and detail_url like '%%%%s.click.taobao.com%%%%'" % shop_id
    if FLAGS.limit:
        sql += " limit " + str(FLAGS.limit)
    results = db.connect().execute(sql)
    for i, result in enumerate(results):
        new_url = "http://item.taobao.com/item.htm?id=%s" % result[1]
        sql = "update item set detail_url=\'%s\' where id = %s" % (new_url,
                                                                   result[0])
        logger.debug("Run sql %s/%s: %s" % (i, results.rowcount, sql))
        if not FLAGS.dryrun:
            db.execute(sql)
Example #50
def main():
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    #import pdb; pdb.set_trace()
    results = simplejson.loads(download(url))
    db = get_db_engine()
    counts = []
    for doc in results['response']['docs']:
        item_id = doc['item_id']
        count = db.execute("select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;" % item_id)
        if count.rowcount:
            counts.append(list(count)[0][0])
        else:
            counts.append(0)
    cs = Series(counts)
    logger.info(cs.describe())
Example #51
def remove():
    db = get_db_engine()
    content = open(FLAGS.file)
    j = 0
    for l in content:
        key = l.split("\t")
        print ("querying %s", key[0])
        result = db.execute("select id from item_images where uniq_url=%s and disabled=0", key[0])
        i = 0
        j += 1
        for r in result:
            sql = "update item_images set disabled=1 where id=%s" % r[0]
            print ("deleting %s %s/%s %s", j, i, result.rowcount, sql)
            db.execute(sql)
            i += 1
Example #52
def save():
    db = get_db_engine()
    content = open(FLAGS.file)
    html = "<html><body>"
    for l in content:
        key = l.split("\t")
        result = db.execute("select fastdfs_filename from item_images where uniq_url=%s limit 10", key[0])
        if result.rowcount > 0:
            html += "<div>"
            # for r in result:
            #    html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1,5), r[0])
            html += '<p><img src="http://img%s.guang.j.cn/%s"></p>' % (random.randint(1, 5), list(result)[0][0])
            html += "</div>"
    html += "</body></html>"
    print html
Example #53
def check_shops(sql):
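    # Run check_one_shop for every selected shop, throttled by FLAGS.interval (ms),
    # and log the shops that failed the check.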
    db = get_db_engine()

    last_time = 0
    shops = db.execute(sql)
    logger.info("checking total %s", shops.rowcount)
    failed = []
    for shop in shops:
        cur = time.time()*1000
        if cur - last_time < FLAGS.interval:
            time.sleep((FLAGS.interval-(cur-last_time))/1000.0)
        last_time = time.time()*1000
        check_one_shop(shop, failed)
    logger.info("Checked result, total %s failed %s", shops.rowcount, len(failed))
    for f in failed:
        logger.warn("%s %s", f['shopid'], f['err'])