Exemplo n.º 1
0
    def GET(self, id):
        """Crawl the stored images of one item and render the result page.

        Selects the (url, pos, type) rows that ``crawl_item_images`` holds
        for ``id`` (joined against ``crawl_html`` on ``item_id``), hands the
        result set to an ``ItemCrawler`` and renders the ``crawlitem``
        template with the crawler's ``results``.

        Args:
            id: Item identifier supplied by the caller (presumably captured
                from the request URL); treated as untrusted input.

        Returns:
            The value returned by ``render.crawlitem(id, item_crawler.results)``.
        """
        db = get_db_engine()
        # Bind the id as a query parameter rather than %-formatting it into
        # the SQL text, so a crafted id cannot change the statement
        # (SQL injection).
        # NOTE(review): assumes the underlying DBAPI driver uses the
        # "format" paramstyle (%s placeholders, e.g. MySQLdb/psycopg2) --
        # confirm against get_db_engine().
        query = (
            "select crawl_item_images.url, crawl_item_images.pos, "
            "crawl_item_images.type "
            "from crawl_html, crawl_item_images "
            "where crawl_item_images.item_id=crawl_html.item_id "
            "and crawl_html.item_id=%s;"
        )
        results = db.execute(query, (id,))

        item_crawler = ItemCrawler(id, FLAGS.crawl_path)
        # NOTE(review): the (94, 94) / (350, 350) pairs look like target
        # image sizes and the trailing False like a "do not commit" flag --
        # confirm against ItemCrawler.crawl.
        item_crawler.crawl(results, ((94, 94), (350, 350)), False)

        return render.crawlitem(id, item_crawler.results)
Exemplo n.º 2
0
    def GET(self, id):
        """Render the crawl page for a single item.

        Fetches the image rows (url, pos, type) stored in
        ``crawl_item_images`` for the given item (only for items that also
        appear in ``crawl_html``), runs an ``ItemCrawler`` over them and
        returns the rendered ``crawlitem`` template.

        Args:
            id: Item identifier provided by the caller (presumably taken from
                the request path); must be treated as untrusted.

        Returns:
            The result of ``render.crawlitem`` for ``id`` and the crawler's
            ``results`` attribute.
        """
        db = get_db_engine()
        # The id is passed as a bound parameter instead of being interpolated
        # with ``%``: interpolating request data into SQL text permits
        # SQL injection.
        # NOTE(review): relies on the DBAPI driver accepting %s placeholders
        # ("format" paramstyle) -- verify for the engine returned by
        # get_db_engine().
        sql = (
            "select crawl_item_images.url, crawl_item_images.pos, "
            "crawl_item_images.type "
            "from crawl_html, crawl_item_images "
            "where crawl_item_images.item_id=crawl_html.item_id "
            "and crawl_html.item_id=%s;"
        )
        results = db.execute(sql, (id,))

        item_crawler = ItemCrawler(id, FLAGS.crawl_path)
        # NOTE(review): the two (w, h)-looking pairs and the final False are
        # passed through unchanged; their meaning is defined by
        # ItemCrawler.crawl, which is not visible here.
        item_crawler.crawl(results, ((94, 94), (350, 350)), False)

        return render.crawlitem(id, item_crawler.results)
Exemplo n.º 3
0
def crawl_item2(kwargs):
    # Crawl the images of a single item.
    #
    # Reads the item's stored `html` and `desc_content` from crawl_html,
    # extracts thumbnail and description image URLs from them, hands the
    # resulting (url, pos, type) list to an ItemCrawler and reports the
    # outcome through the logger and Statsd.
    #
    # `kwargs` is a dict; the keys read here are 'item' (item[0] is the item
    # id, item[1] the num id), 'is_commit', 'crawl_path', 'server_path',
    # 'org_server_path', 'is_remove', 'statshost' and 'statsport'. The whole
    # dict is also forwarded to get_db_engine(**kwargs).
    #
    # NOTE(review): the outer `try:` below has no matching except/finally in
    # the visible text -- the end of this function appears to be cut off, so
    # anything after the final Statsd call (including a normal-path return)
    # cannot be seen from here.
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default result (all counters zero); replaced by the crawler's summary
    # only when a crawl actually runs.
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            # NOTE(review): item_id is %-formatted into the SQL text rather
            # than passed as a bound parameter.
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            # Only proceed when exactly one crawl_html row exists; otherwise
            # nothing is crawled and is_success stays False.
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1] 
                html_obj = parse_html(html)
                # Thumbnail URLs are looked up in three places, in order:
                # <img src>, then the <li style> attribute (run through
                # IMAGESTYLE_RE), then <img data-src>.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # Fallback for taobao pages that carry the URL in
                    # @data-src instead of @src.
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    # Early exit with the zeroed default result. The inner
                    # `finally` still closes conn; the Statsd reporting after
                    # the inner try block is skipped on this path.
                    return crawl_result

                # r: keeps group 2, i.e. the text between an optional leading
                # "var desc='" and a trailing backslash or "';".
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                # tr: URL ending in "_<digits>x<digits>.jpg"; group 1 is the
                # part before that size suffix.
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                # tr_new: group 1 runs through a .jpg/.png/.gif extension that
                # is followed by further text ending in "jpg".
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                # All three names initially share ONE list object; that is
                # harmless here because they are only rebound below, never
                # mutated in place.
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        # Description images: table backgrounds, eager <img src>
                        # and lazy-loaded <img data-ks-lazyload>; images whose
                        # parent element has an href attribute are excluded.
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})

                # Build the crawl list as (url, pos, type) tuples. pos is a
                # running 1-based counter across all groups; type is 1 for
                # thumbnails, 2 for description images / table backgrounds
                # and 3 for lazy-loaded description images.
                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    # Strip the size suffix to get the original image URL.
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    else:
                        if tr_new.match(url):
                            ori_url = tr_new.sub(r'\1', url)
                        else:
                            logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})

                    # NOTE(review): when neither pattern matched, ori_url is
                    # still None here and is appended anyway.
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                # URLs containing "js/ckeditor" are skipped in both
                # description loops.
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                # NOTE(review): ((710,10000),) is passed through as-is; its
                # meaning is defined by ItemCrawler.crawl (not visible here).
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        # NOTE(review): Python 2-only except syntax; `e` is never used. The
        # exception is logged with its traceback and then swallowed, so the
        # failure is only reflected through is_success staying False.
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
        finally:
            conn.close()
        # Report the downloaded-image count (suc1 + suc from the summary),
        # then one success or failure counter for the item.
        Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
            host = kwargs['statshost'], port = kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
            Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
Exemplo n.º 4
0
def crawl_item2(kwargs):
    # Worker that crawls the images belonging to one item.
    #
    # It loads the item's `html` and `desc_content` columns from crawl_html,
    # collects thumbnail and description image URLs, passes them as
    # (url, pos, type) tuples to an ItemCrawler, and then logs the outcome
    # and updates Statsd counters.
    #
    # `kwargs` is a dict. Keys read in this function: 'item' (item[0] = item
    # id, item[1] = num id), 'is_commit', 'crawl_path', 'server_path',
    # 'org_server_path', 'is_remove', 'statshost', 'statsport'. The dict is
    # also expanded into get_db_engine(**kwargs).
    #
    # NOTE(review): the outer `try:` below is never closed by an
    # except/finally in the visible text; the function appears to be
    # truncated, so its tail (and any normal-path return value) is unknown.
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Zeroed default result, kept unless a crawl runs and supplies a summary.
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            # NOTE(review): item_id is %-formatted into the SQL text instead
            # of being passed as a bound parameter.
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            # Exactly one crawl_html row is required; with zero or several
            # rows nothing is crawled and is_success remains False.
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                # Thumbnail lookup order: <img src>, then <li style> (passed
                # through IMAGESTYLE_RE), then <img data-src>.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # Fallback for taobao pages that use @data-src in place
                    # of @src.
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id,
                        num_id,
                        len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    # Early exit with the zeroed default. conn is still
                    # closed by the inner `finally`; the Statsd reporting
                    # further down is not reached on this path.
                    return crawl_result

                # r: keeps group 2 -- the text between an optional leading
                # "var desc='" and a trailing backslash or "';".
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                # tr: matches a trailing "_<digits>x<digits>.jpg"; group 1 is
                # everything before that suffix.
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                # Both names start out bound to the same list object; safe
                # here because they are only rebound, never mutated.
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        # Eager and lazy-loaded description images whose
                        # parent element has no href attribute.
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id,
                                num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})

                # Crawl list of (url, pos, type): pos is a running 1-based
                # counter; type is 1 for thumbnails, 2 for description images
                # and 3 for lazy-loaded description images.
                images = []
                pos = 1
                for url in thumbImages:
                    # tr.sub drops the size suffix when present and leaves
                    # the URL unchanged when the pattern does not match.
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                # URLs containing "js/ckeditor" are left out of both
                # description groups.
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                # NOTE(review): the ((710, 10000), ) argument is forwarded
                # unchanged; what it means is up to ItemCrawler.crawl, which
                # is not visible here.
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        # NOTE(review): Python 2-only except syntax, and `e` is unused. The
        # exception is logged with its traceback and swallowed; failure only
        # shows up as is_success == False below.
        except Exception, e:
            logger.error("crawl item %s %s got exception %s",
                         item_id,
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
        finally:
            conn.close()
        # Report the number of downloaded images (suc1 + suc), then bump
        # either the success or the failure counter for this item.
        Statsd.update_stats("guang.crawl.downimgcount",
                            crawl_result[0][1]['suc1'] +
                            crawl_result[0][1]['suc'],
                            host=kwargs['statshost'],
                            port=kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id,
                        crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s",
                        item_id,
                        num_id,
                        crawl_result,
                        extra={'tags': [
                            'crawl_failed',
                        ]})
            Statsd.increment('guang.crawl.itemimg.failed',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])