def GET(self, id):
    """Render the crawl-result page for one item.

    Looks up the stored image URLs for item `id` (joined through
    crawl_html), re-crawls them via ItemCrawler, and renders the
    `crawlitem` template with the crawler's results.

    `id` arrives straight from the request URL, so it is untrusted.
    """
    db = get_db_engine()
    # Bind `id` as a query parameter instead of interpolating it into the
    # SQL text with `%` -- the original string-built query was injectable.
    results = db.execute(
        "select crawl_item_images.url, crawl_item_images.pos, "
        "crawl_item_images.type from crawl_html, crawl_item_images "
        "where crawl_item_images.item_id=crawl_html.item_id "
        "and crawl_html.item_id=%s;", id)
    item_crawler = ItemCrawler(id, FLAGS.crawl_path)
    # ((94, 94), (350, 350)) look like the two thumbnail sizes to produce;
    # the final False is presumably a "commit" flag -- confirm in crawl().
    item_crawler.crawl(results, ((94, 94), (350, 350)), False)
    return render.crawlitem(id, item_crawler.results)
def GET(self, id):
    """Show the crawl results for item `id`.

    Fetches the recorded image URL/pos/type rows for the item, runs an
    ItemCrawler over them, and renders the `crawlitem` template.
    """
    engine = get_db_engine()
    sql = (
        "select crawl_item_images.url, crawl_item_images.pos, "
        "crawl_item_images.type from crawl_html, crawl_item_images "
        "where crawl_item_images.item_id=crawl_html.item_id "
        "and crawl_html.item_id=%s;" % id)
    rows = engine.execute(sql)
    crawler = ItemCrawler(id, FLAGS.crawl_path)
    # Two target sizes, commit flag off.
    crawler.crawl(rows, ((94, 94), (350, 350)), False)
    return render.crawlitem(id, crawler.results)
def crawl_item2(kwargs):
    """Crawl every image belonging to one item.

    Takes a single dict argument (convenient for pool.map-style callers)
    with keys: 'item' (item[0] = item_id, item[1] = num_id), 'is_commit',
    'crawl_path', 'server_path', 'org_server_path', 'is_remove',
    'statshost'/'statsport' (Statsd endpoint), plus whatever
    get_db_engine(**kwargs) consumes.

    Returns ((item_id, summary_dict),) on the "no thumb images" early-exit
    path; NOTE(review): the normal path falls off the end of the function
    and returns None -- confirm callers do not rely on the return value.
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default result: zero counters; reported to Statsd even on failure.
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            # Proceed only when exactly one stored page exists for the item.
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # Thumbnail candidates, in fallback order:
                # 1) <img src> under the thumb list,
                # 2) URLs extracted from <li style> via IMAGESTYLE_RE,
                # 3) taobao lazy-loaded <img data-src>.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result
                # r strips the JS wrapper (var desc='...';) around the stored
                # description HTML; tr drops a trailing _WxH.jpg size suffix
                # to recover the original image; tr_new appears to handle
                # jpg/png/gif URLs with extra trailing .jpg decoration --
                # TODO confirm intended inputs for tr_new.
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                # Chained assignment shares one empty list, but all three
                # names are only ever rebound (never mutated), so it is safe.
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})
                # Build (url, pos, kind) work items; kind 1 = thumbnail,
                # 2 = description image, 3 = lazy-loaded description image.
                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    else:
                        if tr_new.match(url):
                            ori_url = tr_new.sub(r'\1', url)
                        else:
                            logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})
                    # NOTE(review): when neither pattern matches, ori_url is
                    # still None here yet gets appended -- confirm the
                    # crawler tolerates None URLs.
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                # "js/ckeditor" URLs are editor artifacts, not item images.
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                # (710, 10000) looks like a minimum width/height bound for
                # accepted images -- TODO confirm against ItemCrawler.crawl.
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
    finally:
        conn.close()
    # Report download counts and a success/failure counter to Statsd.
    Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'], host = kwargs['statshost'], port = kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
        Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
def crawl_item2(kwargs):
    """Crawl every image belonging to one item (simpler variant).

    Single dict argument (pool.map-friendly) with keys: 'item'
    (item[0] = item_id, item[1] = num_id), 'is_commit', 'crawl_path',
    'server_path', 'org_server_path', 'is_remove', 'statshost'/'statsport'
    (Statsd endpoint), plus whatever get_db_engine(**kwargs) consumes.

    Returns ((item_id, summary_dict),) on the "no thumb images" early-exit
    path; NOTE(review): the normal path falls off the end of the function
    and returns None -- confirm callers do not rely on the return value.
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default result: zero counters; reported to Statsd even on failure.
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            # Proceed only when exactly one stored page exists for the item.
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # Thumbnail candidates, in fallback order:
                # 1) <img src> under the thumb list,
                # 2) URLs extracted from <li style> via IMAGESTYLE_RE,
                # 3) taobao lazy-loaded <img data-src>.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags': ['crawl_thumb_empty', ]})
                    return crawl_result
                # r strips the JS wrapper (var desc='...';) around the stored
                # description HTML; tr drops a trailing _WxH.jpg size suffix
                # to recover the original image URL.
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                # Chained assignment shares one empty list, but both names
                # are only ever rebound (never mutated), so it is safe.
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags': ['crawl_nodesc', ]})
                # Build (url, pos, kind) work items; kind 1 = thumbnail,
                # 2 = description image, 3 = lazy-loaded description image.
                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                # "js/ckeditor" URLs are editor artifacts, not item images.
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                # (710, 10000) looks like a minimum width/height bound for
                # accepted images -- TODO confirm against ItemCrawler.crawl.
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags': ['crawl_exception', ]})
    finally:
        conn.close()
    # Report download counts and a success/failure counter to Statsd.
    Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'], host=kwargs['statshost'], port=kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host=kwargs['statshost'], port=kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags': ['crawl_failed', ]})
        Statsd.increment('guang.crawl.itemimg.failed', host=kwargs['statshost'], port=kwargs['statsport'])