import logging
import os
from urlparse import urlparse
from urllib2 import HTTPError

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from gearman import GearmanWorker

import config
import wantudal
# httplib here is presumably a project-local wrapper (the stdlib httplib has
# no urlopen); it is assumed to return a tuple whose element [2] is the body
import httplib


def worker_process_html(gearman_worker, gearman_job):
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    # this site is encoded in GBK
    html_doc = httplib.urlopen(url)[2].decode('gbk')
    soup = BeautifulSoup(html_doc, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    if is_album_index(url):
        # an index page lists albums and links to further index pages
        albums = get_album_from_index(soup)
        logging.debug('%d albums are found' % (len(albums)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumIndex, isfinished=1)
        for album in albums:
            wantudal.save_url(album, pagetype=wantudal.PageType.AlbumPage, referrerurl=url, isfinished=0)
            submit_html_job(album)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumIndex, referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_album_page(url):
        # an album page lists detail pages and may be paginated
        details = get_detail_from_album(soup, url)
        logging.debug('%d details are found' % (len(details)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumPage, isfinished=1)
        for detail in details:
            wantudal.save_url(detail, pagetype=wantudal.PageType.DetailPage, referrerurl=url, isfinished=0)
            submit_html_job(detail)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumPage, referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_detail_page(url):
        # a detail page carries one picture plus its description
        pic, description = get_content_from_detail(soup, url)
        wantudal.save_url(url, pagetype=wantudal.PageType.DetailPage, isfinished=1)
        wantudal.save_url(pic, pagetype=wantudal.PageType.PicturePage, referrerurl=url, description=description, isfinished=0)
        submit_pic_job(pic)
    else:
        logging.debug('unknown resource: %s' % (url))
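# --- sketch: JSON payloads and job submission ----------------------------
# worker_process_html calls submit_html_job/submit_pic_job, and the worker
# class below relies on a JSONDataEncoder; neither appears in this section.
# A minimal sketch, assuming the JSONDataEncoder from the gearman 1to2 guide
# (which the SafeGearmanWorker docstring cites), a local gearmand on the
# default port, and the task names 'process_html'/'process_pic' -- the
# client variable and both task names are assumptions, not from the source.
import json
import gearman
from gearman import GearmanClient


class JSONDataEncoder(gearman.DataEncoder):
    @classmethod
    def encode(cls, encodable_object):
        return json.dumps(encodable_object)

    @classmethod
    def decode(cls, decodable_string):
        return json.loads(decodable_string)


class JSONGearmanClient(GearmanClient):
    # use the same encoder as the worker so payloads round-trip cleanly
    data_encoder = JSONDataEncoder


client = JSONGearmanClient(['localhost:4730'])


def submit_html_job(url):
    # background=True queues the job and returns without waiting for a result
    client.submit_job('process_html', url, background=True)


def submit_pic_job(url):
    client.submit_job('process_pic', url, background=True)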
# handler for picture jobs submitted via submit_pic_job; the def line and the
# duplicate check mirror worker_process_html (the name is an assumption, as
# the original header is missing here)
def worker_process_pic(gearman_worker, gearman_job):
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    try:
        content = httplib.urlopen(url)[2]
    except HTTPError, e:
        logging.debug('http error: %s' % (e.code))
        return
    # derive the file name from the last path segment of the URL
    name = urlparse(url).path.split('/')[-1]
    filepath = config.root_dir
    if not os.path.exists(filepath):
        os.mkdir(filepath)
    filepath = os.path.join(filepath, name)
    with open(filepath, 'wb') as f:
        f.write(content)
    logging.debug('%s is saved' % (filepath))
    wantudal.save_url(url, pagetype=wantudal.PageType.PicturePage, savedpath=filepath, isfinished=1)


class SafeGearmanWorker(GearmanWorker):
    '''
    copied from http://packages.python.org/gearman/1to2.html#worker
    worker with exception logging and JSON encoder
    '''
    data_encoder = JSONDataEncoder

    # exception-logging hook promised by the docstring; reconstructed from
    # the cited 1to2 example (assumed, not shown in the original)
    def on_job_exception(self, current_job, exc_info):
        logging.error('job %s failed' % current_job.task, exc_info=exc_info)
        return super(SafeGearmanWorker, self).on_job_exception(current_job, exc_info)


# log DEBUG and above to the file named in the config
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(config.log_filename)
fh.setFormatter(formatter)
logger.addHandler(fh)
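# --- sketch: worker entry point -------------------------------------------
# Nothing in this section actually registers the two handlers with a job
# server. A minimal sketch of a worker entry point, assuming a local gearmand
# on the default port 4730 and the same task names as the submission helpers
# above (all of these are assumptions):
if __name__ == '__main__':
    worker = SafeGearmanWorker(['localhost:4730'])
    worker.register_task('process_html', worker_process_html)
    worker.register_task('process_pic', worker_process_pic)
    worker.work()  # blocks and processes jobs as gearmand dispatches them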