Example #1
def worker_process_pic(gearman_worker, gearman_job):
    # The picture URL arrives as the raw Gearman job payload.
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    try:
        # httplib here is a project-local helper (the stdlib module has no
        # urlopen); index 2 of its result tuple is the response body.
        content = httplib.urlopen(url)[2]
    except HTTPError, e:
        # HTTPError is presumably imported from urllib2 at module level.
        logging.debug('http error: %s' % (e.code))
        return
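Both worker callbacks in these examples follow the python-gearman worker signature (gearman_worker, gearman_job). For context, a minimal sketch of how worker_process_pic might be wired up; the server address and task name are assumptions, not taken from the source:

import gearman

# Hypothetical wiring: 'localhost:4730' and 'process_pic' are assumed names.
worker = gearman.GearmanWorker(['localhost:4730'])
worker.register_task('process_pic', worker_process_pic)
worker.work()  # blocks, dispatching each incoming job to the callback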
Example #2
def worker_process_html(gearman_worker, gearman_job):
    url = gearman_job.data
    if wantudal.is_processed(url):
        logging.debug('%s is skipped as it was processed already' % (url))
        return
    logging.debug('processing %s' % (url))
    # this site serves GBK-encoded HTML, so decode before parsing
    html_doc = httplib.urlopen(url)[2].decode('gbk')
    soup = BeautifulSoup(html_doc, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
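    # Dispatch on page type: index pages list albums, album pages list
    # detail pages, and detail pages yield the actual picture URL.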
    if is_album_index(url):
        albums = get_album_from_index(soup)
        logging.debug('%d albums are found' % (len(albums)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumIndex, isfinished=1)
        for album in albums:
            wantudal.save_url(album, pagetype=wantudal.PageType.AlbumPage, referrerurl=url, isfinished=0)
            submit_html_job(album)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumIndex, referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_album_page(url):
        details = get_detail_from_album(soup, url)
        logging.debug('%d details are found' % (len(details)))
        more_indexes = find_next_indexes(soup)
        logging.debug('%d sub indexes are found' % (len(more_indexes)))
        wantudal.save_url(url, pagetype=wantudal.PageType.AlbumPage, isfinished=1)
        for detail in details:
            wantudal.save_url(detail, pagetype=wantudal.PageType.DetailPage, referrerurl=url, isfinished=0)
            submit_html_job(detail)
        for index in more_indexes:
            wantudal.save_url(index, pagetype=wantudal.PageType.AlbumPage, referrerurl=url, isfinished=0)
            submit_html_job(index)
    elif is_detail_page(url):
        pic, description = get_content_from_detail(soup, url)
        wantudal.save_url(url, pagetype=wantudal.PageType.DetailPage, isfinished=1)
        wantudal.save_url(pic, pagetype=wantudal.PageType.PicturePage, referrerurl=url, description=description, isfinished=0)
        submit_pic_job(pic)
    else:
        logging.debug('unknown resource: %s' % (url))
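Example #2 fans newly discovered URLs back into the queue through submit_html_job and submit_pic_job, which are not shown in these examples. A minimal sketch of what they might look like with a python-gearman client; the server address and task names are assumptions:

import gearman

gm_client = gearman.GearmanClient(['localhost:4730'])  # assumed server address

def submit_html_job(url):
    # background=True submits fire-and-forget, so the crawler never
    # blocks waiting for the worker to finish the fetch.
    gm_client.submit_job('process_html', url, background=True)

def submit_pic_job(url):
    gm_client.submit_job('process_pic', url, background=True)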
Example #3
def get_content_from_detail(soup, url):
    '''
    <script id="pix-json-set-info" type="application/json">
    {
        "albumId": 17327808,
        "likeStatus": false,
        "categoryId":"11",
        "categoryEname":"chongwu",
        "prevAlbumUrl": "",
        "nextAlbumUrl": "/detail/52085246?u=60786793" 
    }
    </script>
    '''
    # urlsplit and parse_qs come from urlparse in Python 2.
    picId = urlsplit(url).path.split('/')[-1]
    userId = parse_qs(urlsplit(url).query)['u'][0]
    # The album id sits in the embedded JSON <script> block quoted above.
    albumId = json.loads(soup.find('script', id='pix-json-set-info').string)['albumId']
    # t looks like a captured millisecond timestamp, presumably for cache busting.
    ajax_url = 'http://wantu.taobao.com/ajax/PicDetailAjax.do?picId=%s&userId=%s&albumId=%s&t=1365154666759&_method=read'
    ajax_url = ajax_url % (picId, userId, albumId)
    # The AJAX endpoint also answers in GBK; html_unescape is a project helper.
    resp = json.loads(httplib.urlopen(ajax_url)[2].decode('gbk'))
    picture = resp['data']['models'][0]['picPath']
    description = httplib.html_unescape(resp['data']['models'][0]['desc'])
    return (picture, description)
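All three examples call httplib.urlopen(url)[2], and Example #3 also calls httplib.html_unescape, so httplib here must be a project-local helper module rather than the standard library one (stdlib httplib has neither function). A minimal sketch of such a helper, assuming it returns a tuple with the response body at index 2:

import urllib2
import HTMLParser

def urlopen(url):
    # Assumed contract: (status, headers, body), so callers take [2] for the body.
    # urllib2.urlopen raises urllib2.HTTPError, which matches the
    # 'except HTTPError' handler and e.code usage in Example #1.
    resp = urllib2.urlopen(url)
    return (resp.getcode(), resp.info(), resp.read())

def html_unescape(text):
    # Assumed to undo HTML entities in the picture description.
    return HTMLParser.HTMLParser().unescape(text)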