示例#1
0
def parse_artist(d):
    """Collect artist details from a fetched artist page.

    ``d`` must expose a ``pq`` attribute that can be called with a CSS
    selector (presumably a PyQuery document -- confirm against the caller).

    Returns a dict with the keys ``picture``, ``review``, ``genres``,
    ``styles``, ``active``, ``formed``, ``moods``, ``themes``,
    ``discography`` and ``photo_gallery``.
    """
    select = d.pq

    def links(selector):
        # One {'name', 'url'} dict per element matched by ``selector``.
        found = []
        for raw in select(selector):
            anchor = _(raw)
            found.append({'name': anchor.text(), 'url': anchor.attr('href')})
        return found

    def album(row):
        # One discography table row -> album summary dict.
        title_link = row('.title a:first-child')
        stars = row('td.ed-rating .allmusic.rating').attr('data-stars')
        return {
            'year': row('.year').text(),
            'thumbnail': row('.thumbnail-img img').attr('src'),
            'title': title_link.text(),
            'url': title_link.attr('href'),
            'label': row('td.label .full-title').text(),
            'rating': float_or(stars),
        }

    data = {}
    data['picture'] = select('#sidebar .artist-image img').attr('src')
    data['review'] = select('.review-body .editorial-text').html()
    data['genres'] = links('.details .genres a')
    data['styles'] = links('.details .styles a')
    data['active'] = select('#sidebar dd.active').text()
    data['formed'] = select('#sidebar dd.birth').text()
    # Group-member parsing ('#sidebar .group-members li a') is currently
    # disabled, so no 'members' key is produced.
    data['moods'] = links('.sidebar-module.moods a')
    data['themes'] = links('.sidebar-module.themes a')

    album_rows = select('#discography .album-table tbody tr')
    data['discography'] = [album(row) for row in pq_iter(album_rows)]

    # Each gallery thumbnail carries a JSON payload in its ``data-large``
    # attribute; it is decoded as-is.
    thumbs = select('#sidebar .media-gallery div.media-gallery-image.thumbnail')
    gallery = []
    for thumb in pq_iter(thumbs):
        gallery.append(json.loads(thumb.attr('data-large')))
    data['photo_gallery'] = gallery
    return data
示例#2
0
def get_url_set(city, max_failures=5):
    """Yield listing URLs for ``city`` whose titles are not stored yet.

    For every service category the paginated list pages are fetched
    (the ``f<offset>`` path component advances in steps of 32) until a page
    contains no ``.list .ft-14`` nodes.  Each node's ``href`` is printed and
    yielded when no ``Entry`` with the same title exists.

    :param city: sub-domain prefix used to build ``http://<city>.ganji.com``.
    :param max_failures: number of *consecutive* falsy results from
        ``request`` after which the current category is abandoned.  Without
        this cap a persistently failing fetch would loop forever, because
        only an empty result page ends the pagination.
    """
    cates = ('banjia', 'baomu', 'baojie', 'weixiu', 'jiadianweixiu',
             'shumashoujiweixiu', 'kongtiaoyiji', 'jiazheng', 'zhongdiangong',
             'yuesao', 'guandao', 'bianminfuwu')
    for cate in cates:
        # Single-argument call form prints identically on Python 2 and 3.
        print('Get list of %s' % cate)
        offset = 0
        failures = 0
        while failures < max_failures:
            url = r"http://%s.ganji.com/%s/f%d" % (city, cate, offset)
            offset += 32
            content = request(url)
            if not content:
                # A failed fetch skips this page, as before, but now counts
                # towards the consecutive-failure cap.
                failures += 1
                continue
            failures = 0

            nodes = _(content)('.list .ft-14')
            if not nodes:
                # An empty page marks the end of this category.
                break
            for node in nodes:
                link = _(node)
                href = link.attr('href')
                if not Entry.query.filter_by(title=link.text()).first():
                    print(href)
                    yield href
示例#3
0
def parse_search(text):
    """Turn the markup of a search-results page into a plain dict.

    Returns ``{'results': [...]}`` with one entry per ``tr`` found under
    ``table.search-results``.  Each entry holds ``thumbnail``, ``title``,
    ``artist`` (a ``{'name', 'url'}`` dict) and ``url``.
    """
    doc = _(text)
    results = []
    for row in pq_iter(doc('table.search-results tr')):
        title_link = row('div.title a')
        entry = {
            'thumbnail': row('div.image .thumbnail img').attr('src'),
            'title': title_link.text(),
            'artist': {
                'name': row('div.artist').text(),
                'url': row('div.artist a').attr('href'),
            },
            'url': title_link.attr('href'),
        }
        results.append(entry)
    return {'results': results}
示例#4
0
__author__ = 'tianchi.ltc'

import urllib2
from pyquery import PyQuery as _

# Host to look up, the form payload carrying it, and the lookup endpoint.
Q_HOST = 'baidu.com'
QUERY = 'ip=' + Q_HOST
HOST_N_METHOD = 'http://ip.cn/index.php'

# Supplying a data argument makes urllib2 send the request as a POST.
req = urllib2.Request(HOST_N_METHOD)
res = urllib2.urlopen(req, QUERY)
try:
    # Named ``html`` rather than ``str`` so the builtin is not shadowed.
    html = res.read()
finally:
    # Release the connection even if reading fails.
    res.close()

# NOTE(review): the original author flagged an "encoding issue" both for
# loading via PyQuery(url=HOST_N_METHOD) and for parsing the raw response as
# done here; the body is passed on undecoded -- TODO confirm the page charset.
doc = _(html)

# Single-argument call form prints identically on Python 2 and 3.
print(doc('.well'))
示例#5
0
def pq_iter(pq):
    """Lazily yield every item of ``pq`` wrapped by ``_``.

    Iterating ``pq`` directly hands back its raw items; wrapping each one
    lets callers keep querying it (``item('selector')``, ``item.text()``).
    """
    for raw_item in pq:
        wrapped = _(raw_item)
        yield wrapped
示例#6
0
def parse_album(d):
    """Collect album details from a fetched album page.

    ``d`` must expose a ``pq`` attribute that can be called with a CSS
    selector (presumably a PyQuery document -- confirm against the caller).

    Returns a dict with the keys ``artist``, ``title``, ``review``,
    ``rating``, ``release_date``, ``duration``, ``album_art``,
    ``similar_albums``, ``genres``, ``styles``, ``moods``, ``themes`` and
    ``medias``.  ``medias`` has one entry per ``#tracks h2`` heading, each
    with a ``name`` and a list of ``tracks``.
    """
    select = d.pq

    def links(selector):
        # One {'name', 'url'} dict per element matched by ``selector``.
        found = []
        for raw in select(selector):
            anchor = _(raw)
            found.append({'name': anchor.text(), 'url': anchor.attr('href')})
        return found

    def people(matches):
        # Same {'name', 'url'} shape, for an already-selected set of anchors.
        return [
            {'name': person.text(), 'url': person.attr('href')}
            for person in pq_iter(matches)
        ]

    data = {}

    artist_link = select('.album-artist a')
    data['artist'] = {
        'name': artist_link.text(),
        'url': artist_link.attr('href'),
    }
    data['title'] = select('.album-title').text()
    data['review'] = select('.review-body .editorial-text').html()
    data['rating'] = float_or(select('.allmusic.rating').attr('data-stars'))
    data['release_date'] = select('.details .release-date').text()
    data['duration'] = select('.details .duration').text()
    # The cover container carries a JSON payload in ``data-large``.
    cover = select('div.album-art .image-container')
    data['album_art'] = json.loads(cover.attr('data-large'))
    data['similar_albums'] = parse_album_similar_albums(select)
    data['genres'] = links('.details .genres a')
    data['styles'] = links('.details .styles a')
    data['moods'] = links('.sidebar-module.moods a')
    data['themes'] = links('.sidebar-module.themes a')

    data['medias'] = []
    for heading in pq_iter(select('#tracks h2')):
        disc_name = heading('.disc-num').text()
        tracks = []
        # The element right after each heading holds that disc's track rows.
        for row in pq_iter(heading.next()('tbody tr')):
            title_link = row('td.title div.title a')
            tracks.append({
                'position': row('td.tracknum').text(),
                'title': title_link.text(),
                'url': title_link.attr('href'),
                'composers': people(row('td.title div.artist a')),
                'duration': row('td.time').text(),
                'performers': people(row('td.performer div.primary a')),
            })
        data['medias'].append({'name': disc_name, 'tracks': tracks})
    return data
示例#7
0
def get_detail(url):
    """Fetch a listing detail page and extract its fields.

    Returns ``None`` when ``request(url)`` yields nothing, or when none of
    the ``.bd-box .rz-icon span`` badges carries an accepted verification
    label.  Otherwise returns a dict with the keys ``desc``, ``brief``,
    ``title``, ``address``, ``serviceitems``, ``worktime``, ``serviceareas``,
    ``linkman`` and ``contracts``.

    Raises ``IndexError`` / ``ValueError`` when the page lacks the elements
    that are indexed or unpacked unconditionally below.
    """
    content = request(url)
    if not content:
        return

    doc = _(content)

    # Accepted badges: phone verified / personal real-name verified /
    # company verified.
    accepted = (u'手机已认证', u'个人实名已认证', u'企业已认证')
    verified = any(
        _(badge).text() in accepted
        for badge in doc('.bd-box .rz-icon span')
    )
    if not verified:
        print('no auth one')
        return None

    data = {}
    data['desc'] = _(doc('.pr-cont .nbd')[0]).text()
    # The brief is taken from the second paragraph of the box.
    data['brief'] = _(doc('.box-cont p')[1]).text()
    data['title'] = _(doc('.box-cont h1')[0]).text()

    # The first two ``.contList`` elements; indexing them and taking their
    # length refers to their child elements.
    first_block, second_block = doc('.contList')[:2]

    data['address'] = _(first_block[0]).find('.wt2').text()
    data['serviceitems'] = [
        _(anchor).text() for anchor in _(first_block[1]).find('.wt2 a')
    ]

    # Working hours and service areas are optional third/fourth children.
    data['worktime'] = (
        _(first_block[2]).find('.wt2').text()
        if len(first_block) >= 3 else u" "
    )
    # NOTE(review): the fallback here is an empty set while the sibling
    # fields fall back to u" " -- confirm what callers expect.
    data['serviceareas'] = (
        _(first_block[3]).find('.wt2').text()
        if len(first_block) >= 4 else set()
    )

    data['linkman'] = (
        _(second_block[0]).find('.wt2 strong').text()
        if len(second_block) >= 1 else u" "
    )

    # Contact phone numbers (stored under the key 'contracts'): keep only
    # span texts that ``is_phone`` matches.
    span_texts = (_(span).text() for span in doc('.tel-box span'))
    data['contracts'] = [text for text in span_texts if is_phone.match(text)]

    return data