Пример #1
0
def padmag(url):
    """Yield one record per picture hosted under padmag.cn's wp-content path.

    The page at *url* is fetched and parsed.  The stripped ``<title>`` text
    is split on ``' - '`` and each part is passed through ``esc_tag`` to
    build the tags.  A description is assembled from the paragraphs matched
    by ``div.content p``.

    Yields:
        Tuples ``(title, tags, url, msg, src)``.  ``msg`` is the description
        prefixed with the 1-based index of the picture among the pictures
        that are kept; ``src`` is the picture's ``src`` attribute.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text.strip()
    # Build a real list: the same ``tags`` object is yielded with every
    # picture, and a lazy ``map`` iterator (Python 3) would be exhausted
    # after the first consumer walked it.
    tags = [esc_tag(part) for part in title.split(' - ')]

    def exclude(href):
        """Return True when *href* contains one of the skipped query keys."""
        return any(key in href for key in ('attachment_id=', 'tag=', 'cat='))

    def extract_desc():
        """Join each paragraph's ``.text`` and its kept link targets by newline."""
        desc = []
        for p in sel('div.content p')(dom):
            if p.text:
                desc.append(p.text)
            for a in sel('a')(p):
                # ``get`` instead of ``attrib[...]``: an anchor without an
                # href must be skipped, not abort the scrape with KeyError.
                href = a.get('href')
                if href and not exclude(href):
                    desc.append(href)
        stripped = (piece.strip() for piece in desc)
        # Pieces that are empty after stripping are dropped.
        return '\n'.join(piece for piece in stripped if piece)

    desc = extract_desc()
    prefix = 'http://www.padmag.cn/wp-content/'
    # An <img> without a src yields '' here and is filtered out with the
    # other pictures that do not live under the expected prefix.
    sources = (img.get('src', '') for img in sel('div.content img')(dom))
    kept = (src for src in sources if src.startswith(prefix))
    for i, src in enumerate(kept, 1):
        msg = '(%d): %s' % (i, desc)
        yield title, tags, url, msg, src
Пример #2
0
def leica(url):
    """Yield one record per ``p img.insertimage`` picture on the page at *url*.

    The stripped ``<title>`` text is cut right after the first ``』``; both
    halves plus the fixed string ``Leica中文摄影杂志`` are passed through
    ``esc_tag`` to build the tags.

    Yields:
        Tuples ``(title, tags, url, msg, src)``.  ``msg`` is the title
        prefixed with the 1-based picture index in square brackets; ``src``
        is the picture's ``src`` attribute.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text.strip()
    # find() returns -1 when the bracket is absent, so the cut point becomes
    # 0: the first half is empty and the second half is the whole title.
    cut = title.find(u'』') + 1
    parts = [title[:cut], title[cut:], u'Leica中文摄影杂志']
    # Empty halves are skipped so no blank tag is produced.  A list is built
    # (rather than a lazy ``map``) because the same object is yielded with
    # every picture and a Python 3 iterator would be exhausted after the
    # first consumer.
    tags = [esc_tag(part) for part in parts if part]

    for number, img in enumerate(sel('p img.insertimage')(dom), 1):
        src = img.attrib['src']
        msg = '[%d]%s' % (number, title)
        yield title, tags, url, msg, src
Пример #3
0
    def extract_desc():
        """Build a newline-joined description from the ``div.content`` paragraphs.

        For every paragraph, its ``.text`` (when non-empty) is collected,
        followed by the href of each contained anchor unless the href
        contains ``attachment_id=``, ``tag=`` or ``cat=``.  Every collected
        piece is stripped and pieces that end up empty are dropped.

        Reads ``dom`` and ``sel`` from the enclosing scope.
        """
        skipped_keys = ('attachment_id=', 'tag=', 'cat=')
        desc = []

        for p in sel('div.content p')(dom):
            if p.text:
                desc.append(p.text)
            for a in sel('a')(p):
                # ``get`` instead of ``attrib[...]``: an anchor without an
                # href must be skipped, not raise KeyError.
                href = a.get('href')
                if href and not any(key in href for key in skipped_keys):
                    desc.append(href)

        stripped = (piece.strip() for piece in desc)
        return '\n'.join(piece for piece in stripped if piece)
Пример #4
0
def wsj(url):
    """Yield one record per ``#sliderBox li`` slide on the page at *url*.

    The ``<title>`` text is split on ``:``, ``-`` and ``_`` and each part is
    passed through ``esc_tag`` to build the tags.

    Yields:
        Tuples ``(title, tags, url, msg, img_url)``.  ``msg`` is
        ``'(index/total) caption'`` with a 1-based index, the caption being
        the text of the slide's first ``<p>``.  ``img_url`` is the slide's
        first ``<img>`` src with every ``..`` segment removed, appended to
        ``http://cn.wsj.com/``.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text
    # Normalise all three separators to '|' so a single split is enough.
    unified = title.replace(u':', '|').replace('-', '|').replace('_', '|')
    # A list rather than a lazy ``map``: the same object is yielded with
    # every slide and a Python 3 iterator would be exhausted after the first
    # consumer.
    tags = [esc_tag(part) for part in unified.split('|')]

    items = sel('#sliderBox li')(dom)
    sz = len(items)
    for number, li in enumerate(items, 1):
        src = sel('img')(li)[0].attrib['src']
        # Drop the '..' path segments before rooting the path at the site.
        src = '/'.join(seg for seg in src.split('/') if seg != '..')
        img_url = 'http://cn.wsj.com/%s' % src

        # A slide without a <p>, or with a <p> whose text is None, gets an
        # empty caption instead of an IndexError or the literal text 'None'.
        paragraphs = sel('p')(li)
        caption = paragraphs[0].text if paragraphs else None
        msg = '(%d/%d) %s' % (number, sz, caption or '')

        yield title, tags, url, msg, img_url