示例#1
0
        #        url = bookid + url
        yield url, tools.to_utf8(title, encoding)


def parse_page(title, text, proxy=None):
    """Turn the raw HTML of one page into "title + blank line + body" text.

    title -- fallback title; replaced when r_title matches the page.
    text  -- raw HTML of the page.
    proxy -- accepted but not used by this function.

    Returns the title, two CRLF sequences, then the formatted content
    (an empty string when r_content does not match).
    """
    encoding = tools.get_encoding(r_meta, text)

    # Prefer the title found in the page over the one passed in.
    title_match = r_title.search(text)
    if title_match:
        title = tools.to_utf8(title_match.group(1).strip(), encoding)

    # Default to an empty body when the content pattern is absent.
    body = ''
    content_match = r_content.search(text)
    if content_match:
        body = tools.format_html_text(content_match.group(1), encoding)

    separator = '\r\n' * 2
    return title + separator + body


if __name__ == '__main__':
    text = file('2100book_a.html', 'rb').read()
    print 'title=', tools.to_encode(parse_title(text))
    print 'index='
    i = 0
    for url, title in parse_index(text):
        print tools.to_encode(title), url
        i += 1
    print 'total=', i
    print 'page='
    print tools.to_encode(parse_page('title', file('2100book_b.html').read()))
示例#2
0
    s = []
    for (url, title) in r_index.findall(text):
        title = title.replace(' ', ' ').strip()
        yield url, tools.to_utf8(title, encoding)
        
def parse_page(title, text, proxy=None):
    """Render one HTML page as plain text: title, blank line, content.

    title -- title to use when r_title finds nothing in the page.
    text  -- raw HTML of the page.
    proxy -- accepted but not used by this function.

    Returns title + two CRLF sequences + formatted content; the content
    part is an empty string when r_content does not match.
    """
    encoding = tools.get_encoding(r_meta, text)

    found_title = r_title.search(text)
    if found_title:
        # The page's own title overrides the caller-supplied one.
        stripped = found_title.group(1).strip()
        title = tools.to_utf8(stripped, encoding)

    found_content = r_content.search(text)
    if not found_content:
        body = ''
    else:
        body = tools.format_html_text(found_content.group(1), encoding)

    blank_line = '\r\n' * 2
    return title + blank_line + body

if __name__ == '__main__':
    text = file('bookqq_a.html', 'rb').read()
    print 'title=', tools.to_encode(parse_title(text))
    print 'index='
    i = 0
    for url, title in parse_index(text):
        print tools.to_encode(title), url
        i += 1
    print 'total=', i
    print 'page='
    print tools.to_encode(parse_page('title', file('bookqq_b.html').read()))