Пример #1
0
def getNextPage(html):
    """Return the next-page address found in ``html``, or None if absent.

    The "next page" link pattern is handed to ``parse`` together with
    ``html``; the first element of the result is returned when the result
    is non-empty.
    """
    pattern = '<div class="wp-pagenavi">.*<a href="(.*?)">下一页</a>'
    matches = parse(html, pattern)
    # An empty result means no next-page link was found.
    return matches[0] if len(matches) > 0 else None
# Directory prefix for output files: whatever getBaseDir() returns plus a
# "\txt\" suffix (backslash separators are hard-coded here).
BASE_DIR = getBaseDir() + "\\txt\\"


def getAuthor(html):
    """Return the author text captured from ``html``.

    The pattern looks for an ``<h1>`` element followed by a ``<p>`` element
    and captures the text after the first colon inside that paragraph.
    Matching uses DOTALL so the elements may be separated by newlines.

    Raises:
        IndexError: if the pattern does not match anywhere in ``html``.
    """
    pattern = '<h1>.*?</h1>.*?<p>.*?:(.*?)</p>'
    matches = re.findall(pattern, html, flags=re.DOTALL)
    # Only the first match is of interest.
    return matches[0]


# Index page of the book to scrape; `prefix` holds the same URL string.
book_url = "https://www.dhzw.org/book/13/13766/"
prefix = 'https://www.dhzw.org/book/13/13766/'
html = getHtmlByUrl(book_url)

# Extract the book name: the pattern captures the content of an <h1> element.
book_name_regex = '<h1>(.*?)</h1>'
result = parse(html, book_name_regex)
# NOTE(review): isNull is defined outside this view; presumably it guards
# against an empty result before indexing — confirm.
isNull(result)
book_name = result[0]
print(book_name)

# Output paths under BASE_DIR: one for the book text, one for its catalog.
book_txt = BASE_DIR + book_name + '.txt'
book_catalog_txt = BASE_DIR + book_name + "_catalog.txt"
# deleteFile(book_txt)
# deleteFile(book_catalog_txt)

# Narrow the page to the <dl> element first, then apply the per-entry
# pattern, which captures each <dd> link's href and link text.
catalog_all_regex = '<dl>(.*?)</dl>'
result = parse(html, catalog_all_regex)
isNull(result)
catalogs_regex = '<dd><a href="(.*?)" .*?>(.*?)</a></dd>'
result = parse(result[0], catalogs_regex)
isNull(result)
Пример #3
0
def formattContent(content):
    """Return ``content`` with ``&nbsp;`` entities and ``<br />`` tags removed.

    The entity is stripped first and the tag second, so a tag that only
    appears after the entity is removed is stripped as well.
    """
    return content.replace('&nbsp;', '').replace('<br />', '')

# Reader page of the book to scrape, and the site root kept in `prefix`.
book_url = "http://www.123xiaoqiang.me/modules/article/reader.php?aid=15767"
prefix = 'http://www.123xiaoqiang.me'

BASE_DIR = getBaseDir()

html = getHtmlByUrl(book_url)

# Get the book title
title_regex = '<h1>(.*?)</h1>'
title_result = parse(html, title_regex)
# NOTE(review): isNull is defined outside this view; presumably it guards
# against an empty result before indexing — confirm.
isNull(title_result)
title = title_result[0]

# Get the author
author_regex = '<span>作者:(.*?)</span>'
author_result = parse(html, author_regex)
isNull(author_result)
author = '作者:' + author_result[0]

# Get the catalog
catalog_div_regex = '<div class="liebiao">(.*?)</div>'
catalog_div_result = parse(html, catalog_div_regex)
isNull(catalog_div_result)
# From here on `html` is rebound to the first catalog <div> result only.
html = catalog_div_result[0]
# Per-entry pattern: captures each <li> link's href and link text.
catalog_regex = '<li><a href="(.*?)">(.*?)</a></li>'
Пример #4
0
def getPicUrl(html):
    """Return the image addresses that ``parse`` finds in the page ``html``.

    The pattern captures the ``src`` of ``<img>`` tags that sit inside a
    ``<p>`` and carry a class starting with ``alignnone``. The result of
    ``parse`` is returned unchanged.
    """
    img_pattern = '<p><img src="(.*?)".*?class="alignnone.*?" /></p>'
    return parse(html, img_pattern)
Пример #5
0
def getPicNum(html):
    """Return the picture count taken from the page title in ``html``.

    The pattern captures the digits of a ``[<digits>P]`` marker inside the
    ``<title>`` element; the first element of what ``parse`` returns is
    handed back as-is (no numeric conversion is done here).
    """
    count_pattern = '<title>.*\\[(\\d+)P\\].*</title>'
    counts = parse(html, count_pattern)
    return counts[0]
Пример #6
0
def getName(html):
    """Return the comic name taken from the page ``html``.

    The pattern captures the content of the ``<h1 class="entry-title">``
    element; the first element of what ``parse`` returns is handed back.
    """
    title_pattern = '<h1 class="entry-title">(.*?)</h1>'
    titles = parse(html, title_pattern)
    return titles[0]