Exemplo n.º 1
0
def get_page_url():
    """Store index-page URLs that are not yet recorded in the database.

    Downloads ``config.url``, reads the total page count from the
    ``pt_mulu`` pager link and inserts every page numbered above
    ``db.getStoryPageIndexCount()`` through ``db.inertStoryPageIndex``.

    The download is retried until it succeeds, pausing briefly between
    attempts. If the pager link is missing from the response, an error is
    logged and nothing is inserted.
    """
    import time  # local import: only needed for the pause between retries

    retry_pause = 1  # seconds to wait before retrying a failed request

    url = config.url
    db = MySQL()
    # NOTE(review): kept from the original. This assignment does not appear
    # to change how requests.get() retries -- confirm, then remove.
    requests.adapters.DEFAULT_RETRIES = 5

    # The original also created a requests.session() here that was never
    # used for the request and never closed; it has been dropped.
    while True:
        try:
            res = requests.get(url, headers=user_Agent())
            break
        except Exception as e:
            logging.error(e)
            # Avoid hammering the server (and the log) in a tight loop.
            time.sleep(retry_pause)

    found = re.findall(r'<a id="pt_mulu">\d+/(\d+)</a>', res.text)
    if not found:
        # Previously this surfaced as a bare IndexError from ``[0]``.
        logging.error("pt_mulu pager not found in response from " + str(url))
        return
    logging.info(found[0])
    max_index = int(found[0])

    already_index_count = db.getStoryPageIndexCount()
    if already_index_count >= max_index:
        logging.info("当前总页数未更新")
        return

    for i in range(already_index_count + 1, max_index + 1):
        # The first index page has no numeric suffix.
        if i == 1:
            page_url = "http://m.xsqishu.com/newbook/index.html"
        else:
            page_url = "http://m.xsqishu.com/newbook/index_" + str(i) + ".html"
        db.inertStoryPageIndex(i, page_url)
        logging.info("新增第" + str(i) + "页")
Exemplo n.º 2
0
def get_story_urls(urls):
    """Collect story ids from index pages and register their download URLs.

    For every index page in *urls* the ``/txt/<id>.html`` links are
    extracted. When ``config.DOWNLOADNUM`` is falsy all ids of a page are
    taken, otherwise only the first ``config.DOWNLOADNUM`` ids per page.
    Each story page is then fetched (decoded as GBK), its title and
    ``/book/...`` download link are parsed, and the story is inserted with
    ``db.inertStoryUrl`` unless ``db.isExistStory`` reports it as present.

    Args:
        urls: iterable of index-page URLs.

    Returns:
        List of the story ids found on the index pages, in page order.
        Ids are included even when their story page could not be parsed.
    """
    import time  # local import: only needed for the pause between retries

    retry_pause = 1  # seconds to wait before retrying a failed request

    def _fetch(target, encoding=None):
        """Download *target*, retrying until the request succeeds."""
        while True:
            try:
                res = requests.get(target, headers=user_Agent())
                if encoding is not None:
                    res.encoding = encoding
                return res
            except Exception as e:
                logging.error(e)
                # Avoid a tight retry loop on persistent failures.
                time.sleep(retry_pause)

    downloadnum = config.DOWNLOADNUM
    db = MySQL()
    # NOTE(review): kept from the original. This assignment does not appear
    # to change how requests.get() retries -- confirm, then remove.
    requests.adapters.DEFAULT_RETRIES = 5

    # Patterns are loop-invariant, so compile them once.
    url_reg = re.compile(r'<a href="/txt/(\d+).html">')
    book_reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
    story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')

    stroy_urls = {}
    downloadstoryno = []
    for index_url in urls:
        page_ids = url_reg.findall(_fetch(index_url).text)
        # The original overwrote ``downloadnum`` with the first page's match
        # count, which silently capped every later page in "download all"
        # mode. Decide per page without mutating the setting.
        wanted = page_ids[0:downloadnum] if downloadnum else page_ids
        for story_id in wanted:
            stroy_urls[story_id] = "http://m.xsqishu.com/txt/" + story_id + ".html"
            downloadstoryno.append(story_id)

    for num, story_url in stroy_urls.items():
        page = _fetch(story_url, encoding="gbk").text
        book_ids = book_reg.findall(page)
        titles = story_title_reg.findall(page)
        if not book_ids or not titles:
            # One unparsable page must not abort the remaining stories.
            logging.error("could not parse story page " + story_url)
            continue
        title = titles[0]
        download_url = "http://m.xsqishu.com/book/" + book_ids[0] + ".html"
        if db.isExistStory(num):
            logging.info("小说---" + title + "---已入库")
        else:
            db.inertStoryUrl(num, title, download_url)
    return downloadstoryno
Exemplo n.º 3
0
def get_index_url(indexnum, indexurl):
    """Download or extend the stored list of index-page URLs.

    Fetches *indexurl*, reads the total page count from the ``pt_mulu``
    pager link and stores pages ``indexnum + 1`` .. total through
    ``db.inertStoryPageIndex``.

    Args:
        indexnum: page number to continue after; ``0`` is logged as a fresh
            download, any other value as an update. Presumably the number
            of pages already stored -- verify against the caller.
        indexurl: URL of a page that contains the ``pt_mulu`` pager.

    The download is retried until it succeeds, pausing briefly between
    attempts. If the pager link is missing, an error is logged and nothing
    is stored.
    """
    import time  # local import: only needed for the pause between retries

    retry_pause = 1  # seconds to wait before retrying a failed request

    db = MySQL()
    # NOTE(review): kept from the original. This assignment does not appear
    # to change how requests.get() retries -- confirm, then remove.
    requests.adapters.DEFAULT_RETRIES = 5

    # The original also created a requests.session() here that was never
    # used for the request and never closed; it has been dropped.
    while True:
        try:
            res = requests.get(indexurl, headers=user_Agent())
            break
        except Exception as e:
            logging.error(e)
            # Avoid a tight retry loop on persistent failures.
            time.sleep(retry_pause)

    found = re.findall(r'<a id="pt_mulu">\d+/(\d+)</a>', res.text)
    if not found:
        # Previously this surfaced as a bare IndexError from ``[0]``.
        logging.error("pt_mulu pager not found in response from " + str(indexurl))
        return
    max_index = int(found[0])

    if indexnum == 0:
        logging.info("---索引下载中,请等待---")
        prefix = "下载第"
    elif indexnum == max_index:
        logging.info("----当前已是最新索引,无需更新----")
        return
    else:
        logging.info("----索引更新中,请等待----")
        prefix = "更新第"

    for i in range(indexnum + 1, max_index + 1):
        # The first index page has no numeric suffix.
        if i == 1:
            page_url = "http://m.xsqishu.com/newbook/index.html"
        else:
            page_url = "http://m.xsqishu.com/newbook/index_" + str(i) + ".html"
        db.inertStoryPageIndex(i, page_url)
        logging.info(prefix + str(i) + "页")
Exemplo n.º 4
0
# Author:jiang
# 2020/10/27 14:35
downloadnum = 9  # set downloadnum=False for a full download
storynums = 30  # number of stories to download; storynums=False means a full download
indexnum = 2  # download address of the story index (author's note); passed to db.getStoryIndex() in main()
from util.log import logger as logging
from threading import Thread, Lock
import time
# from mysql.storyMysql import getStoryNum, getDownLoadUrl, getStoryTitle,getAllStoryText,getStoryText
from util.getStoryContentUrl import getStoryContentUrl
from util.downLoadStory import downLoadStory
from util.storyWriteTxt import storyWriteTxt
# from mysql.allStoryUrlMysql import getSrotyUrl
from mysql.mySQL import MySQL
from util.getStoryIndex import get_story_urlsnew
# Module-level database handle used by main().
db = MySQL()


def main():
    """Resolve story download URLs when no stories are queued for download.

    Reads the index-page URLs via ``db.getStoryIndex(indexnum)`` and the
    pending story numbers via ``db.getDownLoadSrotyNo(storynums)``. When the
    latter is empty, every index page is passed to ``get_story_urlsnew`` and
    each key/value pair of the returned mapping is handed to
    ``getStoryContentUrl``.
    """
    storyUrls = db.getStoryIndex(indexnum)  # addresses of the stories to download
    storynos = db.getDownLoadSrotyNo(storynums)
    if not storynos:
        logging.info(storyUrls)
        for index_url in storyUrls:
            # Named ``download_urls`` instead of ``dict`` so the builtin is
            # not shadowed.
            download_urls = get_story_urlsnew(index_url)
            for story_no, download_url in download_urls.items():
                getStoryContentUrl(story_no, download_url)
        # threads=[]
        # starttime=time.time()
        # for i in storyUrls:
        #     t=Thread(target=get_story_urlsnew,args=[i])
Exemplo n.º 5
0
def get_story_urlsnew(url):
    """Resolve the download URL of every story linked from one index page.

    Fetches *url*, extracts all ``/txt/<id>.html`` story links, then fetches
    each story page (decoded as GBK) to parse its title and ``/book/...``
    download link. Stories for which ``db.isExistStory`` is false are
    inserted with ``db.inertStoryUrl``.

    Args:
        url: URL of an index page listing stories.

    Returns:
        Dict mapping story id to its download URL. Stories whose page could
        not be parsed are logged and left out.
    """
    import time  # local import: only needed for the pause between retries

    retry_pause = 1  # seconds to wait before retrying a failed request

    def _fetch(target, encoding=None):
        """Download *target*, retrying until the request succeeds."""
        while True:
            try:
                res = requests.get(target, headers=user_Agent())
                if encoding is not None:
                    res.encoding = encoding
                return res
            except Exception as e:
                logging.error(e)
                # Avoid a tight retry loop on persistent failures.
                time.sleep(retry_pause)

    db = MySQL()
    # NOTE(review): kept from the original. This assignment does not appear
    # to change how requests.get() retries -- confirm, then remove.
    requests.adapters.DEFAULT_RETRIES = 5

    # Patterns are loop-invariant, so compile them once.
    url_reg = re.compile(r'<a href="/txt/(\d+).html">')
    book_reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
    story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')

    stroy_urls = {
        story_id: "http://m.xsqishu.com/txt/" + story_id + ".html"
        for story_id in url_reg.findall(_fetch(url).text)
    }
    logging.info(stroy_urls)

    download_urls = {}
    for num, story_url in stroy_urls.items():
        page = _fetch(story_url, encoding="gbk").text
        # Kept in a separate name; the original rebound the ``url`` parameter.
        book_ids = book_reg.findall(page)
        logging.info("-----------")
        logging.info(book_ids)
        titles = story_title_reg.findall(page)
        if not book_ids or not titles:
            # One unparsable page must not abort the remaining stories.
            logging.error("could not parse story page " + story_url)
            continue
        title = titles[0]
        download_url = "http://m.xsqishu.com/book/" + book_ids[0] + ".html"
        download_urls[num] = download_url
        if db.isExistStory(num):
            logging.info("小说" + title + "已入库")
        else:
            db.inertStoryUrl(num, title, download_url)
    return download_urls


# urls=db.getStoryIndex(10)

# starttime=time.time()
# for url in urls:
#     get_story_urlsnew(url)
# endtime=time.time()
# print('Cost {} seconds'.format(endtime-starttime))

# threads=[]
# starttime=time.time()
# for i in urls:
#     t=Thread(target=get_story_urlsnew,args=[i])
#     t.start()
#     threads.append(t)
# for i in threads:
#     t.join()
# endtime=time.time()
# print('Cost {} seconds'.format(endtime-starttime))