Example 1
All examples below assume the project's shared imports: requests, re, logging, plus its local config, MySQL and user_Agent helpers.
def get_story_urls(urls):
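    """Scan each index page for story IDs, record each new story's title and
    download URL in MySQL, and return the list of story IDs found."""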
    downloadnum = config.DOWNLOADNUM
    db = MySQL()
    story_urls = {}
    downloadstoryno = []
    for url in urls:
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(url, headers=user_agent)  # go through the session created above
                flag = False
                # res.request.headers shows the User-Agent that was sent
            except Exception as e:
                logging.error(e)
                continue
        url_reg = re.compile(r'<a href="/txt/(\d+).html">')
        allUrl = url_reg.findall(res.text)
        if not downloadnum:  # DOWNLOADNUM of 0/False means "take every story"
            downloadnum = len(allUrl)
        for i in allUrl[0:downloadnum]:
            story_url = "http://m.xsqishu.com/txt/" + i + ".html"
            story_urls[i] = story_url
            downloadstoryno.append(i)
    for num, compileurl in story_urls.items():
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(compileurl, headers=user_agent)
                res.encoding = "gbk"  # the story pages are GBK-encoded
                flag = False
                # res.request.headers shows the User-Agent that was sent
            except Exception as e:
                logging.error(e)
                continue
        reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
        url = reg.findall(res.text)
        story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')
        title = story_title_reg.findall(res.text)[0]
        download_url = "http://m.xsqishu.com/book/" + url[0] + ".html"
        if db.isExistStory(num):
            msg = "小说---" + title + "---已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)
    return downloadstoryno
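
A note on the retry pattern: every example here repeats the same fetch-and-retry loop. A minimal sketch of how it could be factored into one helper is below (the name fetch_with_retry and its parameters are hypothetical; user_Agent() is the project's own header helper, assumed to be in scope):

import logging
import time

import requests


def fetch_with_retry(session, url, encoding=None, max_retries=5, delay=1.0):
    """GET a URL with a fresh User-Agent header, retrying on request errors."""
    for _ in range(max_retries):
        try:
            # user_Agent() is the project's header helper (assumed available)
            res = session.get(url, headers=user_Agent(), timeout=10)
            if encoding:
                res.encoding = encoding  # e.g. "gbk" for the story pages
            return res
        except requests.RequestException as e:
            logging.error(e)
            time.sleep(delay)  # brief back-off before the next attempt
    raise RuntimeError("giving up on " + url)

Unlike the bare while-flag loops in these examples, this bounds the number of attempts instead of retrying forever.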
Example 2
def downLoadStory(storyno, urls):
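    """Download every chapter page of story `storyno`, strip the HTML tags,
    and save the cleaned text to MySQL."""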
    db = MySQL()
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    story_text = {}
    for url in urls:
        logging.info(url)
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(url, headers=user_agent)
                flag = False
                # print(res.headers["User-Agent"])
                # logging.info(res.headers["User-Agent"])
            except Exception as e:
                logging.info("- - 连接失败,正在重连- ")
                logging.error(e)
                continue

        text_reg = re.compile(
            r'<div class="articlecon font-large"><p>(.+)<br/><br/></p></div>')
        result = text_reg.findall(res.text)
        new_result = result[0].replace("<br/>", "")
        new_result = new_result.lstrip()
        new_result = re.sub(' +', '\n  ', new_result)  # runs of spaces become paragraph breaks
        db.insertStory(url, new_result, storyno)
    return story_text  # note: never populated, so callers receive an empty dict
Example 3
def get_page_url():
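    """Read the site's total index page count and insert any index pages
    that are not yet recorded in MySQL."""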
    url = config.url
    db = MySQL()
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(url, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    max_index_reg = re.compile(r'<a id="pt_mulu">\d+/(\d+)</a>')
    max_index = max_index_reg.findall(res.text)[0]
    logging.info(max_index)
    already_index_count = db.getStoryPageIndexCount()
    if already_index_count < int(max_index):
        for i in range(already_index_count + 1, int(max_index) + 1):
            if i == 1:
                page_url = "http://m.xsqishu.com/newbook/index.html"
            else:
                page_url = "http://m.xsqishu.com/newbook/index_" + str(
                    i) + ".html"
            db.inertStoryPageIndex(i, page_url)
            msg = "新增第" + str(i) + "页"
            logging.info(msg)
    else:
        logging.info("当前总页数未更新")
Example 4
def getStoryContentUrl(storyno, url):
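    """Collect the chapter URLs of story `storyno`, store each in MySQL with a
    zero-padded chapter number, and return the list of chapter URLs."""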
    storyContentNum = config.STORYNUM
    db = MySQL()
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    # logging.info(url)
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(url, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    reg = re.compile(r'http://m.xsqishu.com(.+).html')
    identical = reg.findall(url)[0]  # path prefix shared by all chapters of this story
    storyurlreg = re.compile(r'<a href=(%s/\d+).html><li>' %
                             (identical))  # regex matching this story's chapter URLs
    storyUrls = storyurlreg.findall(res.text)
    newstoryUrls = []
    if not storyContentNum:  # STORYNUM of 0/False means "take every chapter"
        storyContentNum = len(storyUrls)
    for i in storyUrls[0:storyContentNum]:
        reg = re.compile(r'%s/(\d+)' % (identical))
        chapter_num = reg.findall(i)[0]
        url = "http://m.xsqishu.com" + i + ".html"
        new_chapter_num = storyno + str(chapter_num.zfill(5))
        newstoryUrls.append(url)
        print(storyno, new_chapter_num, url)
        db.insetStoryContentUrl(storyno, new_chapter_num, url)
    return newstoryUrls
Example 5
def get_story_url(urls):
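    """Scan the given index pages and return a {title: download URL} mapping
    for every story found (no database writes)."""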
    story_urls = []
    download_urls = {}
    for url in urls:
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(url, headers=user_agent)
                flag = False
                # res.request.headers shows the User-Agent that was sent
            except Exception as e:
                logging.error(e)
                continue
        url_reg = re.compile(r'<a href="/txt/(\d+).html">')
        allUrl = url_reg.findall(res.text)
        for i in allUrl:
            story_url = "http://m.xsqishu.com/txt/" + i + ".html"
            story_urls.append(story_url)
    for compileurl in story_urls:
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(compileurl, headers=user_agent)
                res.encoding = "gbk"  # the story pages are GBK-encoded
                flag = False
                # res.request.headers shows the User-Agent that was sent
            except Exception as e:
                logging.error(e)
                continue
        reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
        url = reg.findall(res.text)
        story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')
        title = story_title_reg.findall(res.text)[0]
        download_url = "http://m.xsqishu.com/book/" + url[0] + ".html"
        download_urls[title] = download_url
        msg = title + ":" + download_url
        logging.info(msg)
    return download_urls
Example 6
def get_url(url):
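    """Return the chapter paths scraped from one hard-coded book's directory page."""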
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(url, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    url_reg = re.compile(r'<a href=(/book/22/69708/\d+).html><li>')
    allUrl = url_reg.findall(res.text)
    return allUrl
Example 7
def get_index_url(indexnum, indexurl):
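    """Compare the locally stored index page count (`indexnum`) with the site's
    current maximum and insert any missing index page URLs into MySQL."""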
    db = MySQL()
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(indexurl, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    max_index_reg = re.compile(r'<a id="pt_mulu">\d+/(\d+)</a>')
    max_index = max_index_reg.findall(res.text)[0]
    if indexnum == 0:
        logging.info("---索引下载中,请等待---")
        for i in range(1, int(max_index) + 1):
            if i == 1:
                page_url = "http://m.xsqishu.com/newbook/index.html"
            else:
                page_url = "http://m.xsqishu.com/newbook/index_" + str(
                    i) + ".html"
            db.inertStoryPageIndex(i, page_url)
            msg = "下载第" + str(i) + "页"
            logging.info(msg)
    elif indexnum == int(max_index):
        logging.info("----当前已是最新索引,无需更新----")
    else:
        logging.info("----索引更新中,请等待----")
        for i in range(indexnum + 1, int(max_index) + 1):
            page_url = "http://m.xsqishu.com/newbook/index_" + str(i) + ".html"
            db.inertStoryPageIndex(i, page_url)
            msg = "更新第" + str(i) + "页"
            logging.info(msg)
Example 8
def get_index_url(url):
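    """Build and return the full list of index page URLs, based on the page
    count shown on the first index page."""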
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(url, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    max_index_reg = re.compile(r'<a id="pt_mulu">\d+/(\d+)</a>')
    max_index = max_index_reg.findall(res.text)[0]
    page_urls = []
    for i in range(1, int(max_index) + 1):  # include the last index page
        if i == 1:
            page_url = "http://m.xsqishu.com/newbook/index.html"
        else:
            page_url = "http://m.xsqishu.com/newbook/index_" + str(i) + ".html"
        page_urls.append(page_url)
    return page_urls
Example 9
def get_story_urlsnew(url):
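    """Process one index page: store each new story's title and download URL in
    MySQL and return a {story ID: download URL} mapping."""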
    db = MySQL()
    story_urls = {}
    download_urls = {}
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = s.get(url, headers=user_agent)
            flag = False
            # res.request.headers shows the User-Agent that was sent
        except Exception as e:
            logging.error(e)
            continue
    url_reg = re.compile(r'<a href="/txt/(\d+).html">')
    allUrl = url_reg.findall(res.text)
    for i in allUrl:
        story_url = "http://m.xsqishu.com/txt/" + i + ".html"
        story_urls[i] = story_url
    logging.info(story_urls)
    for num, compileurl in story_urls.items():
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = s.get(compileurl, headers=user_agent)
                res.encoding = "gbk"  # the story pages are GBK-encoded
                flag = False
                # res.request.headers shows the User-Agent that was sent
            except Exception as e:
                logging.error(e)
                continue
        reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
        url = reg.findall(res.text)
        logging.info("-----------")
        logging.info(url)
        story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')
        title = story_title_reg.findall(res.text)[0]
        download_url = "http://m.xsqishu.com/book/" + url[0] + ".html"
        download_urls[num] = download_url
        if db.isExistStory(num):
            msg = "小说" + title + "已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)
        # logging.info(download_url)
    return download_urls


# Usage sketch: fetch the first ten index pages, first sequentially, then with
# one thread per page (assumes `from threading import Thread` and `import time`).

# db = MySQL()
# urls = db.getStoryIndex(10)

# starttime = time.time()
# for url in urls:
#     get_story_urlsnew(url)
# endtime = time.time()
# print('Cost {} seconds'.format(endtime - starttime))

# threads = []
# starttime = time.time()
# for url in urls:
#     t = Thread(target=get_story_urlsnew, args=[url])
#     t.start()
#     threads.append(t)
# for t in threads:
#     t.join()  # wait for every worker to finish
# endtime = time.time()
# print('Cost {} seconds'.format(endtime - starttime))