Exemplo n.º 1
0
def get_story_urls(urls):
    downloadnum = config.DOWNLOADNUM
    db = MySQL()
    stroy_urls = {}
    downloadstoryno = []
    for url in urls:
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = requests.get(url, headers=user_agent)
                flag = False
                # res.request.headers  获取设置的user_agent
            except Exception as e:
                logging.error(e)
                continue
        url_reg = re.compile(r'<a href="/txt/(\d+).html">')
        allUrl = url_reg.findall(res.text)
        if downloadnum == False:
            downloadnum = len(allUrl)
        for i in allUrl[0:downloadnum]:
            story_url = "http://m.xsqishu.com/txt/" + i + ".html"
            stroy_urls[i] = story_url
            downloadstoryno.append(i)
    for num, compileurl in stroy_urls.items():
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = requests.get(compileurl, headers=user_agent)
                res.encoding = "gbk"
                flag = False
                # res.request.headers  获取设置的user_agent
            except Exception as e:
                logging.error(e)
                continue
        reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
        url = reg.findall(res.text)
        story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')
        title = story_title_reg.findall(res.text)[0]
        download_url = "http://m.xsqishu.com/book/" + url[0] + ".html"
        if db.isExistStory(num):
            msg = "小说---" + title + "---已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)
    return downloadstoryno
Exemplo n.º 2
0
def get_story_urlsnew(url):
    db = MySQL()
    stroy_urls = {}
    download_urls = {}
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.keep_alive = False
    flag = True
    while flag:
        try:
            user_agent = user_Agent()
            res = requests.get(url, headers=user_agent)
            flag = False
            # res.request.headers  获取设置的user_agent
        except Exception as e:
            logging.error(e)
            continue
    url_reg = re.compile(r'<a href="/txt/(\d+).html">')
    allUrl = url_reg.findall(res.text)
    for i in allUrl:
        story_url = "http://m.xsqishu.com/txt/" + i + ".html"
        stroy_urls[i] = story_url
    logging.info(stroy_urls)
    for num, compileurl in stroy_urls.items():
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        flag = True
        while flag:
            try:
                user_agent = user_Agent()
                res = requests.get(compileurl, headers=user_agent)
                res.encoding = "gbk"
                flag = False
                # res.request.headers  获取设置的user_agent
            except Exception as e:
                logging.error(e)
                continue
        reg = re.compile(r'<a href="/book/(.+).html" class="bdbtn greenBtn">')
        url = reg.findall(res.text)
        logging.info("-----------")
        logging.info(url)
        story_title_reg = re.compile(r'<h1 class="title">(.+)</h1>')
        title = story_title_reg.findall(res.text)[0]
        download_url = "http://m.xsqishu.com/book/" + url[0] + ".html"
        download_urls[num] = download_url
        if db.isExistStory(num):
            msg = "小说" + title + "已入库"
            logging.info(msg)
        else:
            db.inertStoryUrl(num, title, download_url)
        # logging.info(download_url)
    return download_urls


# urls=db.getStoryIndex(10)

# starttime=time.time()
# for url in urls:
#     get_story_urlsnew(url)
# endtime=time.time()
# print('Cost {} seconds'.format(endtime-starttime))

# threads=[]
# starttime=time.time()
# for i in urls:
#     t=Thread(target=get_story_urlsnew,args=[i])
#     t.start()
#     threads.append(t)
# for i in threads:
#     t.join()
# endtime=time.time()
# print('Cost {} seconds'.format(endtime-starttime))