Example #1
 def download_catalog_biqukan(self, names):
     logger_spider.debug("download_catalog_biqukan names={}".format(names))
     # Result dict to return
     return_result = {
         "source_name": "笔趣看",
         "source_url": "https://www.biqukan.com/",
         "source_img_url": "https://www.biqukan.com/images/logo.png"
     }
     home_url = "https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqukan.com&q="
     if isinstance(names, str):
         download_url = home_url + names
     elif isinstance(names, (tuple, list)):
         download_url = home_url + "+".join(names)
     else:
         logger_spider.error(
             "download_catalog_biqukan names格式错误 names={}".format(names))
         return_result["status"] = ERROR
         return_result["information"] = "500错误"
         return return_result
     # Download the page
     response = self.download_html(download_url)
     if response is False:
         logger_spider.error(
             "download_catalog_biqukan 搜索下载目录失败 url={}".format(
                 download_url))
         return_result["status"] = ERROR
         return_result["information"] = "服务器爬虫请求失败"
         return return_result
     content = response.text
     html = BeautifulSoup(content, "lxml")
     # Skip the first row (the header)
     li = html.select(".search-list ul li")[1:]
     # Extract the content from each row
     result = []
     for line in li:
         span = line.find_all("span")
         item = {}
         if len(span) > 1:
             item["name"] = str(span[1].string).strip()
             item["download_url"] = span[1].a.attrs["href"]
             # Originally this crawled the detail page for more information, but that made things too slow, so it was dropped
             # try:
             #   response_novel = self.download_html(item["download_url"])
             #   if response_novel == False:
             #     raise Exception("下载小说详细页面失败 url={}".format(item["download_url"]))
             #   novel_html = BeautifulSoup(response_novel.content.decode("gbk","replace"),"lxml")
             #   img_url = "https://www.biqukan.com" + novel_html.select_one(".cover img").attrs["src"]
             #   item["imageList"] = [img_url]
             #   introduction = [x for x in novel_html.select_one(".intro").stripped_strings]
             #   item["introduction"] = ["简介:" + str(introduction[1])]
             # except KeyError:
             #   logger_spider.exception("download_catalog_biqukan book_name={} 没有href".format(item["name"]))
             # except:
             #   logger_spider.exception("download_catalog_biqukan response_novel_error")
         if len(span) > 2:
             item["introduction"] = ["作者:" + str(span[2].string).strip()]
             # Commented out for the same reason as above
             # if item.get("introduction"):
             #   item["introduction"] = ["作者:" + str(span[2].string).strip()] + item["introduction"]
             # else:
             #   item["introduction"] = ["作者:" + str(span[2].string).strip()]
         if item:
             result.append(item)
     # Total number of items
     return_result["content"] = result
     return_result["length"] = len(return_result["content"])
     return_result["status"] = SUCCESS
     return return_result
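
A minimal usage sketch for Example #1, assuming these methods live on a Downloader class (Example #3 shows its __init__) and that SUCCESS is the module-level status constant the function sets; both names come from context, not from the source itself:

 # Hypothetical caller of download_catalog_biqukan.
 downloader = Downloader()
 catalog = downloader.download_catalog_biqukan("example title")
 if catalog["status"] == SUCCESS:
     for book in catalog["content"]:
         print(book["name"], book["download_url"])
 else:
     print(catalog["information"])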
Example #2
    def download_novel_biqukan(self, url):
        logger_spider.debug("download_novel_biqukan url={}".format(url))
        response = self.download_html(url)
        if response is False:
            # Use error(), not exception(): there is no active exception here
            logger_spider.error(
                "download_novel_biqukan 获取小说详细页面失败 url={}".format(url))
            return {"status": False, "information": "获取小说详细页面失败"}
        html = BeautifulSoup(response.content.decode("gbk", "replace"), "lxml")
        # Name
        novel_name = "biqukan小说.txt"
        try:
            # The "or novel_name" fallback was dead code (the left side is
            # always truthy); the except below already keeps the default.
            novel_name = str(html.select_one(
                ".info h2").string).strip() + ".txt"
        except Exception:
            logger_spider.exception(
                "download_novel_biqukan url={} name获取失败".format(url))
        # First fetch some of the novel's detail info
        redis_item = {}
        detail = {}
        detail["imageList"] = [
            "https://www.biqukan.com" +
            html.select_one(".cover img").attrs["src"]
        ]
        introduction = [x for x in html.select_one(".intro").stripped_strings]
        detail["introduction"] = ["简介:" + str(introduction[1])]
        redis_item["detail"] = detail
        self.connect.hset(download_process, url, json.dumps(redis_item))
        # Download the content
        block = html.select_one(".listmain")
        dt = block.find_all("dt")
        if len(dt) < 1:
            logger_spider.error(
                "download_novel_biqukan block={} 未找到dt".format(block))
            return {"status": False, "information": "页面解析失败"}
        dt = dt[-1]
        download_list = []
        for x in dt.next_siblings:
            if x.name == "dd":
                item = {}
                try:
                    item["content"] = str(x.string.strip())
                    item["url"] = "http://www.biqukan.com" + x.a.attrs["href"]
                except Exception:
                    logger_spider.exception(
                        "download_novel_biqukan dd={}".format(x))
                if item:
                    download_list.append(item)

        novel_contents = ""
        length = len(download_list)
        # Download chapter by chapter
        for index in range(length):
            download_info = download_list[index]
            logger_spider.debug("当前下载:{} 主页={} 章节={}".format(
                download_info["content"], url, download_info["url"]))
            # Every 30 chapters, write the progress back to Redis
            if index % 30 == 0:
                redis_item = self.connect.hget(download_process, url)
                if redis_item is None:
                    redis_item = {}
                else:
                    redis_item = json.loads(redis_item)
                # Approximate remaining wait time, assuming 0.3s per chapter, as an integer
                redis_item["percent"] = int((length - index) * 0.3)
                redis_item["progress"] = "{}/{}".format(index, length)
                self.connect.hset(download_process, url,
                                  json.dumps(redis_item))
                # self.connect.hset(download_process,url,"{}/{}".format(index,length))
                # self.connect.set(url,"{}/{}".format(index,length))
            response = self.download_html(download_info["url"])
            if response is False:
                logger_spider.error(
                    "download_novel_biqukan 下载章节失败 url={}".format(
                        download_info["url"]))
                # The download failed, so mark it in the plain text
                novel_contents += "{}\n{}\n{}\n{}\n\n".format(
                    "#" * 20, download_info["content"], "因为不可控因素,该章节下载失败,敬请谅解",
                    "#" * 20)
            else:
                try:
                    html = BeautifulSoup(
                        response.content.decode("gbk", "replace"), "lxml")
                    content_div = html.select_one("#content")
                    scripts = content_div.find_all("script")
                    # Clear the contents of script tags
                    for script in scripts:
                        script.clear()
                    contents = "\n".join(
                        [x for x in content_div.stripped_strings])
                    novel_contents = novel_contents + download_info[
                        "content"] + "\n" + contents + "\n\n"
                except Exception:
                    logger_spider.exception(
                        "download_novel_biqukan 使用beautifulsoup提取html信息失败")
                    novel_contents += "{}\n{}\n{}\n{}\n\n".format(
                        "#" * 20, download_info["content"],
                        "异常因素,该章节下载失败,请联系[email protected]", "#" * 20)
        # Finally, read back the detail info, then delete the url key
        redis_item = self.connect.hget(download_process, url)
        self.connect.hdel(download_process, url)
        # self.connect.delete(url)
        return_result = {
            "status": True,
            "content": novel_contents,
            "name": novel_name
        }
        if redis_item is not None:
            redis_item = json.loads(redis_item)
            if redis_item.get("detail"):
                return_result["detail"] = redis_item["detail"]
        return return_result
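
A minimal sketch of how a caller might read the progress that Example #2 writes to Redis, assuming the same download_process hash key and redis_connect helper seen in the source; the reader itself is illustrative:

    # Hypothetical progress reader for download_novel_biqukan.
    connect = redis_connect.getConnect()
    raw = connect.hget(download_process, url)
    if raw is not None:
        progress = json.loads(raw)
        # progress may hold e.g. {"detail": {...}, "percent": 42, "progress": "60/480"}
        print(progress.get("progress"), progress.get("percent"))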
Example #3
 def __init__(self):
     logger_spider.debug("Downloader")
     self.s = requests.Session()
     self.s.headers.update(self.__headers)
     self.connect = redis_connect.getConnect()
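
Every example calls self.download_html(...) and treats a False return as failure, but the helper itself is not shown. Here is a minimal sketch of the contract it must satisfy, assuming it wraps the requests.Session created above; the timeout value is an assumption:

 def download_html(self, url, method="GET", **kwargs):
     # Sketch only: return a requests.Response on success and False on
     # failure, which is the contract the other examples rely on.
     try:
         response = self.s.request(method, url, timeout=10, **kwargs)
         response.raise_for_status()
         return response
     except requests.RequestException:
         logger_spider.exception("download_html failed url={}".format(url))
         return False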
Example #4
 def download_catalog_linovelib(self, names=None, url=None):
     logger_spider.debug(
         "download_catalog_linovelib names={} url={}".format(names, url))
     # Result dict to return
     return_result = {
         "source_name": "哔哩轻小说",
         "source_url": "https://www.linovelib.com/",
         "source_img_url": "https://www.linovelib.com/images/logo.png"
     }
     home_url = "https://www.linovelib.com/s/"
     # This is the first search
     if url is None:
         post_data = {"searchtype": "all"}
         if isinstance(names, str):
             post_data["searchkey"] = names
         elif isinstance(names, (tuple, list)):
             post_data["searchkey"] = " ".join(names)
         else:
             logger_spider.error(
                 "download_catalog_linovelib names格式错误 names={}".format(
                     names))
             return_result["status"] = ERROR
             return_result["information"] = "500错误"
             return return_result
         request_params = {
             "url": home_url,
             "method": "POST",
             "data": post_data
         }
     # Not the first search; this is one of the subsequent pages
     else:
         request_params = {"url": url, "method": "GET"}
     # Download the page
     response = self.download_html(**request_params)
     if response is False:
         logger_spider.error(
             "download_catalog_linovelib 搜索下载目录失败 request_params={}".format(
                 request_params))
         return_result["status"] = ERROR
         return_result["information"] = "服务器爬虫请求失败"
         return return_result
     content = response.text
     html = BeautifulSoup(content, "lxml")
     # No redirect happened, so the search returned more than one result
     if not response.history:
         lis = html.select(".search-result-list")
         # Extract the content
         result = []
         for li in lis:
             item = {}
             try:
                 item["download_url"] = (
                     "https://www.linovelib.com" +
                     li.select_one(".imgbox a").attrs["href"])
                 item["download_url"] = ".".join(
                     item["download_url"].split(".")[:-1]) + "/catalog"
                 item["imageList"] = [
                     li.select_one(".imgbox a img").attrs["src"]
                 ]
                 item["name"] = str(
                     li.select_one("h2.tit a").string).strip()
                 item["introduction"] = []
                 bookinfo = li.select_one(".bookinfo")
                 bookinfo_list = [str(x) for x in bookinfo.stripped_strings]
                 # Drop a trailing "万" (the ten-thousands word-count unit)
                 if len(bookinfo_list) >= 1 and bookinfo_list[-1] == "万":
                     bookinfo_list = bookinfo_list[:-1]
                 bookinfo_list = [x for x in bookinfo_list if x != "|"]
                 bookinfo_str = " ".join(bookinfo_list)
                 # The page embeds word counts as towan('123'); keep only the digits
                 regular = re.compile(r"""towan\('(\d*)'\)""")
                 bookinfo_result = re.sub(regular, r"\1", bookinfo_str)
                 if bookinfo_result != "":
                     item["introduction"].append(bookinfo_result)
                 key_word = li.select_one(".key-word").string
                 if key_word is not None:
                     item["introduction"].append("关键词:" +
                                                 str(key_word).strip())
                 introduction = li.p.string
                 if introduction is not None:
                     item["introduction"].append("简介:" +
                                                 str(introduction).strip())
             except Exception:
                 logger_spider.exception(
                     "download_catalog_linovelib 网页解析错误 url={}".format(
                         response.url))
             if item:
                 result.append(item)
         # Work out the next page to crawl
         try:
             next_page = html.select_one(".next")
             last_page = int(html.select_one(".last").string)
             # present_page = int(html.select_one(".pagelink strong").string)
         except Exception:
             return_result["length"] = len(result)
         else:
             if next_page is None:
                 return_result["length"] = (last_page - 1) * 20 + len(result)
             else:
                 next_url = ("https://www.linovelib.com" +
                             next_page.attrs["href"])
                 return_result["pointer"] = {
                     "spider": sys._getframe().f_code.co_name,
                     "params": {
                         "url": next_url
                     }
                 }
                 return_result["length"] = "{}+".format(
                     (last_page - 1) * 20)
     # Redirected straight to a detail page, so there is exactly one result
     else:
         result = []
         item = {}
         try:
             item["imageList"] = [
                 html.select_one(".book-img img").attrs["src"]
             ]
             item["download_url"] = (
                 "https://www.linovelib.com" +
                 html.select_one(".btn-group .read-btn").attrs["href"])
             item["name"] = str(
                 html.select_one(".book-name").string).strip()
             item["introduction"] = []
             item["introduction"].append(" ".join([
                 x for x in html.select_one(".book-label").stripped_strings
             ]))
             item["introduction"].append("简介:" + "\n".join(
                 [x
                  for x in html.select_one(".book-dec").stripped_strings]))
         except Exception:
             logger_spider.exception(
                 "download_catalog_linovelib 网页解析错误 url={}".format(
                     response.url))
         if item:
             result.append(item)
         return_result["length"] = len(result)
     return_result["content"] = result
     return_result["status"] = SUCCESS
     return return_result
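
A quick worked example of the towan cleanup used in Example #4; the sample bookinfo string is invented for illustration:

 import re

 regular = re.compile(r"""towan\('(\d*)'\)""")
 # The page embeds word counts as a JS call: towan('586').
 print(re.sub(regular, r"\1", "字数: towan('586')"))
 # -> 字数: 586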
Example #5
 def download_catalog_aixdzs(self, names, page=None):
     logger_spider.debug("download_catalog_aixdzs names={} page={}".format(
         names, page))
     # Result dict to return
     return_result = {
         "source_name": "爱下电子书",
         "source_url": "https://m.aixdzs.com/",
         "source_img_url": "https://www.aixdzs.com/style/img/logo.jpg"
     }
     home_url = "https://m.aixdzs.com/search?k="
     if isinstance(names, str):
         download_url = home_url + names
     elif isinstance(names, (tuple, list)):
         download_url = home_url + "+".join(names)
     else:
         logger_spider.error(
             "download_catalog_aixdzs names格式错误 names={}".format(names))
         return_result["status"] = ERROR
         return_result["information"] = "500错误"
         return return_result
     # Catalog page number; defaults to the first page
     if page is not None:
         download_url += "&page={}".format(page)
     # Download the page
     response = self.download_html(download_url)
     if response is False:
         logger_spider.error(
             "download_catalog_aixdzs 搜索下载目录失败 url={}".format(download_url))
         return_result["status"] = ERROR
         return_result["information"] = "服务器爬虫请求失败"
         return return_result
     content = response.text
     html = BeautifulSoup(content, "lxml")
     lis = html.select(".ix-list li")
     # Extract the content
     result = []
     for li in lis:
         item = {}
         # Image info
         img_div = li.select_one(".ix-list-img-square img")
         img_url = img_div.attrs["src"]
         # This placeholder image means "no cover available"
         abandon_img = ["https://img22.aixdzs.com/nopic2.jpg"]
         if img_url not in abandon_img:
             item["imageList"] = [img_url]
         # Text info
         info_div = li.select_one(".ix-list-info")
         item["name"] = str(info_div.h3.a.string)
         item["download_url"] = ("https://m.aixdzs.com" +
                                 info_div.h3.a.attrs["href"])
         introduction = []
         author = info_div.select_one(".meta .meta-l a")
         author_content = author.string
         if author_content is not None:
             introduction.append("作者:" + str(author_content).strip())
         article_type = info_div.select(".meta .meta-r em")
         article_type_content = [
             str(x.string).strip() for x in article_type if x.string is not None
         ]
         if article_type_content:
             introduction.append(" ".join(article_type_content))
         introduction_content = info_div.p.string
         if introduction_content is not None:
             introduction.append("简介:" + str(introduction_content).strip())
         item["introduction"] = introduction
         result.append(item)
     # Get the subsequent pages
     inputs = html.select("#page,#maxpage")
     if len(inputs) < 2:
         return_result["length"] = len(result)
     else:
         present_page = int(inputs[0].attrs["value"])
         max_page = int(inputs[1].attrs["value"])
         if present_page >= max_page:
             return_result["length"] = (max_page - 1) * 20 + len(result)
         else:
             return_result["length"] = str((max_page - 1) * 20) + "+"
             return_result["pointer"] = {
                 "spider": sys._getframe().f_code.co_name,
                 "params": {
                     "names": names,
                     "page": present_page + 1
                 }
             }
     return_result["status"] = SUCCESS
     return_result["content"] = result
     return return_result
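
A minimal sketch of how the "pointer" returned by Examples #4 and #5 might be followed to collect every result page; the Downloader name and the loop are illustrative assumptions:

 # Hypothetical pagination loop over the "pointer" contract:
 # pointer["spider"] names the method, pointer["params"] holds its kwargs.
 downloader = Downloader()
 catalog = downloader.download_catalog_aixdzs("example title")
 books = list(catalog.get("content", []))
 while catalog.get("pointer"):
     pointer = catalog["pointer"]
     catalog = getattr(downloader, pointer["spider"])(**pointer["params"])
     books.extend(catalog.get("content", []))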