Example #1
    def _GetContentByKeyword(self, keyword, mode, download_path=None):
        """通过关键字查找到需要的内容,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字

        mode    : str or unicode 
            download : 下载
            updata   :更新图片
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path
        url_keyword        = self._url + '/web/topic/search?' + parse.urlencode({"keyword": keyword})
        content_keyword    = BaseRequest.GetUrlContent(url_keyword)
        if content_keyword is None:
            return False

        # Parse the returned JSON
        content_keyword_json = json.loads(content_keyword.decode("utf8"))
        if not content_keyword_json:
            return False

        # Use each result's id field to visit the matched topic
        for data in content_keyword_json['data']['topic']:

            if mode == "download":
                # Skip comics that have already been downloaded
                sql = "SELECT * FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(data['title'])
                if self._EntertainmentSelect(sql):
                    print(data['title'])
                    continue

            # Wait until the previous comic has finished downloading
            while not priority_queue.empty():
                print("thread count : %d" % threading.active_count())
                print("queue size   : %d" % priority_queue.qsize())
                if threading.active_count() < 10:
                    StartComicThread(10)
                time.sleep(60)

            self.keyword         = data['title']
            url_keyword_content  = self._url + '/web/topic/' + str(data['id'])
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
            if soup_keyword_content is None:
                return False

            # Collect every chapter link; the page lists chapters from last to first, so iterate in reverse
            a_book = soup_keyword_content.find_all('a',{'class':'article-img'})

            if mode == "download":

                a_author    = soup_keyword_content.find('div', {'class':'author-nickname'})
                a_introduce = soup_keyword_content.find('div', {'class':'switch-content'})
                a_img       = soup_keyword_content.find('img', {'class':'kk-img'})

                # Download the comic cover, retrying up to 5 times
                # (the literal "封面.jpg" -- "cover.jpg" -- is kept as the stored filename)
                if download_path is not None:
                    path = '%s/Comics/%s/' % (download_path, self.keyword)
                    for i in range(5):
                        if not BaseRequest.DownloadData(a_img['src'], path, "封面.jpg"):
                            print("download %s failed %d time(s)" % ("封面.jpg", i + 1))
                        else:
                            print("download %s%s success" % (path, "封面.jpg"))
                            break

                src = "https://txz-1256783950.cos.ap-beijing.myqcloud.com/Comics/" + self.keyword + "/" + "封面.jpg"

                # Store the comic metadata in the database
                sql_dict = collections.OrderedDict()
                sql_dict['Name']      = "\"" + data['title'] + "\""         # name
                sql_dict['WatchNum']  = 0                                   # watch count
                sql_dict['Website']   = "\"" + self._url + "\""             # site URL
                sql_dict['ChapterNum']= len(a_book)                         # total number of chapters
                sql_dict['IsFinish']  = 0                                   # finished flag
                sql_dict['Introduce'] = "\"" + a_introduce.p.contents[0].replace('\"', '') + "\""   # synopsis
                sql_dict['Author']    = "\"" + a_author.contents[0] + "\""  # author
                sql_dict['Img']       = "\"" + src + "\""                   # cover image URL
                sql_dict['Type']      = "\"" + self.type + "\""             # comic genre
                sql_dict['Time']      = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time

                if not self._EntertainmentInsert('ComicName', sql_dict):
                    print("inster ComicName table failed!")
                    continue

                # Fetch the comic's unique ID
                sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(data['title'])
                max_id = self._EntertainmentSelect(sql)
                if max_id:
                    self.id = max_id[0][0]
                else:
                    print("get max_id failed!")
                    continue

            elif mode == "update":
                now_Time = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" #下载时间
                sql = "update EntertainmentDB.ComicName set Time = %s  where ID = %d;" %(now_Time, self.id)
                if not self._EntertainmentUpdate(sql):
                    print("%s update failed!" %(sql))

            count = 1
            for book in reversed(a_book):
                href  = book['href']
                title = book['title']
                src   = book.img['src']

                # Queue a download job for the current chapter
                # (job_data, not data, so the outer loop variable is not shadowed)
                url_a_book = self._url + href
                job_data   = {"url": url_a_book, "title": title, "src": src, "href": href, "count": count}
                if mode == "download":
                    dic_queue = {"type": "download", "subtype": "download", "self": self, "data": job_data}
                elif mode == "update":
                    dic_queue = {"type": "download", "subtype": "update", "self": self, "data": job_data}

                priority_queue.put(base.Job(2, dic_queue, self._url))

                count += 1

                # Hand the job off to a worker from the external pool
                p.spawn(run)

            p.join()

        return True
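
Example #1 depends on several names defined elsewhere in the project: the module-level priority_queue, the base.Job class, the StartComicThread helper, and the p.spawn(run)/p.join() worker pool. None of them are shown, so the following is only a minimal sketch, assuming a queue.PriorityQueue plus plain threads, of how such pieces could fit together; every definition here is an assumption, not the original implementation.

    import queue
    import threading

    priority_queue = queue.PriorityQueue()

    class Job:
        """Stand-in for base.Job: a lower priority number is handled first."""
        def __init__(self, priority, payload, url):
            self.priority = priority
            self.payload  = payload
            self.url      = url

        def __lt__(self, other):
            # PriorityQueue compares queued items, so Job must be orderable
            return self.priority < other.priority

    def run():
        """Worker loop: drain jobs until the queue stays empty."""
        while True:
            try:
                job = priority_queue.get(timeout=5)
            except queue.Empty:
                break
            print("handling %s job for %s" % (job.payload["subtype"], job.url))
            priority_queue.task_done()

    def StartComicThread(n):
        """Assumed equivalent of the original helper: top the pool up to n threads."""
        for _ in range(max(0, n - threading.active_count())):
            threading.Thread(target=run, daemon=True).start()
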
Example #2
    def _UpdataChapter(self, result, download_path=None):
        """更新最新章节,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        keyword     = result[1]
        chapter_num = result[4]
        self.id     = result[0]

        # Request the keyword search page
        self.keyword       = keyword
        self.download_path = download_path

        url_keyword        = self._url + '/web/topic/search?' + parse.urlencode({"keyword": keyword})
        content_keyword    = BaseRequest.GetUrlContent(url_keyword)
        if content_keyword is None:
            return None

        # Parse the returned JSON
        content_keyword_json = json.loads(content_keyword.decode("utf8"))
        if not content_keyword_json:
            return None

        
        # Use each result's id field to visit the matched topic
        for data in content_keyword_json['data']['topic']:

            # Only process the exact title match
            if data['title'] != keyword:
                continue

            url_keyword_content = self._url + '/web/topic/' + str(data['id'])
            soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
            if soup_keyword_content is None:
                return None

            # Collect every chapter link; the page lists chapters from newest to oldest, so count down while walking forward
            a_book = soup_keyword_content.find_all('a',{'class':'article-img'})

            now_chapter_num = len(a_book)
            for book in a_book:
                print(now_chapter_num, chapter_num)
                if now_chapter_num <= chapter_num:
                    return None

                
                href  = book['href']
                title = book['title']
                lst_img_book = []
                dct_img_book = {}

                # Fetch the current chapter's page
                url_a_book  = self._url + href
                soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
                if soup_a_book is None:
                    return None

                # Find and save every image URL in this chapter
                content_img_book = soup_a_book.find_all('img',{'class':'kklazy', 'title':title})
                for img_book in content_img_book:
                    lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))  # crude &amp; unescape

                # Store the chapter data in a dict for later saving
                dct_img_book = {'href':href, 'title':title, 'chapter':now_chapter_num, 'download_url':lst_img_book}
                self.lst_kkmh_content.append(dct_img_book)

                now_chapter_num -= 1
                
                yield dct_img_book
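
Because _UpdataChapter is a generator, no page is fetched until a caller iterates it. Below is a short consumption sketch under assumed names; the spider instance and the exact shape of the ComicName row are illustrations, not part of the snippet.

    # Assumed ComicName row: index 0 = ID, index 1 = Name, index 4 = ChapterNum
    row = (42, "SomeComic", None, None, 120)

    for chapter in spider._UpdataChapter(row, download_path="/tmp"):
        # each yielded dict carries href, title, chapter number and the image URLs
        print("new chapter %d: %s (%d pages)"
              % (chapter['chapter'], chapter['title'], len(chapter['download_url'])))
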
Example #3
    def _GetContentByKeyword(self, keyword):
        """通过关键字查找到需要的内容,然后将返回的内容记录在kkmh_content结构中

        Parameters
        ----------
        keyword : str or unicode
            搜索文字
        Returns
        -------
        success: dict[list]--self.kkmh_content
        failed : None
        """

        # Request the keyword search page
        self.keyword       = keyword
        url_keyword        = self._url + '/web/topic/search?' + parse.urlencode({"keyword": keyword})
        content_keyword    = BaseRequest.GetUrlContent(url_keyword)
        if content_keyword is None:
            return None

        # Parse the returned JSON
        content_keyword_json = json.loads(content_keyword.decode("utf8"))
        if not content_keyword_json:
            return None

        # Take the first result's id and visit the matched topic
        url_keyword_content = self._url + '/web/topic/' + str(content_keyword_json['data']['topic'][0]['id'])
        soup_keyword_content = BaseRequest.GetUrlSoup(url_keyword_content)
        if soup_keyword_content is None:
            return None

        # Store the comic metadata in the database
        sql_dict = collections.OrderedDict()
        sql_dict['Name']    = "\"" + self.keyword + "\""      # name
        sql_dict['Num']     = 0                               # number
        sql_dict['Website'] = "\"" + self._url + "\""         # site URL
        sql_dict['Time']    = "\"" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\"" # download time
        if not self._ComicInsert('ComicName', sql_dict):
            print("insert into ComicName table failed!")

        # Fetch the comic's unique ID (guard against an empty result set)
        sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name=\"%s\";" %(self.keyword)
        max_id = self._ComicSelect(sql)
        if not max_id:
            return None
        self.id = max_id[0][0]

        # Collect every chapter link; the page lists chapters from last to first, so iterate in reverse
        a_book = soup_keyword_content.find_all('a',{'class':'article-img'})
        for book in reversed(a_book):
            href  = book['href']
            title = book['title']
            src   = book.img['src']
            lst_img_book = []
            dct_img_book = {}

            # Fetch the current chapter's page
            url_a_book  = self._url + href
            soup_a_book = BaseRequest.GetUrlSoup(url_a_book)
            if soup_a_book is None:
                return None

            # Find and save every image URL in this chapter
            content_img_book = soup_a_book.find_all('img',{'class':'kklazy', 'title':title})
            for img_book in content_img_book:
                lst_img_book.append(img_book['data-kksrc'].replace('amp;', ''))  # crude &amp; unescape

            # Store the chapter data in a dict for later saving
            dct_img_book = {'href':href, 'title':title, 'src':src, 'download_url':lst_img_book}
            self.lst_kkmh_content.append(dct_img_book)

            yield dct_img_book
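
All three examples splice titles straight into SQL strings (Name="%s"), which breaks on titles containing quotes and is open to SQL injection. If the underlying helpers wrap a DB-API driver such as PyMySQL, a parameterized query is the safer pattern; the sketch below assumes a cursor object is available, which these snippets never show.

    sql = "SELECT ID FROM EntertainmentDB.ComicName WHERE Name = %s;"
    cursor.execute(sql, (keyword,))   # the driver quotes/escapes the value itself
    row = cursor.fetchone()
    comic_id = row[0] if row else None
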