Exemplo n.º 1
0
def singleBehavior(postID: int, savedFileName: str, seeLZ=True, imageBed=True):
    """Download one post and save it to ``savedFileName`` as Markdown.

    Args:
        postID: Numeric ID of the post; forwarded to ``api``.
        savedFileName: Path of the output file. If a file already exists
            at this path the post is treated as downloaded and skipped.
        seeLZ: Forwarded to ``api`` as ``seeLZ``.
        imageBed: Forwarded to ``contentToMarkdown`` as ``useImageBed``.

    Returns:
        True when the file already existed or was written; False when
        ``post.getInfo()`` raised ``FileNotFoundError`` or ``SystemExit``.

    Raises:
        SystemExit: With status 1 when the user interrupts a page download.
    """
    Avalon.debug_info('创建任务,ID:%d' % postID)
    if os.path.isfile(savedFileName):
        Avalon.time_info('帖子已经下载并保存,跳过[ID:%d]' % postID)
        return True

    post = api(postID=postID, seeLZ=seeLZ)
    try:
        postInfo = post.getInfo()
    except (FileNotFoundError, SystemExit):
        # The lower layers signal fatal errors by exiting; for a single
        # task in a batch this is only a failed task, not a reason to stop.
        return False

    totalPage = postInfo['TotalPage']
    Avalon.time_info('开始任务:"%s"(作者:%s)[ID:%d]' %
                     (postInfo['Title'], postInfo['Author'], postID),
                     highlight=True)
    Avalon.debug_info(postInfo)

    pageContents = []
    for pageNumber in range(1, totalPage + 1):
        Avalon.time_info('开始第%d页,共%d页' % (pageNumber, totalPage))
        try:
            pageHTMLContent = post.getContent(pageNumber)
            pageMarkdownContent = post.contentToMarkdown(pageHTMLContent,
                                                         useImageBed=imageBed)
        except KeyboardInterrupt:
            Avalon.critical('用户强制退出')
            raise SystemExit(1)
        except SystemExit:
            # Best effort: keep going with the remaining pages, but leave a
            # trace so an incomplete output file can be explained.
            Avalon.warning('第%d页获取失败,已跳过[ID:%d]' % (pageNumber, postID))
        else:
            pageContents.append(pageMarkdownContent)

    post.saveToFile(savedFileName, ''.join(pageContents))
    return True
Exemplo n.º 2
0
 def getPost(self,
             pageNumber: int,
             ajax: bool = True,
             useTemp: bool = True):
     """Return the HTML source of one page of the post.

     Args:
         pageNumber: Page to fetch; appended to the post link.
         ajax: When exactly ``False``, the ``ajax=1&`` query fragment is
             removed from the link before requesting.
         useTemp: When truthy, a page already present in the temporary
             store is returned from there instead of being downloaded.
             When exactly ``True``, a freshly downloaded page is also
             written to the temporary store.

     Returns:
         The page source: either whatever ``readFileByID`` returns for a
         cached page, or the downloaded bytes decoded with undecodable
         bytes ignored.

     Raises:
         SystemExit: With status 1 after 10 failed attempts, or when the
             user interrupts the download.
     """
     self.__workPageNumber = pageNumber
     link = self.__postLink + str(pageNumber)
     existTemp = self.__tempSave.getSameTemp()
     if existTemp.get('html') and useTemp:
         for i in existTemp['html']:
             if int(i[1]) == int(pageNumber):
                 Avalon.debug_info('第%d页已经在临时文件中存在,跳过' % pageNumber)
                 return self.__tempSave.readFileByID(i)
     if ajax is False:
         link = link.replace('ajax=1&', '')
     for tryTimes in range(1, 11):
         try:
             postRequest = request.Request(link)
             # Send browser-like headers (the original author notes that
             # the necessity of this is doubtful).
             postRequest.add_header('User-Agent', (random.choice(
                 self.__userAgent)).replace('\n', ''))
             postRequest.add_header('Referer',
                                    'https://tieba.baidu.com')
             postRead: bytes = request.urlopen(postRequest,
                                               timeout=5).read()
             if self.debug:
                 Avalon.debug_info('链接:"%s"请求头:%s.' %
                                   (link, postRequest.headers))
         except error.URLError as e:
             Avalon.warning("获取帖子正文失败!原因:%s(%s/10)" %
                            (str(e.reason), str(tryTimes)))
         except timeout as e:
             Avalon.warning("获取帖子正文失败!原因:%s(%s/10)" %
                            (str(e), str(tryTimes)))
         except KeyboardInterrupt:
             Avalon.critical("用户强制退出")
             raise SystemExit(1)
         except Exception:
             # Deliberately broad so any other failure is retried, but not
             # a bare ``except``: SystemExit must still propagate.
             Avalon.warning("获取帖子正文失败!原因:未知错误(%s/10)" % tryTimes)
         else:
             # No error: stop retrying.
             if self.debug:
                 Avalon.debug_info('Link %s Get Successed.' % link)
             break
     else:
         # Every attempt failed.
         Avalon.error('获取失败!')
         if self.debug:
             Avalon.debug_info('Link:%s' % link)
         raise SystemExit(1)
     # Decode once and reuse the text for both the cache and the caller.
     postText = postRead.decode(errors='ignore')
     if useTemp is True:
         self.__tempSave.savePostRaw(postText, pageNumber=pageNumber)
     return postText
Exemplo n.º 3
0
 def proccessPost(self, raw: str, useTemp: bool = True):
     """Parse the HTML source of a post page into per-floor records.

     The XPath expressions below are tied to the markup of the post page
     (per the original author's comment, a Baidu Tieba post) and are hard
     to follow without that page source at hand.

     Args:
         raw: HTML source of one page.
         useTemp: When exactly ``True``, the result is also stored through
             the temporary store under the current working page number.

     Returns:
         ``self.postInfo`` with a ``'Data'`` key added: a list of dicts
         with keys ``'floor'`` (int), ``'author'`` and ``'text'`` (the
         HTML of the floor body with entities unescaped). This is the same
         dict object as ``self.postInfo``, not a copy.

     Raises:
         SystemExit: With status 1 when no floor elements are found.
     """
     postEtree = etree.HTML(raw)
     theradList = postEtree.xpath(
         '//div[@class="p_postlist"]/div[@class="p_postlist"]/div[@class]')
     if not theradList:
         Avalon.critical('程序无法正确获得文章内容')
         raise SystemExit(1)
     if self.debug:
         Avalon.debug_info('帖子内容获取成功')

     finalList = []
     for perFloor in theradList:
         dataField = perFloor.xpath('./@data-field')
         if not dataField:
             if self.debug:
                 Avalon.debug_info('因为不存在"data-field"属性,跳过对象"%s"' %
                                   str(perFloor))
             continue
         # Parse the JSON attribute once and read both values from it.
         floorData = json.loads(dataField[0])
         floorNum = floorData['content']['post_no']
         author = floorData['author']['user_name']

         textNodes = perFloor.xpath('.//cc//div[@id]')
         if not textNodes:
             # No body element for this floor: nothing to record.
             continue
         final_text = html.unescape(etree.tostring(textNodes[0]).decode())
         if self.debug:
             Avalon.debug_info('%s - %s' % (floorNum, author))
         finalList.append({
             'floor': int(floorNum),
             'author': author,
             'text': final_text,
         })

     # self.postInfo is assigned by getPostInfo; it is extended in place.
     postFullInfo = self.postInfo
     postFullInfo['Data'] = finalList
     if useTemp is True:
         self.__tempSave.saveJson(postFullInfo, self.__workPageNumber)
     return postFullInfo
Exemplo n.º 4
0
    def multiThreadGetMain(self, threadNumber: int = 8):
        """Fetch every page of the post using a pool of worker threads.

        Page 1 is fetched on the calling thread first, because its stored
        JSON carries the total page count. All page numbers are then
        queued for the workers.

        Args:
            threadNumber: Number of worker threads; also the capacity of
                the work queue.

        Raises:
            SystemExit: With status 1 when page 1 cannot be read back
                from the database.
        """
        workQueue = queue.Queue(threadNumber)
        stopEvent = threading.Event()
        threadList = []

        def mainFloorThread():
            # Poll with a timeout so the stop request is noticed within
            # about a second even when the queue stays empty.
            while not stopEvent.is_set():
                try:
                    pageNumber = workQueue.get(timeout=1)
                except queue.Empty:
                    continue
                self.__getPostBehavior(pageNumber)

        try:
            for i in range(threadNumber):
                newThread = threading.Thread(target=mainFloorThread,
                                             name='PostThread #%s' % i)
                newThread.start()
                threadList.append(newThread)

            self.__getPostBehavior(1)
            dbRead = self.__db.checkExistPage(1)[1]
            if not dbRead:
                Avalon.critical('Can\'t Get Page 1,Program Exit!')
                raise SystemExit(1)
            totalPages = int(json.loads(dbRead)['page']['total_page'])
            # NOTE(review): page 1 is queued again although it was just
            # fetched above; kept as in the original - confirm whether
            # __getPostBehavior skips pages that already exist.
            for i in range(totalPages):
                workQueue.put(i + 1)
            while not workQueue.empty():
                time.sleep(1)
        finally:
            # Always stop and join the workers, including on the exit path
            # above; otherwise the non-daemon threads would keep the
            # process alive forever.
            stopEvent.set()
            for worker in threadList:
                worker.join()
        Avalon.info('[%s]Get All Pages Success' % self.__tid)
Exemplo n.º 5
0
 def getPostInfo(self: None):
     """Read title, author and page count from the first page of the post.

     The first page is requested with ``ajax=False`` and ``useTemp=False``.
     The result is stored on ``self.postInfo`` and returned.

     Returns:
         A dict with keys ``'Author'`` (str), ``'Title'`` (str) and
         ``'TotalPage'`` (int).

     Raises:
         FileNotFoundError: When the page body carries the ``page404``
             class.
         SystemExit: Via ``quit(1)`` when any of the three fields cannot
             be located in the page.
     """
     firstPage = etree.HTML(
         self.getPost(pageNumber=1, ajax=False, useTemp=False))

     deletedMarker = firstPage.xpath('//body[@class="page404"]')
     if deletedMarker:
         Avalon.error('此贴已经被删除!')
         raise FileNotFoundError

     titles = firstPage.xpath('//div[@class="wrap2"]//h3/@title')
     authors = firstPage.xpath(
         '//div[@class="p_postlist"]/div[@class][1]//div/@author')
     pageCounts = firstPage.xpath(
         '//div/div[@id]//div[@id]//li/span[@class="red"][last()]/text()')

     # All three lookups must yield at least one match.
     if not titles or not authors or not pageCounts:
         Avalon.critical('程序无法正确获得帖子信息')
         quit(1)

     finalInfo = dict(
         Author=str(authors[0]),
         Title=str(titles[0]),
         TotalPage=int(pageCounts[0]),
     )
     self.postInfo = finalInfo
     if self.debug:
         Avalon.debug_info(self.postInfo)
     return finalInfo
Exemplo n.º 6
0
    def __bytes__(self):
        """Serialize the form as bytes, to be submitted as POST ``data``.

        Each entry of ``self.form_fields`` and then each entry of
        ``self.files`` is written as one part introduced by the boundary
        line; a closing boundary line ends the payload.
        """
        delimiter = b'--' + self.boundary + b'\r\n'
        chunks = []

        # Plain form fields: header, blank line, encoded value.
        for field_name, field_value in self.form_fields:
            chunks += [
                delimiter,
                self._form_data(field_name),
                b'\r\n',
                field_value.encode(),
                b'\r\n',
            ]

        # Attached files: disposition header, content type, blank line, body.
        for part_name, file_name, content_type, payload in self.files:
            chunks += [
                delimiter,
                self._attached_file(part_name, file_name),
                self._content_type(content_type),
                b'\r\n',
                payload,
                b'\r\n',
            ]

        # Closing boundary.
        chunks.append(b'--' + self.boundary + b'--\r\n')
        return b''.join(chunks)


if __name__ == "__main__":
    # This file is meant to be imported; running it directly only reports
    # the misuse and exits with status 1. SystemExit is raised directly
    # because ``quit`` is an interactive helper injected by the ``site``
    # module and is not guaranteed to exist.
    Avalon.critical('模块非法调用!请运行Main.py!')
    raise SystemExit(1)
Exemplo n.º 7
0
        else:
            break

# Announce start-up, then build the API client for the requested post.
# postID, onlySeeLZ, fileName and the upper-case settings are defined
# outside this excerpt.
Avalon.debug_info('程序已经启动...正在获取帖子信息')
post = api(postID=postID, seeLZ=onlySeeLZ, debug=GENERAL_DEBUG_MODE)
# postInfo is indexed below by 'Title', 'Author' and 'TotalPage'.
postInfo = post.getInfo()
Avalon.time_info('开始任务:"%s"(作者:%s)' % (postInfo['Title'], postInfo['Author']),
                 highlight=True)
# Pages are numbered from 1 through postInfo['TotalPage'] inclusive.
for i in range(1, postInfo['TotalPage'] + 1):
    Avalon.time_info('开始第%d页,共%d页' % (i, postInfo['TotalPage']))
    try:
        # Fetch page i, then convert it to Markdown.
        pageHTMLContent = post.getContent(i)
        pageMarkdownContent = post.contentToMarkdown(pageHTMLContent,
                                                     useImageBed=USE_IMAGE_BED)
    except KeyboardInterrupt:
        # Ctrl-C: report it and exit with status 1.
        Avalon.critical('用户强制退出')
        quit(1)
    else:
        # Reached only when the page was fetched and converted without an
        # exception. saveToFile is called once per page with the same
        # fileName. NOTE(review): presumably it appends rather than
        # overwrites - confirm against saveToFile.
        post.saveToFile(fileName, pageMarkdownContent)
"""
markdown = markdown()
image = image(debug=GENERAL_DEBUG_MODE)

fullBehavior(postID=postID,fileName=fileName,onlySeeLZ=onlySeeLZ)

def fullBehavior(postID,fileName,onlySeeLZ=True):
    posts = spider(postID=int(postID), seeLZ=onlySeeLZ)

    info = posts.getPostInfo()
    Avalon.info('帖子标题:%s,帖子作者%s.' % (info['title'], info['author']))