# NOTE(review): the original file's line structure was collapsed onto one
# physical line; it is reformatted here. These functions take `self` and look
# like methods of a MongoStore class whose header (and the start of the
# enclosing open() method) lies outside this chunk — confirm against the
# full file before re-indenting into the class.
self.db = self.conn.xitek  # select the "xitek" database on the open connection


def _to_json(obj):
    """Serialize *obj* to a JSON string, using __dict__ for custom objects.

    The ``default`` hook also covers custom objects nested inside *obj*,
    so a whole object graph becomes plain dicts/lists after a round trip.
    """
    return json.dumps(obj, default=lambda o: o.__dict__)


# Save a thread (topic) document to the db.
def saveThread(self, thread):
    """Persist *thread* into the ``threads`` collection as a plain document."""
    # Round-trip through JSON to turn the object graph into dicts/lists.
    # NOTE(review): Collection.insert() was removed in PyMongo 4.0 —
    # switch to insert_one() when the driver is upgraded.
    self.db.threads.insert(json.loads(_to_json(thread)))


def saveForum(self, forum):
    """Persist *forum* into the ``forums`` collection as a plain document."""
    j = _to_json(forum)
    print(j)  # debug trace kept from the original code
    self.db.forums.insert(json.loads(j))


def savePost(self, post):
    """Persist *post* into the ``posts`` collection as a plain document."""
    self.db.posts.insert(json.loads(_to_json(post)))


if __name__ == "__main__":
    # Smoke test: store one forum and one post.
    ms = MongoStore()
    ms.open()

    forum = ForumInfo()
    forum.forumId = 100
    forum.forumName = "测试论坛"
    ms.saveForum(forum)

    post = PostInfo()
    post.threadId = 1
    post.content = "hello"
    post.postId = "2"
    ms.savePost(post)
def parsePage(self, pageData, pageNum):
    """Parse one thread page: extract every post and the known page count.

    Args:
        pageData: raw HTML of a thread page.
        pageNum: 1-based number of the page being parsed.

    Returns:
        A tuple ``(retList, maxPage)`` where ``retList`` is a list of
        PostInfo objects (user, post id, content, post date) and ``maxPage``
        is the largest page number seen in the pagination links —
        ``pageNum`` itself when no later page link exists, i.e. this is
        the last (or only) page.
    """
    soup = BeautifulSoup(pageData, "html.parser")

    # Each post on the page is rendered as <table id="pidNNNN">.
    tablelist = soup.find_all("table", id=re.compile('^pid'))

    # Compile once, outside the per-post loop (was recompiled per post).
    # Author cell sample:
    #   <td class="pls" ...><a ...><b>username</b></a> ... 注册: 2011年12月<br/>
    userPattern = re.compile(r"<b>(.*?)</b>.*?注册: (.*?)<br/>", re.S)
    # Message cell: <td ... id="postmessage_NNNN" ...>content</td> — the id
    # suffix carries the post id.
    msgPattern = re.compile(r"<td .*?postmessage_(.*?)\".*?>(.*?)</td>", re.S)

    retList = []
    for t in tablelist:
        # Author info lives in the first <td class="pls">.
        u = t.find("td", attrs={"class": "pls"})
        if u is None:
            continue  # unexpected post-table markup — skip instead of crashing
        v = userPattern.findall(u.prettify())
        if not v:
            continue

        postInfo = PostInfo()
        postInfo.threadId = self.threadId
        postInfo.uname = v[0][0].strip()
        # (v[0][1] is the registration date; not stored at the moment)

        # Post body cell; regex pulls both the post id and the content.
        m = t.find("td", id=re.compile('^postmessage_'))
        if m is None:
            continue
        v = msgPattern.findall(m.prettify())
        if not v:
            continue
        postInfo.postId = v[0][0].strip()
        postInfo.content = v[0][1].strip()
        postInfo._id = postInfo.postId  # document key = post id

        # Post date: 3rd <tr>, 1st <td>. Index 2 because the post table
        # nests another table whose rows shift the interesting <tr> down.
        td = t.find_all("tr")[2].find("td")
        postInfo.postDate = td.get_text().strip()

        retList.append(postInfo)

    # Pagination: the page holds two <span class="alln"> regions, e.g.
    #   <span class=alln><div class="pg">
    #     <a href="thread-1482195-1-1-1.html">1</a><strong>2</strong>
    #   </div></span>
    # find() takes the first one. Each link's href is "thread-<tid>-<page>-..."
    # so field 2 of the "-" split is a page number; the largest one is the
    # known page count. BUGFIX: single-page threads have no pagination bar at
    # all (find() returns None) — fall back to the current page instead of
    # raising AttributeError.
    maxPage = pageNum
    pageSpan = soup.find("span", attrs={"class": "alln"})
    if pageSpan is not None:
        for a in pageSpan.find_all("a"):
            href = a.get('href')
            if not href:
                continue  # anchor without href — nothing to parse
            sp = href.split("-")
            if int(sp[2]) > maxPage:
                maxPage = int(sp[2])
    return (retList, maxPage)