Пример #1
0
    def getPost(self, post, new_nr=0):
        """Download the reply pages of ``post`` and store replies missing from the DB.

        The thread URL is rewritten into a ``%d`` page template, the page
        range is taken from ``self.getPageRange`` and each page is fetched via
        ``self._getData`` and parsed with lxml. Every ``post_*`` div without a
        matching ``Reply`` row (same post and ``locate_id``) is saved.
        Afterwards ``post.nr_reply`` is reconciled with the number of replies
        stored in this run and ``post`` is saved.

        Args:
            post: thread record. ``url``, ``sector.base_url``, ``title``,
                ``locate_id`` and ``nr_reply`` are read; ``nr_reply`` is
                rewritten.
            new_nr: expected reply count after this run (presumably the count
                shown in the thread list -- confirm with the caller). ``0``
                selects the initial-fetch bookkeeping branch.

        Raises:
            IndexError: if the URL does not match ``thread-<id>-<page>-...html``
                or a reply lacks an expected node (author, date, content).
        """
        cnt = 0
        sr = urlparse.urlsplit(post.url)
        if not sr.scheme:
            posturl = post.sector.base_url + post.url
        else:
            posturl = post.url

        # Replace the page number of ".../thread-<id>-<page>-....html" with "%d".
        posturl = "%d".join(re.findall(r"(.*?thread-\d+-)\d+(-.*?.html)", posturl)[0])

        startpage, stoppage = self.getPageRange(self.NR_REPLY_PER_PAGE, post.nr_reply, new_nr)

        self.logger.debug("post %s page range [%d,%d)", post.locate_id, startpage, stoppage)
        parser = etree.HTMLParser()
        for pg in xrange(startpage, stoppage):
            pageurl = posturl % pg
            self.logger.debug("post %s %s ...", post.locate_id, pageurl)
            data = self._getData(pageurl, None, post.title)
            if not data:
                self.logger.debug("-=-=- !!!!!!!!!!!!!! got None or '', skip !!!!!!!!!!!!!!!!!!! -=-=-")
                continue

            # Check authentication: some posts need higher permission to view.
            if data.find("超级大本营军事论坛 提示信息") != -1 and data.find("您无权进行当前操作,原因如下:") != -1:
                # The alert block may be absent; never let a debug log crash the fetch.
                alert = re.search('<div class="alert_error">(.*?)</div>', data, re.M | re.S | re.I)
                self.logger.debug("got err %s", alert.group(1) if alert else None)
                if self.login():
                    data = self._getData(pageurl, None, post.title)
                    if not data:
                        self.logger.debug("-=-=- !!!!!!!!!!!!!! got None or '', skip !!!!!!!!!!!!!!!!!!! -=-=-")
                        continue

            tree = etree.fromstring(data, parser)

            posts = tree.xpath('//div[@id="postlist"]/div[starts-with(@id,"post_")=true()]')
            for item in posts:
                # The div id is "post_<replyid>".
                replyid = item.attrib["id"][5:]
                assert replyid
                try:
                    author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/div[@class="authi"]/a/text()')[0]
                except IndexError:
                    # No author link: fall back to the plain text of the "pi" div.
                    author = item.xpath('./table//td[@class="pls"]/div[@class="pi"]/text()')[0].strip()
                assert author is not None

                # The first four characters of the <em> text are dropped
                # (presumably a fixed label before the timestamp -- confirm).
                crt_date = item.xpath('./table//div[@class="pti"]/div[@class="authi"]/em/text()')[0][4:]

                try:
                    replycontent = item.xpath('//td[@id="postmessage_%s"]' % replyid)[0]
                except IndexError:
                    # No message cell: accept a "locked" placeholder, otherwise give up.
                    locked = item.xpath('./table//div[@class="pct"]/div[@class="pcb"]/div[@class="locked"]')
                    if not locked:
                        raise
                    replycontent = htmlentitydecode(etree.tostring(locked[0])).strip()
                else:
                    assert replycontent is not None
                    # Remove the 'ad' and 'poststatus' children. Iterate over a
                    # snapshot: removing/replacing children of an lxml element
                    # while iterating it can skip siblings.
                    for i in list(replycontent):
                        if "class" in i.attrib and i.attrib["class"] in ("a_pr", "pstatus"):
                            # Keep whatever text follows the element's closing tag (its tail).
                            textlist = re.findall(
                                r"\A<%s.*?>.*?</%s>(.*?)\Z" % (i.tag, i.tag), etree.tostring(i), re.M | re.S | re.U
                            )
                            textlist = [x for x in textlist if x.strip() != ""]
                            if len(textlist) > 0:
                                remaintext = "<br />".join(textlist)
                                newelement = item.makeelement("br")
                                newelement.text = remaintext
                                replycontent.replace(i, newelement)
                            else:
                                replycontent.remove(i)
                    replycontent = self.exclude_first_td_tag.match(
                        htmlentitydecode(etree.tostring(replycontent)).strip()
                    ).group(1)

                try:
                    r = Reply.objects.filter(post=post, locate_id=replyid)[0:1].get()
                except Reply.DoesNotExist:
                    r = Reply(post=post, locate_id=replyid, crt_date=crt_date, author=author, content=replycontent)
                    try:
                        r.save()
                    except _mysql_exceptions.Warning:
                        # Retry once with content taken from the raw (undecoded) markup.
                        self.logger.debug("got _mysql_exceptions.Warning!")
                        r.content = self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
                        r.save()

                    cnt += 1

        if new_nr != 0:
            if post.nr_reply + cnt == new_nr:
                self.logger.info("post %s %+d reply. now %d", post.locate_id, cnt, new_nr + 1)
                post.nr_reply += cnt  # add the number of replies actually stored in this run
            else:
                self.logger.debug(
                    "post %s %+d reply, %d != expect %d (no right new_nr info?)",
                    post.locate_id,
                    cnt,
                    post.nr_reply + cnt,
                    new_nr,
                )
                # Counts disagree: fall back to what is really in the DB.
                actualcnt = Reply.objects.filter(post=post).count()
                self.logger.info("post %s actual %d reply in DB", post.locate_id, actualcnt)
                post.nr_reply = max(actualcnt - 1, 0)
        else:
            if post.nr_reply + 1 == cnt:
                self.logger.info("post %s init %+d reply.", post.locate_id, cnt)
            else:
                self.logger.info("post %s init %+d reply, != expect %d", post.locate_id, cnt, post.nr_reply + 1)
            # Store the fetched count minus one so the next run looks for new replies again.
            post.nr_reply = max(cnt - 1, 0)
        post.save()
Пример #2
0
	def getPost(self,post,new_nr=0):
		'''Fetch the pages of ``post`` and store the replies missing from the DB.

		The page range is decided by ``self.getPageRange`` from the stored
		reply count and the reply count shown in the topic list (``new_nr``);
		both are passed minus one, floored at zero. Each page is fetched with
		``self._getData``; every table row without a matching ``Reply``
		(same post and ``locate_id``) is saved. The topic post itself is
		stored with ``locate_id`` 0 and may update ``post.title``. Finally
		``post.nr_reply`` is reconciled, ``post`` is saved and
		``self.stat_reply_add`` is increased by the number of stored replies.

		Rows that fail to parse are skipped when they contain the site's
		"sensitive keyword filtered" notice; otherwise the IndexError or
		AttributeError is re-raised.
		'''
		filtered_notice='浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤'
		cnt=0
		sr=urlparse.urlsplit(post.url)
		if not sr.scheme:
			posturl=post.sector.base_url+post.url
		else:
			posturl=post.url

		posturl+='&TopicPage=%d'
		posturl=posturl.replace('topicdisplay.asp','topicdisplay_safe.asp')

		startpage,stoppage=self.getPageRange(self.NR_REPLY_PER_PAGE,max(post.nr_reply-1,0),0 if new_nr==0 else new_nr-1)

		self.logger.debug('post %s page range [%d,%d)',post.locate_id,startpage,stoppage)
		parser=etree.HTMLParser()
		for pg in xrange(startpage,stoppage):
			if self.exitevent.is_set():
				self.logger.info('got exit signal!')
				break
			pageurl=posturl%pg
			self.logger.debug('post %s %s ...',post.locate_id,pageurl)
			data=self._getData(pageurl,None,post.title)
			if not data:
				# Nothing to parse; etree.fromstring would fail on None or ''.
				self.logger.debug('post %s %s returned no data, skip',post.locate_id,pageurl)
				continue

			tree=etree.fromstring(data,parser)

			posts=tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
			for item in posts:
				try:
					author=item[0].xpath('a/text()')[0]
				except IndexError:
					self.logger.debug('no author info?')
					author=''

					itemhtml=htmlentitydecode(etree.tostring(item))
					if filtered_notice in itemhtml:
						self.logger.info('got page contains words! %s ',itemhtml)
						continue

				try:
					crt_date=item[1].xpath('font/text()')[0][6:-1]
					# Detect the topic (opening) post: it carries no ReplyID link.
					tmp=item[1].xpath('*[position()<5]')
					if len(tmp)==4 and [x.tag for x in tmp]==['b','hr','font','hr']: # topic post
						replyid=0
						try:
							realtitle=item[1].xpath('b/text()')[0]
						except IndexError:
							try:
								realtitle=item[1].xpath('b/font/b/text()')[0]
							except IndexError:
								realtitle=item[1].xpath('b/font/text()')[0]

						if post.title!=realtitle:
							self.logger.info('post %s realtitle %s',post.locate_id,realtitle)
							post.title=realtitle
							post.save()
						# Drop the non-content nodes (title / separators / post time)
						# so that only the message body is left in the cell.
						for x in tmp:
							item[1].remove(x)
						item[1].text=''
					else: # ordinary reply
						replyurl=item[1].xpath('font/a[1]/@href')[0]
						replyid=re.search(r'ReplyID=(\d+)',replyurl).group(1)
						# Drop the non-content nodes (separator / post time).
						for x in item[1].xpath('*[position()<3]'):
							item[1].remove(x)

					replycontent=self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
					if replycontent.startswith('<br/>'):
						replycontent=replycontent[5:]

					if author=='':
						author='usr for %d-%s'%(post.id,replyid)

					try:
						r=Reply.objects.filter(post=post,locate_id=replyid)[0:1].get()
					except Reply.DoesNotExist:
						r=Reply(post=post,locate_id=replyid,crt_date=crt_date,author=author,content=replycontent)
						try:
							r.save()
						except _mysql_exceptions.Warning:
							# Retry once with content taken from the raw (undecoded) markup,
							# applying the same leading-<br/> strip as the primary path.
							self.logger.debug('got _mysql_exceptions.Warning!')
							fallback=self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
							if fallback.startswith('<br/>'):
								fallback=fallback[5:]
							r.content=fallback
							r.save()

						cnt+=1
				except (IndexError,AttributeError):
					# A row that is really the keyword-filter notice is skipped;
					# any other parse failure is a genuine error.
					itemhtml=htmlentitydecode(etree.tostring(item))
					if filtered_notice in itemhtml:
						self.logger.info('got page contains words! %s ',itemhtml)
						continue
					else:
						raise


		if new_nr!=0:
			if post.nr_reply+cnt==new_nr:
				self.logger.info('post %s %+d reply. now %d',post.locate_id,cnt,new_nr+1)
				post.nr_reply+=cnt # add the number of replies actually stored in this run
			else:
				self.logger.debug('post %s %+d reply, %d != expect %d (no right new_nr info?)',post.locate_id,cnt,post.nr_reply+cnt,new_nr)
				# Counts disagree: fall back to what is really in the DB.
				actualcnt=Reply.objects.filter(post=post).count()
				self.logger.info('post %s actual %d reply in DB',post.locate_id,actualcnt)
				post.nr_reply=max(actualcnt-1,0)
		else:
			if post.nr_reply+1==cnt:
				self.logger.info('post %s init %+d reply.',post.locate_id,cnt)
			else:
				self.logger.info('post %s init %+d reply, != expect %d',post.locate_id,cnt,post.nr_reply+1)
			# Store the fetched count minus one so the next run looks for new replies again.
			post.nr_reply=max(cnt-1,0)
		post.save()
		self.stat_reply_add+=cnt
Пример #3
0
	def getOnePostSmart(self,post,from_page=1):
		'''Fetch the pages of ``post`` starting at ``from_page`` and store new replies.

		Paging is driven by the page itself rather than by the reply count
		shown in the topic list: the next page is requested while the page
		had at least one row and shows a "[>]" link, or while a row with the
		site's "sensitive keyword filtered" notice was seen on any page so
		far. The loop also ends on ``self.exitevent`` or when a page returns
		no data.

		Every row without a matching ``Reply`` (same post and ``locate_id``)
		is saved; the topic post is stored with ``locate_id`` 0 and may update
		``post.title``. At the end ``post.nr_reply`` is set from the number of
		``Reply`` rows in the DB (minus one, floored at zero) and ``post`` is
		saved.
		'''
		filtered_notice='浏览的页面或提交的数据包含敏感关键词信息,该关键词已经被过滤'
		cnt=0
		sr=urlparse.urlsplit(post.url)
		if not sr.scheme:
			posturl=post.sector.base_url+post.url
		else:
			posturl=post.url

		posturl+='&TopicPage=%d'
		posturl=posturl.replace('topicdisplay.asp','topicdisplay_safe.asp')

		parser=etree.HTMLParser()
		pg=from_page
		while True:
			if self.exitevent.is_set():
				self.logger.info('got exit signal!')
				break
			pageurl=posturl%pg
			self.logger.debug('post %s %s ...',post.locate_id,pageurl)
			data=self._getData(pageurl,None,post.title)
			if not data:
				# Nothing to parse; stop paging instead of crashing in
				# etree.fromstring (retrying the same page could loop forever).
				self.logger.debug('post %s %s returned no data, stop paging',post.locate_id,pageurl)
				break

			tree=etree.fromstring(data,parser)

			posts=tree.xpath('//table[@class="maintable"][1]/tr[position()>1]')
			haspostinpage=False
			gotfuckingword=False
			for item in posts:
				haspostinpage=True
				try:
					author=item[0].xpath('a/text()')[0]
				except IndexError:
					self.logger.debug('no author info?')
					author=''

					itemhtml=htmlentitydecode(etree.tostring(item))
					if filtered_notice in itemhtml:
						self.logger.info('got page contains words! %s ',itemhtml)
						gotfuckingword=True
						continue

				try:
					crt_date=item[1].xpath('font/text()')[0][6:-1]
					# Detect the topic (opening) post: it carries no ReplyID link.
					tmp=item[1].xpath('*[position()<5]')
					if len(tmp)==4 and [x.tag for x in tmp]==['b','hr','font','hr']: # topic post
						replyid=0
						try:
							realtitle=item[1].xpath('b/text()')[0]
						except IndexError:
							try:
								realtitle=item[1].xpath('b/font/b/text()')[0]
							except IndexError:
								realtitle=item[1].xpath('b/font/text()')[0]

						if post.title!=realtitle:
							# Persisted by the post.save() at the end of the method.
							self.logger.debug('post %s realtitle %s',post.locate_id,realtitle)
							post.title=realtitle
						# Drop the non-content nodes (title / separators / post time)
						# so that only the message body is left in the cell.
						for x in tmp:
							item[1].remove(x)
						item[1].text=''
					else: # ordinary reply
						replyurl=item[1].xpath('font/a[1]/@href')[0]
						replyid=re.search(r'ReplyID=(\d+)',replyurl).group(1)
						# Drop the non-content nodes (separator / post time).
						for x in item[1].xpath('*[position()<3]'):
							item[1].remove(x)

					replycontent=self.exclude_first_td_tag.match(htmlentitydecode(etree.tostring(item[1])).strip()).group(1)
					if replycontent.startswith('<br/>'):
						replycontent=replycontent[5:]

					if author=='':
						author='usr for %d-%s'%(post.id,replyid)

					try:
						r=Reply.objects.filter(post=post,locate_id=replyid)[0:1].get()
					except Reply.DoesNotExist:
						r=Reply(post=post,locate_id=replyid,crt_date=crt_date,author=author,content=replycontent)
						try:
							r.save()
						except _mysql_exceptions.Warning:
							# Retry once with content taken from the raw (undecoded) markup,
							# applying the same leading-<br/> strip as the primary path.
							self.logger.debug('got _mysql_exceptions.Warning!')
							fallback=self.exclude_first_td_tag.match(etree.tostring(item[1]).strip()[:-5]).group(1)
							if fallback.startswith('<br/>'):
								fallback=fallback[5:]
							r.content=fallback
							r.save()

						cnt+=1
				except (IndexError,AttributeError):
					# A row that is really the keyword-filter notice is skipped;
					# any other parse failure is a genuine error.
					itemhtml=htmlentitydecode(etree.tostring(item))
					if filtered_notice in itemhtml:
						self.logger.info('got page contains words! %s ',itemhtml)
						gotfuckingword=True
						continue
					else:
						raise

			# Decide whether there is another page to fetch.
			x=tree.xpath('//td[@class="outtd"][1]/table[2]/tr[1]/td[2]/a')
			if ('[>]' in [t.text for t in x]) and haspostinpage:
				pg+=1
			elif gotfuckingword: # filtered page: still try the following page
				pg+=1
			else:
				break


		self.logger.debug('post %s %+d reply',post.locate_id,cnt)
		# Take the reply count from what is really stored in the DB.
		actualcnt=Reply.objects.filter(post=post).count()
		self.logger.debug('post %s actual %d reply in DB',post.locate_id,actualcnt)
		post.nr_reply=max(actualcnt-1,0)
		post.save()
Пример #4
0
	def getPost(self,post,new_nr=0):
		'''Fetch the ``showtopic.aspx`` pages of ``post`` and store new replies.

		The page range comes from ``self.getPageRange``. On each page every
		table under ``div#postsContainer`` is inspected: a table with class
		``plh`` only carries the thread title (used to refresh
		``post.title``); any other table is one reply, saved when no ``Reply``
		with the same post, author and ``locate_id`` exists. Finally
		``post.nr_reply`` is reconciled with the number of replies stored in
		this run and ``post`` is saved.

		Raises:
			IndexError: when a reply table lacks author, id, message or date.
		'''
		cnt=0
		sr=urlparse.urlsplit(post.url)
		if not sr.scheme:
			posturl=urlparse.urljoin(post.sector.base_url, 'showtopic.aspx?topicid=%s&page=%%d'%post.locate_id)
		else:
			# NOTE(review): this URL is used as a "%d" page template below; an
			# absolute post.url without a format spec would make "posturl%pg"
			# raise TypeError -- confirm what callers store here.
			posturl=post.url

		startpage,stoppage=self.getPageRange(self.NR_REPLY_PER_PAGE,post.nr_reply,new_nr)

		self.logger.debug('page range [%d,%d) for post %s',startpage,stoppage,post.locate_id)
		parser=etree.HTMLParser()
		for pg in xrange(startpage,stoppage):
			pageurl=posturl%pg
			self.logger.debug('post %s %s ...',post.locate_id,pageurl)
			data=self._getData(pageurl,None,post.title)
			if not data:
				# Nothing to parse; etree.fromstring would fail on None or ''.
				self.logger.debug('post %s %s returned no data, skip',post.locate_id,pageurl)
				continue
			tree=etree.fromstring(data,parser)

			for p in tree.xpath('//div[@id="postsContainer"]/table'):
				if p.get('class')=='plh':
					# Header table: only the thread title lives here.
					titles=p.xpath('./tbody/tr/td[@class="posttopic"]/h1[@class="ts z"]/span/text()')
					realtitle=titles[0] if titles else None
					if realtitle and post.title!=realtitle:
						self.logger.debug('realtitle for post %s|%s',post.locate_id,realtitle)
						post.title=realtitle

					continue

				author=p.xpath('./tbody[1]/tr[1]/td[@class="postauthor"]/div[@class="poster"]/span/text()')[0]
				replyid=p.xpath('./@id')[0]
				# Prefer the "firstpost" wrapper when present, else the whole
				# message div. These lookups are document-wide but keyed by the
				# reply id (presumably unique per page).
				message=p.xpath('//div[@id="message%s"]/div[@id="firstpost"]'%replyid)
				if not message:
					message=p.xpath('//div[@id="message%s"]'%replyid)
				replycontent=htmlentitydecode(etree.tostring(message[0])).strip()
				replycontent=self.exclude_first_div_tag.match(replycontent).group(1).strip()

				# Must be relative (".//"): an XPath starting with "//" is
				# evaluated against the whole document, which would give every
				# reply the date of the first post on the page.
				crt_date=p.xpath('.//div[@class="postinfo"]/em/span/@title')[0]

				try:
					r=Reply.objects.filter(post=post, author=author, locate_id=replyid)[0:1].get()
				except Reply.DoesNotExist:
					r=Reply(post=post,locate_id=replyid,crt_date=crt_date,author=author,content=replycontent)
					r.save()
					cnt+=1

		if new_nr!=0:
			if post.nr_reply+cnt==new_nr:
				self.logger.info('post %s %+d reply. now %d',post.locate_id,cnt,new_nr+1)
				post.nr_reply+=cnt # add the number of replies actually stored in this run
			else:
				self.logger.debug('post %s %+d reply, %d != expect %d',post.locate_id,cnt,post.nr_reply+cnt,new_nr)
				# Counts disagree: fall back to what is really in the DB.
				actualcnt=Reply.objects.filter(post=post).count()
				self.logger.info('post %s actual %d reply in DB',post.locate_id,actualcnt)
				post.nr_reply=max(actualcnt-1,0)
		else:
			if post.nr_reply+1==cnt:
				self.logger.info('post %s init %+d reply.',post.locate_id,cnt)
			else:
				self.logger.info('post %s init %+d reply, != expect %d',post.locate_id,cnt,post.nr_reply+1)
			# Store the fetched count minus one so the next run looks for new replies again.
			post.nr_reply=max(cnt-1,0)
		post.save()