Пример #1
0
	def friends(self):
		"""Parse the saved friend-list pages of every user and record the results.

		Each entry of the directory returned by ``self.browser.getPWDFriendPage()``
		is treated as one user's sub-directory, named after the user's renrenId.
		Pages whose name starts with ``parsered_`` are skipped; every other page is
		passed to ``self.friendPage`` and the returned sets are merged (the items
		are used as ``(renrenId, name)`` pairs below).

		An empty sub-directory gets a ``parsered_<id>_noPermision.html`` flag file
		and the placeholder pair ``('1', 'unavailable')``. The merged pairs are
		written to the database through ``RenrenDb`` when ``self.recorder`` is
		None, otherwise handed to ``self.recorder``. Finally every page parsed in
		this run is renamed with the ``parsered_`` prefix.
		"""
		root = self.browser.getPWDFriendPage()
		for cnt, renrenId in enumerate(os.listdir(root)):
			print("parsing {}: {}".format(cnt, renrenId))
			pwd = root+'/'+renrenId+'/'
			pages = os.listdir(pwd)
			# pairs collected from the pages parsed in this run
			flist = set()
			# pages parsed in this run, remembered so they can be renamed afterwards
			parsering = []
			for page in pages:
				if page.startswith('parsered_'):
					# already handled by an earlier run
					continue
				parsering.append(page)
				flist = flist | self.friendPage(pwd+page)

			if not pages:
				# Nothing was saved for this user: create an empty flag file so the
				# directory is skipped next time. Mode 'w' is required because the
				# file does not exist yet (the default read mode would raise).
				with open(pwd+'parsered_{}_noPermision.html'.format(renrenId), 'w'):
					pass
				flist = {('1', 'unavailable')}
			elif not flist:
				# NOTE(review): this also skips pages that were parsed just now but
				# yielded no pairs; they are not renamed and will be parsed again.
				continue

			if self.recorder is None:
				# insert pairs into temp_profile and relations into temp_relation
				db = RenrenDb()
				# NOTE(review): both statements are built by string formatting from
				# parsed page content; switch to driver placeholders once the
				# paramstyle of RenrenDb's connection is confirmed.
				sqlProfile = 'insert into {} (renrenId,name) values {}'.format(db.temp_profile, str(flist).strip('{}'))
				relation = ','.join('({},{})'.format(renrenId, str(pair[0])) for pair in flist)
				sqlRelation = 'insert into {} (renrenId1,renrenId2) values {}'.format(db.temp_relation, relation)
				conn = db.getConn()
				try:
					cur = conn.cursor()
					try:
						cur.execute(sqlProfile)
						cur.execute(sqlRelation)
						conn.commit()
					finally:
						cur.close()
				finally:
					# release the connection even when an insert fails
					conn.close()
			else:
				self.recorder.addNames(flist)
				self.recorder.addFriends(renrenId, {str(pair[0]) for pair in flist})

			# mark the pages parsed in this run as done
			for old in parsering:
				os.rename(pwd+old, pwd+'parsered_'+old)
Пример #2
0
	def statusPage(self,filename,mainId=None):
		"""Parse the status items of one saved status page and insert them into temp_status.

		Args:
			filename: path of the saved page, read as UTF-8.
			mainId: value stored in the ``renrenId`` column of every inserted row.

		Returns:
			0 when the page contains no status item or when the insert fails;
			otherwise whatever ``cur.execute`` returns for the insert
			(presumably the affected row count -- confirm against the driver).

		Raises:
			IndexError: when a status item lacks one of the expected fragments
				(status id, ``<h3>`` content, duration, ...).
		"""
		with open(filename, 'r', encoding='utf-8') as f:
			# Keep the repr of the line list: the patterns below match literal
			# backslash escapes (\' and \n) that only exist in that repr.
			htmlStr=str(f.readlines())

		# patterns are compiled once here instead of once per item
		itemRe=re.compile(r'id="status-.+?ilike_icon')
		statusIdRe=re.compile(r'"status-\d+">')
		contentRe=re.compile(r'<h3>.+</h3>')
		digitsRe=re.compile(r'\d+')
		# anchors replaced by "(renrenId=N)", applied in this order:
		# @-mention links, normal profile links, public page links
		hrefRes=(
			re.compile(r"<a\s+href=\\'http://www.renren.com/g/\d+.+?>"),
			re.compile(r'<a\shref="http://www.renren.com/profile.do\?id=.+?>'),
			re.compile(r'<a\shref="http://page.renren.com/.+?>'),
		)
		imgRe=re.compile(r"<img\s+src=.+?alt=\\'.+?\\'.+?/>")
		imgDescRe=re.compile(r"\\'.+?\\")
		nameImgRe=re.compile(r"<img class=.+?alt=.+?http://a.xnimg.cn/.+?/>")
		textRe=re.compile(r'>:.+</')
		durationRe=re.compile(r'"duration">.+?\\n')
		stampRe=re.compile(r'>.+\\n')
		replyRe=re.compile(r'replyCount\d+">\(\d+\)')
		countRe=re.compile(r'\(\d+\)')

		rows=[]
		for item in itemRe.findall(htmlStr):
			statusId=statusIdRe.findall(item)[0].replace('status','').strip('-=<>"')

			contentDraft=contentRe.findall(item)[0]
			# simplify content: collapse every link to the id it points at
			for hrefRe in hrefRes:
				for href in hrefRe.findall(contentDraft):
					renrenId=digitsRe.findall(href)[0]
					contentDraft=contentDraft.replace(href,'(renrenId={})'.format(renrenId))
			# replace images by their description: the second quoted fragment of the tag
			for img in imgRe.findall(contentDraft):
				imgDesc=imgDescRe.findall(img)[1].strip("=\\'")
				contentDraft=contentDraft.replace(img,'(img={})'.format(imgDesc))
			# drop images embedded in the author name
			for img in nameImgRe.findall(contentDraft):
				contentDraft=contentDraft.replace(img,'')
			content=textRe.findall(contentDraft)[0].replace('</a>','').replace('\\n','').strip("<>/:',\\ ")

			duration=durationRe.findall(item)[0]
			timestamp=stampRe.findall(duration)[0].strip('=<>"\\n')

			replyCountDraft=replyRe.findall(item)
			if replyCountDraft:
				replyCount=countRe.findall(replyCountDraft[0])[0].strip('()')
			else:
				replyCount='0'
			# NOTE(review): values are spliced into the SQL text; single quotes in
			# the content are swapped for double quotes as the only escaping.
			# Prefer driver placeholders once the connection's paramstyle is known.
			rows.append("('{}','{}','{}','{}','{}')".format(statusId,mainId,content.replace("'",'"'),replyCount,timestamp))

		if not rows:
			# nothing to insert: skip the database instead of sending an
			# INSERT with an empty value list
			return 0

		sql="INSERT INTO temp_status(statusId,renrenId,content,replyCount,timestamp) values {}".format(','.join(rows))
		db=RenrenDb()
		conn=db.getConn()
		try:
			cur=conn.cursor()
			try:
				n=cur.execute(sql)
				conn.commit()
			except Exception as err:
				# best effort: report the failing page and statement, then carry on
				print('error. filename={}'.format(filename))
				print(sql)
				print('reason: {}'.format(err))
				n=0
			finally:
				cur.close()
		finally:
			conn.close()
		return n