Python rfindall 예제들, re.rfindall Python 예제들

예제 #1

0

파일 보기

	def get_visitors(self, account):
		"""Collect the accounts that commented on or reacted to timeline items.

		Calls ``self.get_timeline(account)`` first, then scans every element with
		class name ``commentable_item`` for comment-author links. When an item
		links to a reaction browser, that page is opened, expanded and scanned
		as well. Each link is converted with ``self.link2account``; links that
		cannot be converted (``None``) are skipped. Accounts are de-duplicated by
		their ``'id'`` and the account being examined is never listed itself.

		The collected accounts are written to ``visitors.csv`` and
		``visitors.json`` under ``account['path']``.

		Returns:
			The set of ``'path'`` values of the collected visitors, or ``None``
			if ``self.chrome.stop_check()`` returns a true value before a
			reaction page is opened (nothing is written in that case).
		"""
		self.get_timeline(account)
		visitors = []	# accounts in order of first appearance
		visitor_ids = {account['id']}	# seeded with the owner so the owner is never added

		def remember(link):
			# link2account may return None; guard before touching 'id'
			visitor = self.link2account(link)
			if visitor is not None and visitor['id'] not in visitor_ids:
				visitors.append(visitor)
				visitor_ids.add(visitor['id'])

		for item in self.chrome.get_outer_html('ClassName', 'commentable_item'):
			# comment authors
			for link in rfindall(
				r'<a class="[^"]+" data-hovercard="/ajax/hovercard/user\.php\?id=[^"]+" href="[^"]+"[^>]*>[^<]+</a>',
				item
			):
				remember(link)
			# reactions
			href = self.ct.search('href="/ufi/reaction/profile/browser/[^"]+', item)
			if href is None:
				continue
			if self.chrome.stop_check():
				return
			self.navigate('https://www.facebook.com' + href[6:])	# [6:] drops the leading 'href="'
			self.chrome.expand_page(terminator=self.terminator)	# scroll through page
			self.rm_pagelets()	# remove bluebar etc.
			html = self.chrome.get_inner_html_by_id('content')	# get the necessary part of the page
			for link in rfindall(
				r' href="https://www\.facebook\.com/[^"]+" data-hovercard="/ajax/hovercard/user\.php\?id=[^"]+" data-hovercard-prefer-more-content-show="1"[^<]+</a>',
				html
			):
				remember(link)
		self.storage.write_2d(
			[[visitor[field] for field in self.ACCOUNT] for visitor in visitors],
			account['path'],
			'visitors.csv'
		)
		self.storage.write_json(visitors, account['path'], 'visitors.json')
		return {visitor['path'] for visitor in visitors}	# paths, not ids

예제 #2

0

파일 보기

파일: facebook.py 프로젝트: zammalhabe/somedo

 def get_friends(self, account):
     """Collect the friends of a profile or the members of a group.

     For ``account['type'] == 'profile'`` the ``<link>/friends`` page is
     opened; for ``'groups'`` the ``<link>/members`` page. The page is
     cleaned up, fully expanded, saved via ``self.chrome.page_pdf`` and
     the relevant container is scanned for account links, which are
     converted with ``self.link2account`` (``None`` results are dropped).
     The accounts are written to ``friends.csv``/``friends.json`` or
     ``members.csv``/``members.json`` under ``account['path']``.

     Returns:
         The set of ``'path'`` values of the accounts found. An empty set
         is returned for any other account type and when the container
         element is not present on the page.
     """
     kind = account['type']
     if kind == 'profile':
         page = 'friends'
         container_id = 'pagelet_timeline_medley_friends'
         pattern = r' href="https://www\.facebook\.com\/[^<]+=friends_tab" [^<]+</a>'
     elif kind == 'groups':
         page = 'members'
         container_id = 'groupsMemberBrowser'
         pattern = r' href="https://www\.facebook\.com\/[^<]+location=group" [^<]+</a>'
     else:
         return set()

     self.navigate('%s/%s' % (account['link'], page))
     path_no_ext = self.storage.modpath(account['path'], page)
     self.rm_pagelets()  # remove bluebar etc.
     if kind == 'profile':
         self.rm_left()
     else:
         self.rm_right()
         self.chrome.set_x_left()
     # no limit here - it makes no sense not getting all friends/members
     self.chrome.expand_page(path_no_ext=path_no_ext)
     if kind == 'groups':
         self.rm_left()
     self.chrome.page_pdf(path_no_ext)

     html = self.chrome.get_inner_html_by_id(container_id)
     if html is None:
         # same type as every other exit, so callers can always treat
         # the result as a set
         return set()

     accounts = []
     for link in rfindall(pattern, html):
         found = self.link2account(link)
         if found is not None:  # keep only links that could be parsed
             accounts.append(found)
     self.storage.write_2d(
         [[entry[field] for field in self.ACCOUNT] for entry in accounts],
         account['path'], page + '.csv')
     self.storage.write_json(accounts, account['path'], page + '.json')
     return {entry['path'] for entry in accounts}

예제 #3

0

파일 보기

파일: facebook.py 프로젝트: zammalhabe/somedo

    def get_timeline_posts(self, account):
        """Visit the posts found on the currently loaded timeline one by one.

        Reads the inner HTML of the element with id
        ``structured_composer_async_container``, looks for ``<article>``
        opening tags and, for each one carrying an ``mf_story_key.<digits>``
        marker, builds the ``m.facebook.com/story.php`` URL of that post from
        the post id and ``account['id']``. The post page is then opened,
        expanded (bounded by ``self.options['limitTimeline']``) and saved via
        ``self.chrome.page_pdf`` under a numbered path derived from
        ``account['path']``.

        Does nothing when the container element is not found.

        NOTE(review): the visible code ends on an assignment whose value is
        not used afterwards - the listing may be cut off; confirm against the
        full file.
        """
        html = self.chrome.get_inner_html_by_id(
            'structured_composer_async_container')
        if html != None:
            cnt = 1  # running number used in the output file names
            # each match is the opening tag of an <article> element
            for i in rfindall('<article class="[^"]+" data-store="[^"]+"',
                              html):
                # presumably self.ct.search returns the matched text or None - confirm
                key = self.ct.search('mf_story_key\.[0-9]+', i)
                if key == None:
                    continue  # article without a story key - skip it
                pid = key[13:]  # drop the 13-character 'mf_story_key.' prefix
                url = 'https://m.facebook.com/story.php?story_fbid=%s&id=%s' % (
                    pid, account['id'])
                path_no_ext = self.storage.modpath(
                    account['path'], 'post_%05d_pid_%s' % (cnt, pid))
                cnt += 1
                self.logger.debug('Facebook: Writing %s to files %s.*' %
                                  (url, path_no_ext))
                self.navigate(url)
                #				self.chrome.expand_page()
                #				self.rm_m_composer(pid)
                #				self.chrome.entire_page_png(path_no_ext)

                # self.rm_composer_code(pid) is called here and its return
                # value is what gets passed as per_page_action
                self.chrome.expand_page(
                    path_no_ext=path_no_ext,
                    limit=self.options['limitTimeline'],
                    per_page_action=self.rm_composer_code(pid))

                self.chrome.page_pdf(path_no_ext)
                if self.options['extendNetwork']:
                    # rebinds the outer name 'html'; not used in the visible code
                    html = self.chrome.get_inner_html_by_id('ufi_%s' % pid)

예제 #4

0

파일 보기

파일: instagram.py 프로젝트: ask4eazy/somedo

	def get_links(self):
		"""Append the post links found in the first ``<article>`` element to ``self.links``.

		Every ``<a href="/p/...`` occurrence in the outer HTML of the first
		element with tag name ``article`` is reduced to its href value and
		appended to ``self.links`` unless it is already in the list.

		This is best effort: any ordinary error (for example no ``article``
		element being present) leaves ``self.links`` as it is. Interpreter-level
		signals such as ``KeyboardInterrupt`` are not suppressed.
		"""
		try:
			article = self.chrome.get_outer_html('TagName', 'article')[0]
			for match in rfindall('<a href="/p/[^"]+', article):
				link = match[9:]	# strip the leading '<a href="'
				if link not in self.links:
					self.links.append(link)
		except Exception:
			# deliberate best effort - a page without usable links is not an error
			pass

예제 #5

0

파일 보기

	def click_translations(self):
		"""Click every translation placeholder found in the main content container.

		The containers ``recent_capsule_container``,
		``pagelet_timeline_main_column`` and ``pagelet_group_mall`` are tried in
		that order; the first one that exists is scanned for
		``<span id="translationSpinnerPlaceholder_...">`` tags and each such id
		is passed to ``self.chrome.click_element_by_id``. Nothing happens when
		none of the containers is present.
		"""
		container_ids = (
			'recent_capsule_container',
			'pagelet_timeline_main_column',
			'pagelet_group_mall',	# was misspelled 'pagelett_group_mall', so it could never match
		)
		html = None
		for container_id in container_ids:
			html = self.chrome.get_inner_html_by_id(container_id)
			if html is not None:
				break
		if html is None:
			return
		for tag in rfindall('<span id="translationSpinnerPlaceholder_[^"]+"', html):
			# [10:-1] strips the leading '<span id="' and the closing quote
			self.chrome.click_element_by_id(tag[10:-1])

예제 #6

0

파일 보기

파일: facebook.py 프로젝트: zammalhabe/somedo

 def get_photos(self, account):
     """Save the photo overview of an account and then every linked photo page.

     The overview page depends on ``account['type']``: ``'pg'`` uses
     ``https://www.facebook.com/pg/<path>/photos``, ``'groups'`` uses
     ``<link>/photos`` and everything else ``<link>/photos_all``. The
     overview is cleaned up, expanded (bounded by
     ``self.options['limitPhotos']``) and saved via
     ``self.chrome.page_pdf``.

     Afterwards the photo links inside the type-specific container are
     visited one by one. Each photo page is saved under a numbered path
     and the displayed image is downloaded on a best-effort basis (an
     ordinary download error is ignored).

     The method returns ``None``. It stops early when the container is
     missing, when ``self.chrome.stop_check()`` returns a true value, or
     once the photo counter reaches 100000.
     """
     kind = account['type']
     if kind == 'pg':
         self.navigate('https://www.facebook.com/pg/%s/photos' %
                       account['path'])
     elif kind == 'groups':
         self.navigate(account['link'] + '/photos')
     else:
         self.navigate(account['link'] + '/photos_all')
     path_no_ext = self.storage.modpath(account['path'], 'photos')
     self.rm_pagelets()  # remove bluebar etc.
     self.rm_right()
     self.expand_page(path_no_ext=path_no_ext,
                      limit=self.options['limitPhotos'])
     self.rm_left()
     self.chrome.page_pdf(path_no_ext)

     # Per account type: container to scan, pattern that finds a photo
     # link, and the slice that cuts the bare URL out of a match.
     if kind == 'pg':
         container_id = 'content_container'
         pattern = r'<a href="https://www\.facebook\.com/[^"]+/photos/[^"]+" rel="theater">'
         url_part = slice(9, -16)  # between '<a href="' and '" rel="theater">'
     elif kind == 'groups':
         container_id = 'pagelet_group_photos'
         pattern = r' href="https://www\.facebook\.com/photo\.php\?[^"]+'
         url_part = slice(7, None)  # after ' href="'
     else:
         container_id = 'pagelet_timeline_medley_photos'
         # the '?' is escaped so it matches the literal query separator
         pattern = r'ajaxify="https://www\.facebook\.com/photo\.php\?[^"]*"'
         url_part = slice(9, -1)  # between 'ajaxify="' and the closing quote

     html = self.chrome.get_inner_html_by_id(container_id)
     if html is None:
         return

     cnt = 1  # to number screenshots
     for match in rfindall(pattern, html):
         if self.chrome.stop_check():
             return
         self.navigate(match[url_part])
         self.chrome.rm_outer_html_by_id(
             'photos_snowlift')  # show page with comments
         path_no_ext = self.storage.modpath(account['path'],
                                            '%05d_photo' % cnt)
         self.rm_pagelets()  # remove bluebar etc.
         self.expand_page(path_no_ext=path_no_ext,
                          limit=self.options['limitPhotos'],
                          expand=self.options['expandPhotos'],
                          translate=self.options['translatePhotos'])
         self.chrome.page_pdf(path_no_ext)
         try:
             image_html = self.chrome.get_outer_html(
                 'ClassName', 'scaledImageFitWidth img')[0]
             self.storage.download(self.ct.src(image_html),
                                   account['path'],
                                   '%05d_image.jpg' % cnt)
         except Exception:
             # best effort - a missing image must not stop the run
             pass
         cnt += 1
         if cnt == 100000:  # the %05d numbering would overflow
             break
         self.chrome.go_back()