def get_visitors(self, account):
    """Collect accounts that commented on or reacted to timeline items.

    The timeline is loaded through ``self.get_timeline(account)`` first.
    Every ``commentable_item`` element is then scanned for comment authors,
    and, if the item links to a reaction browser page, that page is opened
    and scanned for the reacting accounts as well.

    The collected accounts are written to ``visitors.csv`` and
    ``visitors.json`` below ``account['path']``.

    Returns:
        The set of ``path`` values of all distinct visitors, or ``None`` if
        ``self.chrome.stop_check()`` reported a stop request while reaction
        pages were being processed (nothing is written in that case).
    """
    comment_author_pattern = (
        r'<a class="[^"]+" data-hovercard="/ajax/hovercard/user\.php\?id=[^"]+" '
        r'href="[^"]+"[^>]*>[^<]+</a>'
    )
    reaction_author_pattern = (
        r' href="https://www\.facebook\.com/[^"]+" '
        r'data-hovercard="/ajax/hovercard/user\.php\?id=[^"]+" '
        r'data-hovercard-prefer-more-content-show="1"[^<]+</a>'
    )

    self.get_timeline(account)
    visitors = []  # account records, in order of first appearance
    # Seeded with the owner's id so the account itself is never listed.
    visitor_ids = {account['id']}

    def remember(link_html):
        # link2account may yield None when no account can be extracted;
        # such links are skipped instead of raising on visitor['id'].
        visitor = self.link2account(link_html)
        if visitor is not None and visitor['id'] not in visitor_ids:
            visitors.append(visitor)
            visitor_ids.add(visitor['id'])

    items = self.chrome.get_outer_html('ClassName', 'commentable_item')
    for item in items:
        # Comment authors are embedded directly in the item's markup.
        for link_html in rfindall(comment_author_pattern, item):
            remember(link_html)

        # Reactions live on a separate page linked from the item.
        href = self.ct.search('href="/ufi/reaction/profile/browser/[^"]+', item)
        if href is None:
            continue
        if self.chrome.stop_check():
            return
        self.navigate('https://www.facebook.com' + href[6:])  # drop 'href="'
        self.chrome.expand_page(terminator=self.terminator)  # scroll through page
        self.rm_pagelets()  # remove bluebar etc.
        html = self.chrome.get_inner_html_by_id('content')
        for link_html in rfindall(reaction_author_pattern, html):
            remember(link_html)

    self.storage.write_2d(
        [[visitor[field] for field in self.ACCOUNT] for visitor in visitors],
        account['path'],
        'visitors.csv',
    )
    self.storage.write_json(visitors, account['path'], 'visitors.json')
    return {visitor['path'] for visitor in visitors}
def get_friends(self, account):
    """Collect the friends of a profile or the members of a group.

    For ``account['type'] == 'profile'`` the friends page is opened, for
    ``'groups'`` the members page. The page is expanded without a limit,
    saved as PDF, and the account links found in the list container are
    converted with ``self.link2account``. Results are written to
    ``friends.csv``/``friends.json`` or ``members.csv``/``members.json``
    below ``account['path']``.

    Returns:
        The set of ``path`` values of the accounts found. An empty set is
        returned for any other account type and when the list container is
        not present on the page.
    """
    if account['type'] == 'profile':
        basename = 'friends'
        container_id = 'pagelet_timeline_medley_friends'
        pattern = r' href="https://www\.facebook\.com\/[^<]+=friends_tab" [^<]+</a>'
        self.navigate('%s/friends' % account['link'])
        path_no_ext = self.storage.modpath(account['path'], basename)
        self.rm_pagelets()  # remove bluebar etc.
        self.rm_left()
        # No limit here: a partial friends list is of little use.
        self.chrome.expand_page(path_no_ext=path_no_ext)
        self.chrome.page_pdf(path_no_ext)
    elif account['type'] == 'groups':
        basename = 'members'
        container_id = 'groupsMemberBrowser'
        pattern = r' href="https://www\.facebook\.com\/[^<]+location=group" [^<]+</a>'
        self.navigate('%s/members' % account['link'])
        path_no_ext = self.storage.modpath(account['path'], basename)
        self.rm_pagelets()  # remove bluebar etc.
        self.rm_right()
        self.chrome.set_x_left()
        # No limit here either: all members are wanted.
        self.chrome.expand_page(path_no_ext=path_no_ext)
        self.rm_left()
        self.chrome.page_pdf(path_no_ext)
    else:
        return set()

    html = self.chrome.get_inner_html_by_id(container_id)
    if html is None:
        # No visible list. Return a set, like every other path does.
        return set()

    contacts = []
    for link_html in rfindall(pattern, html):
        contact = self.link2account(link_html)
        if contact is not None:  # keep only links that yielded account info
            contacts.append(contact)

    self.storage.write_2d(
        [[contact[field] for field in self.ACCOUNT] for contact in contacts],
        account['path'],
        '%s.csv' % basename,
    )
    self.storage.write_json(contacts, account['path'], '%s.json' % basename)
    return {contact['path'] for contact in contacts}
def get_timeline_posts(self, account):
    """Visit each post found on the timeline and save it as PDF.

    Post ids are taken from the ``mf_story_key.<digits>`` marker of every
    ``<article>`` tag inside ``structured_composer_async_container``. Each
    post is opened through its ``m.facebook.com/story.php`` URL, expanded
    (bounded by ``self.options['limitTimeline']``) and written to
    ``post_<counter>_pid_<post id>`` below ``account['path']``.

    Nothing happens when the container is not present. Returns ``None``.
    """
    html = self.chrome.get_inner_html_by_id('structured_composer_async_container')
    if html is None:
        return

    cnt = 1  # running number used in the output file names
    for article in rfindall('<article class="[^"]+" data-store="[^"]+"', html):
        key = self.ct.search(r'mf_story_key\.[0-9]+', article)
        if key is None:
            continue
        pid = key[13:]  # strip the leading 'mf_story_key.'
        url = 'https://m.facebook.com/story.php?story_fbid=%s&id=%s' % (
            pid, account['id'])
        path_no_ext = self.storage.modpath(
            account['path'], 'post_%05d_pid_%s' % (cnt, pid))
        cnt += 1
        self.logger.debug(
            'Facebook: Writing %s to files %s.*' % (url, path_no_ext))
        self.navigate(url)
        self.chrome.expand_page(
            path_no_ext=path_no_ext,
            limit=self.options['limitTimeline'],
            per_page_action=self.rm_composer_code(pid))
        self.chrome.page_pdf(path_no_ext)
        if self.options['extendNetwork']:
            # NOTE(review): the fetched markup is not used by the visible
            # code; presumably the network extension is unfinished. Kept
            # under its own name so it no longer shadows the timeline html.
            ufi_html = self.chrome.get_inner_html_by_id('ufi_%s' % pid)
def get_links(self):
    """Append new ``/p/...`` links from the first ``<article>`` to ``self.links``.

    The lookup is best effort: if the page has no ``<article>`` element or
    the browser query fails, the method returns without changing
    ``self.links``. Links already present in ``self.links`` are skipped.
    """
    try:
        article = self.chrome.get_outer_html('TagName', 'article')[0]
        anchors = rfindall('<a href="/p/[^"]+', article)
    except Exception:
        # Deliberately tolerant, but no longer a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return

    for anchor in anchors:
        link = anchor[9:]  # strip the leading '<a href="'
        if link not in self.links:
            self.links.append(link)
def click_translations(self):
    """Find the "See Translation" placeholders and click each of them.

    The markup is taken from the first of these containers that exists:
    ``recent_capsule_container``, ``pagelet_timeline_main_column``,
    ``pagelet_group_mall``. If none exists the method returns without
    clicking anything.
    """
    container_ids = (
        'recent_capsule_container',
        'pagelet_timeline_main_column',
        # Was misspelled 'pagelett_group_mall', so this fallback could
        # never match an element.
        'pagelet_group_mall',
    )
    for container_id in container_ids:
        html = self.chrome.get_inner_html_by_id(container_id)
        if html is not None:
            break
    else:
        return

    for span in rfindall('<span id="translationSpinnerPlaceholder_[^"]+"', html):
        # Strip the leading '<span id="' and the closing quote to get the id.
        self.chrome.click_element_by_id(span[10:-1])
def get_photos(self, account):
    """Save the photo overview and every linked photo page of an account.

    The overview page depends on ``account['type']`` (``'pg'``,
    ``'groups'`` or anything else, which is treated as a profile). It is
    expanded up to ``self.options['limitPhotos']`` and saved as PDF under
    the name ``photos``. Each photo link found in the type-specific
    container is then opened, saved as ``<counter>_photo`` PDF, and its
    image is downloaded as ``<counter>_image.jpg`` on a best-effort basis.

    Processing stops when ``self.chrome.stop_check()`` is true or when the
    counter reaches 100000 (file names use a five-digit counter).
    Returns ``None``.
    """
    # Per account type: overview URL, id of the container holding the photo
    # links, regex for one link, and the slice that cuts the URL out of a
    # regex match.
    if account['type'] == 'pg':
        overview_url = 'https://www.facebook.com/pg/%s/photos' % account['path']
        container_id = 'content_container'
        pattern = r'<a href="https://www\.facebook\.com/[^"]+/photos/[^"]+" rel="theater">'
        url_slice = slice(9, -16)  # between '<a href="' and '" rel="theater">'
    elif account['type'] == 'groups':
        overview_url = account['link'] + '/photos'
        container_id = 'pagelet_group_photos'
        pattern = r' href="https://www.facebook.com/photo\.php\?[^"]+'
        url_slice = slice(7, None)  # after ' href="'
    else:
        overview_url = account['link'] + '/photos_all'
        container_id = 'pagelet_timeline_medley_photos'
        # The '?' is escaped: unescaped it made the preceding 'p' optional
        # instead of matching the literal question mark.
        pattern = r'ajaxify="https://www\.facebook\.com/photo\.php\?[^"]*"'
        url_slice = slice(9, -1)  # between 'ajaxify="' and the closing quote

    self.navigate(overview_url)
    path_no_ext = self.storage.modpath(account['path'], 'photos')
    self.rm_pagelets()  # remove bluebar etc.
    self.rm_right()
    self.expand_page(path_no_ext=path_no_ext, limit=self.options['limitPhotos'])
    self.rm_left()
    self.chrome.page_pdf(path_no_ext)

    html = self.chrome.get_inner_html_by_id(container_id)
    if html is None:
        return

    cnt = 1  # numbers the per-photo output files
    for match in rfindall(pattern, html):
        if self.chrome.stop_check():
            return
        self.navigate(match[url_slice])
        self.chrome.rm_outer_html_by_id('photos_snowlift')  # show page with comments
        path_no_ext = self.storage.modpath(account['path'], '%05d_photo' % cnt)
        self.rm_pagelets()  # remove bluebar etc.
        self.expand_page(
            path_no_ext=path_no_ext,
            limit=self.options['limitPhotos'],
            expand=self.options['expandPhotos'],
            translate=self.options['translatePhotos'],
        )
        self.chrome.page_pdf(path_no_ext)
        try:
            image_html = self.chrome.get_outer_html(
                'ClassName', 'scaledImageFitWidth img')[0]
            self.storage.download(
                self.ct.src(image_html), account['path'], '%05d_image.jpg' % cnt)
        except Exception:
            # Best effort: the image element may be missing or the download
            # may fail; the PDF of the page has already been written.
            pass
        cnt += 1
        if cnt == 100000:
            break
        self.chrome.go_back()