def crawl_friends(self, targeturl, callback):
    """Invoke callback once for every friend listed on the target's friends page.

    Callback format: friend_name, friend_url, friend_image_url, count

    The page is re-queried after each scroll; only entries beyond the ones
    already reported are handed to the callback. Crawling ends when a
    re-query yields no additional entries, or as soon as
    ``self.stop_request`` is truthy.

    Returns the number of friends passed to the callback.
    """
    # open the friends page of the target
    self.load(join_url(targeturl, page_references.get('friends_page')))

    processed = 0
    while True:
        cards = self.driver.find_elements_by_css_selector(
            css_selectors.get('friends_selector'))

        # nothing new since the previous pass -> finished
        if processed >= len(cards):
            break

        for card in cards[processed:]:
            if self.stop_request:
                return processed
            processed += 1

            anchor = card.find_element_by_xpath(xpath_selectors.get('friend_info'))
            # arguments are evaluated left to right: name, profile url, image src
            callback(
                anchor.text,
                anchor.get_attribute('href'),
                card.find_element_by_css_selector(
                    css_selectors.get('friend_image')).get_attribute('src'),
                processed,
            )

        # presumably triggers loading of further entries -- confirm in scroll_to_bottom
        self.scroll_to_bottom(wait=True)

    return processed
def crawl_likes(self, targeturl, callback):
    """Run callback for each liked page found on the target's likes page.

    Callback format: Liked Page Name, Liked Page URL, count

    Returns the number of likes passed to the callback. Stops early (returning
    the count so far) when ``self.stop_request`` is truthy.
    """
    # open the likes page of the target
    self.load(join_url(targeturl, page_references.get('likes_page')))

    count = 0
    while True:
        entries = self.driver.find_elements_by_xpath(
            xpath_selectors.get('likes_selector'))

        # entries not yet reported; an empty slice means no more likes
        unseen = entries[count:]
        if not unseen:
            break

        for entry in unseen:
            if self.stop_request:
                return count
            count += 1
            callback(entry.text, entry.get_attribute('href'), count)

        # presumably triggers loading of further entries -- confirm in scroll_to_bottom
        self.scroll_to_bottom(wait=True)

    return count
def crawl_albums(self, targeturl, callback):
    """Run callback for each album found on the target's albums page.

    Callback format: album_name, album_url, count

    Returns the number of albums passed to the callback. Stops early (returning
    the count so far) when ``self.stop_request`` is truthy.
    """
    self.load(join_url(targeturl, page_references.get('albums')))
    album_elements = self.driver.find_elements_by_css_selector(
        css_selectors.get('indiv_albums'))

    # Read name and url of every album up front, before any callback runs.
    # NOTE(review): presumably so a callback that navigates away cannot
    # invalidate the elements -- confirm against callers.
    snapshot = []
    for element in album_elements:
        snapshot.append((element.text, element.get_attribute('href')))

    for position, (album_name, album_url) in enumerate(snapshot, 1):
        if self.stop_request:
            # albums reported so far, excluding the current one
            return position - 1
        callback(album_name, album_url, position)

    return len(snapshot)
def crawl_about(self, targeturl, callback):
    """Click through each section link of the target's about page.

    Callback format: section_title, section_text

    For every link: click it, wait via ``self.delay()``, then pass the link's
    ``title`` attribute and the text of the main pane to the callback.

    Returns the number of sections passed to the callback. Stops early
    (returning the count so far) when ``self.stop_request`` is truthy.
    """
    self.load(join_url(targeturl, page_references.get('about_page')))
    section_links = self.driver.find_elements_by_css_selector(
        css_selectors.get('about_links'))

    visited = 0
    for section_link in section_links:
        if self.stop_request:
            break

        section_link.click()
        self.delay()

        section_title = section_link.get_attribute('title')
        content_pane = self.driver.find_element_by_css_selector(
            css_selectors.get('about_main'))
        callback(section_title, content_pane.text)
        visited += 1

    return visited
def crawl_checkins(self, targeturl, callback):
    """Run callback for each check-in found on the target's check-ins page.

    Callback format: check_in_name, check_in_url, count

    The page is re-queried after each scroll. Only elements beyond the ones
    already reported are passed to the callback, so every check-in is
    reported exactly once and ``count`` matches the number of distinct
    elements seen.

    Returns the number of check-ins passed to the callback. Stops early
    (returning the count so far) when ``self.stop_request`` is truthy.
    """
    self.load(join_url(targeturl, page_references.get('checkins')))

    count = 0
    while True:
        # get check-ins, break if no more check-ins
        checkins = self.driver.find_elements_by_css_selector(
            css_selectors.get('checkins'))
        if len(checkins) <= count:
            break

        # Extract check-in info. Skip the first `count` elements: the query
        # returns everything currently on the page, including elements that
        # were already reported on a previous pass. Iterating the whole list
        # would report duplicates and inflate `count`, which in turn could
        # end the crawl prematurely.
        for checkin in checkins[count:]:
            if self.stop_request:
                return count
            count += 1
            callback(checkin.text, checkin.get_attribute('href'), count)

        # presumably triggers loading of further entries -- confirm in scroll_to_bottom
        self.scroll_to_bottom(wait=True)

    return count
def crawl_groups(self, targeturl, callback):
    """Run callback for each group found on the target's groups page.

    Callback format: group_name, group_url, count

    The page is re-queried after each scroll. Only elements beyond the ones
    already reported are passed to the callback, so every group is reported
    exactly once and ``count`` matches the number of distinct elements seen.

    Returns the number of groups passed to the callback. Stops early
    (returning the count so far) when ``self.stop_request`` is truthy.
    """
    self.load(join_url(targeturl, page_references.get('groups_page')))

    count = 0
    while True:
        # get groups, break if no more groups
        groups = self.driver.find_elements_by_css_selector(
            css_selectors.get('groups'))
        if len(groups) <= count:
            break

        # Extract group info. Skip the first `count` elements: the query
        # returns everything currently on the page, including elements that
        # were already reported on a previous pass. Iterating the whole list
        # would report duplicates and inflate `count`, which in turn could
        # end the crawl prematurely.
        for group in groups[count:]:
            if self.stop_request:
                return count
            count += 1
            callback(group.text, group.get_attribute('href'), count)

        # presumably triggers loading of further entries -- confirm in scroll_to_bottom
        self.scroll_to_bottom(wait=True)

    return count
def crawl_photos(self, targeturl, callback):
    """Run callback for each photo found on the target's photos page.

    Callback format: photo_source_url, photo_description, photo_post_permalink, count

    Note that the description is not the text the author posted with the
    photo. It is a description generated automatically by Facebook's
    algorithms, which can detect objects that might be in the image; for
    example "trees, person, smiling" could be a description.

    Returns the number of photos passed to the callback. Stops early
    (returning the count so far) when ``self.stop_request`` is truthy.
    """
    reported = 0
    self.load(join_url(targeturl, page_references.get('photos_page')))

    while True:
        tiles = self.driver.find_elements_by_css_selector(
            css_selectors.get('photo_selector'))

        # no tiles beyond those already reported -> no more photos
        if reported >= len(tiles):
            break

        for tile in tiles[reported:]:
            if self.stop_request:
                return reported

            # url of the image itself
            image_src = tile.get_attribute('data-starred-src')

            # the nested anchor carries the metadata
            anchor = tile.find_element_by_css_selector('a')
            generated_label = anchor.get_attribute('aria-label')
            # permalink url of the post the image belongs to
            post_url = anchor.get_attribute('href')

            reported += 1
            callback(image_src, generated_label, post_url, reported)

        # presumably triggers loading of further entries -- confirm in scroll_to_bottom
        self.scroll_to_bottom(wait=True)

    return reported