示例#1
0
    def crawl_friends(self, targeturl, callback):
        """For each friend of a target, execute callback.
        Callback format: name, profile_url, image_url, count
        """
        seen = 0
        # open the target's friends page
        self.load(join_url(targeturl, page_references.get('friends_page')))

        while True:
            friends = self.driver.find_elements_by_css_selector(css_selectors.get('friends_selector'))
            # scrolling produced no new entries -> we are done
            if len(friends) <= seen:
                return seen

            # only process entries that appeared since the previous pass
            for entry in friends[seen:]:
                if self.stop_request:
                    return seen
                seen += 1
                info = entry.find_element_by_xpath(xpath_selectors.get('friend_info'))
                image = entry.find_element_by_css_selector(css_selectors.get('friend_image'))
                callback(info.text, info.get_attribute('href'), image.get_attribute('src'), seen)

            self.scroll_to_bottom(wait=True)
示例#2
0
    def crawl_likes(self, targeturl, callback):
        """For each like of a target, execute callback.
        Callback format: Liked Page Name, Liked Page URL, count
        """
        processed = 0
        # open the target's likes page
        self.load(join_url(targeturl, page_references.get('likes_page')))

        while True:
            likes = self.driver.find_elements_by_xpath(xpath_selectors.get('likes_selector'))
            # scrolling produced no new entries -> we are done
            if len(likes) <= processed:
                return processed

            # only process entries that appeared since the previous pass
            for entry in likes[processed:]:
                if self.stop_request:
                    return processed
                processed += 1
                callback(entry.text, entry.get_attribute('href'), processed)

            self.scroll_to_bottom(wait=True)
示例#3
0
 def crawl_albums(self, targeturl, callback):
     """For each photo album of a target, execute callback.
     Callback format: album_name, album_url, count
     """
     self.load(join_url(targeturl, page_references.get('albums')))
     elements = self.driver.find_elements_by_css_selector(css_selectors.get('indiv_albums'))
     # pull name/url pairs out of the DOM before invoking any callbacks
     pairs = [(el.text, el.get_attribute('href')) for el in elements]
     count = 0
     for name, url in pairs:
         if self.stop_request:
             return count
         count += 1
         callback(name, url, count)
     return count
示例#4
0
 def crawl_about(self, targeturl, callback):
     """For each section of a target's about page, execute callback.
     Callback format: section_title, section_text
     """
     self.load(join_url(targeturl, page_references.get('about_page')))
     links = self.driver.find_elements_by_css_selector(css_selectors.get('about_links'))
     count = 0
     for link in links:
         if self.stop_request:
             break
         # switch the main pane to this section and wait for it to render
         link.click()
         self.delay()
         section_title = link.get_attribute('title')
         pane = self.driver.find_element_by_css_selector(css_selectors.get('about_main'))
         callback(section_title, pane.text)
         count += 1
     return count
示例#5
0
    def crawl_checkins(self, targeturl, callback):
        """For each check-in of a target, execute callback.
        Callback format: check_in_name, check_in_url, count

        Scrolls the page repeatedly; only elements that appeared since the
        previous pass are processed, so each check-in is reported exactly once.
        """
        self.load(join_url(targeturl, page_references.get('checkins')))
        count = 0
        while True:
            # get check-ins, break if scrolling yielded no new ones
            checkins = self.driver.find_elements_by_css_selector(css_selectors.get('checkins'))
            if len(checkins) <= count:
                break

            # BUG FIX: slice from `count` so elements already reported on a
            # previous pass are not re-sent to the callback after each scroll
            # (matches crawl_friends / crawl_likes / crawl_photos).
            for p in checkins[count:]:
                if self.stop_request:
                    return count

                count += 1
                callback(p.text, p.get_attribute('href'), count)

            self.scroll_to_bottom(wait=True)

        return count
示例#6
0
    def crawl_groups(self, targeturl, callback):
        """For each group of a target, execute callback.
        Callback format: group_name, group_url, count

        Scrolls the page repeatedly; only elements that appeared since the
        previous pass are processed, so each group is reported exactly once.
        """
        self.load(join_url(targeturl, page_references.get('groups_page')))
        count = 0
        while True:
            # get groups, break if scrolling yielded no new ones
            groups = self.driver.find_elements_by_css_selector(css_selectors.get('groups'))
            if len(groups) <= count:
                break

            # BUG FIX: slice from `count` so elements already reported on a
            # previous pass are not re-sent to the callback after each scroll
            # (matches crawl_friends / crawl_likes / crawl_photos).
            for g in groups[count:]:
                if self.stop_request:
                    return count

                count += 1
                callback(g.text, g.get_attribute('href'), count)

            self.scroll_to_bottom(wait=True)

        return count
示例#7
0
    def crawl_photos(self, targeturl, callback):
        """For each photo of a target, execute callback.
        Callback format: photo_source_url, photo_description, photo_post_permalink, count

        Note: the description is not the caption posted with the photo but
        Facebook's auto-generated alt text, e.g. "trees, person, smiling" —
        objects its algorithms detected in the image.
        """
        self.load(join_url(targeturl, page_references.get('photos_page')))

        handled = 0
        while True:
            photos = self.driver.find_elements_by_css_selector(css_selectors.get('photo_selector'))
            # scrolling produced no new entries -> we are done
            if len(photos) <= handled:
                return handled

            # only process entries that appeared since the previous pass
            for photo in photos[handled:]:
                if self.stop_request:
                    return handled

                # direct url of the image file
                source_url = photo.get_attribute('data-starred-src')

                # the anchor carries both the auto-generated description
                # and the permalink of the image post
                anchor = photo.find_element_by_css_selector('a')
                description = anchor.get_attribute('aria-label')
                permalink = anchor.get_attribute('href')

                handled += 1
                callback(source_url, description, permalink, handled)

            self.scroll_to_bottom(wait=True)