Пример #1
0
    def crawl_friends(self, targeturl, callback):
        """Crawl the friends page of <targeturl>.

        Callback format: friend_name, friend_url, friend_image_url, count

        Returns the number of friends counted. A stop request ends the crawl
        early and returns the count reached so far.
        """
        self.load(join_url(targeturl, page_references.get('friends_page')))

        seen = 0
        while True:
            friends = self.driver.find_elements_by_css_selector(css_selectors.get('friends_selector'))
            # only the entries that were not handled in a previous pass
            fresh = friends[seen:]
            if not fresh:
                # the list did not grow, so there is nothing left to report
                break

            for entry in fresh:
                if self.stop_request:
                    return seen
                seen += 1
                info = entry.find_element_by_xpath(xpath_selectors.get('friend_info'))
                friend_name = info.text
                friend_url = info.get_attribute('href')
                picture = entry.find_element_by_css_selector(css_selectors.get('friend_image'))
                callback(friend_name, friend_url, picture.get_attribute('src'), seen)

            self.scroll_to_bottom(wait=True)

        return seen
Пример #2
0
 def login(self, user, password):
     """Log in through the Facebook login form.

     The selectors and credentials are handed to the page scripts as
     arguments instead of being formatted into the JavaScript source, so a
     quote or backslash inside the user name or password can neither break
     nor alter the injected script.

     Returns True when the URL reached after submitting the form does not
     contain 'login'. Returns False otherwise, or when a WebDriverException
     is raised along the way.
     """
     fill_field = 'document.querySelector(arguments[0]).value = arguments[1];'
     submit_form = 'document.querySelector(arguments[0]).submit();'
     try:
         self.load('https://www.facebook.com/login.php')
         self.js(fill_field, css_selectors.get('email_field'), user)
         self.js(fill_field, css_selectors.get('password_field'), password)
         self.js(submit_form, css_selectors.get('login_form'))
         self.delay()
         # still being on a login url means the credentials were rejected
         return 'login' not in self.driver.current_url
     except WebDriverException:
         log.error('Couldn\'t load page. Are you connected to the internet?')
         return False
Пример #3
0
    def _grab_post_content(self, p, with_translation=True):
        """Extract the text of the post element <p> and, optionally, its translation.

        Returns a (post_text, translation) tuple. translation is '' when no
        translation was found. When <with_translation> is False the
        "See translation" link is never clicked, and the "See original" path
        returns '' as the translation.
        """
        translation = ''

        # if there's original content grab it
        # sometimes facebook automatically shows the translated text and not
        # the original.
        try:
            so = p.find_element_by_link_text(text_content['see_original'])
            # split the post text into the original post and translation
            parts = so.find_elements_by_xpath(xpath_selectors.get('trans_splitter'))
            # find_elements returns a list that may hold fewer than two entries;
            # only take this path when the translation part is really there,
            # otherwise fall through and grab the text normally instead of
            # failing with an IndexError
            if len(parts) > 1:
                translation = parts[1].text
                if self.force_click(p, so):
                    log.info('Found and clicked "See original"')
                # remove the translation from the post text
                post_text = p.text.replace(text_content['hide_original'], text_content['see_original'])
                post_text = post_text.replace(translation, '')
                if with_translation:
                    return post_text, translation
                return post_text, ''
        except NoSuchElementException:
            pass

        # grab the text normally
        post_text = p.text

        # expand the see more link if it exists
        try:
            # clicking the see more link can be quite unpredictable:
            # sometimes it takes a while for it to load and sometimes clicking
            # it seems to do nothing, so the click goes through force_click
            # (presumably the helper that retries -- confirm against its code).
            sm = p.find_element_by_css_selector(css_selectors.get('see_more'))
            if self.force_click(p, sm):
                log.info('Found and expanded "See more"')
                # re-read the text now that the post is expanded
                post_text = p.text
        except NoSuchElementException:
            pass

        # grab the translation if it exists and is needed
        if with_translation:
            try:
                st = p.find_element_by_link_text(text_content.get('see_translation_text'))
                # look the translation up from the link's grandparent element
                st_parent = st.find_element_by_xpath('../..')
                if self.force_click(p, st):
                    log.info('Found post translation')
                    translation = st_parent.find_element_by_css_selector(css_selectors.get('translation')).text
            except NoSuchElementException:
                pass

        return post_text, translation
0
 def crawl_about(self, targeturl, callback):
     """Click through every link of the target's about page.

     Callback format: link_title, main_pane_text

     Returns the number of links handed to the callback. A stop request
     ends the crawl early and returns the number handled so far.
     """
     about_url = join_url(targeturl, page_references.get('about_page'))
     self.load(about_url)
     sections = self.driver.find_elements_by_css_selector(css_selectors.get('about_links'))
     for visited, section_link in enumerate(sections):
         if self.stop_request:
             return visited
         # open the section and give the page time to react
         section_link.click()
         self.delay()
         section_title = section_link.get_attribute('title')
         pane = self.driver.find_element_by_css_selector(css_selectors.get('about_main'))
         callback(section_title, pane.text)
     return len(sections)
Пример #5
0
    def crawl_one_album(self, albumurl, callback):
        """Crawl every item of the album at <albumurl>.

        Callback format: photo_source_url, photo_post_permalink, count

        Returns the number of items handed to the callback. A stop request
        ends the crawl early and returns the count reached so far.
        """
        self.load(albumurl)

        done = 0
        while True:
            items = self.driver.find_elements_by_css_selector(css_selectors.get('album_photo'))
            # only the items that were not handled in a previous pass
            pending = items[done:]
            if not pending:
                # no more photos
                break

            for item in pending:
                if self.stop_request:
                    return done

                # element carrying the image
                try:
                    thumb = item.find_element_by_css_selector('img')
                except NoSuchElementException:
                    # no img tag: this is probably a video in the video folder
                    thumb = item.find_element_by_css_selector('span > div')

                source_url = self.get_bg_img_url(thumb)
                permalink = item.get_attribute('ajaxify')
                done += 1
                callback(source_url, permalink, done)

            self.scroll_to_bottom(wait=True)

        return done
Пример #6
0
 def is_valid_target(self, targeturl):
     """Tell whether <targeturl> is a valid profile.

     Loads the page and looks for the error header element. The target is
     valid when that element is missing, or when its text does not contain
     the known error message (compared case-insensitively).
     """
     try:
         self.load(targeturl)
         error_header = self.driver.find_element_by_css_selector(css_selectors.get('error_header'))
         shown_text = error_header.text
         error_message = text_content.get('error_header_text').lower()
     except NoSuchElementException:
         # no error header on the page at all
         return True
     return error_message not in shown_text.lower()
Пример #7
0
    def crawl_event_guests(self, url, callback, guest_filter=None):
        """Crawl the guest list dialog of the event at <url>.

        Callback format: guest_label, guest_name, guest_url, guest_image_url, total

        <guest_filter> holds the labels to scrape; a label is the lowercased
        first word of a guest button's text. It defaults to interested, going
        and invited.

        Returns the total number of guests handed to the callback. A stop
        request ends the crawl early and returns the total reached so far.
        """
        wanted = ['interested', 'going', 'invited'] if guest_filter is None else guest_filter
        self.load(url, scroll=False)

        # open guests list
        self.driver.find_element_by_css_selector(css_selectors.get('event_guests')).click()
        self.delay(1.5)
        tabs = self.driver.find_elements_by_css_selector(css_selectors.get('guest_buttons'))
        dialog = tabs[0].find_element_by_xpath('../../..')

        total = 0
        for tab in tabs:
            # skip the guest categories we were not asked for
            label = tab.text.strip().split(' ')[0].lower()
            if label not in wanted:
                continue

            # switch the dialog to this category
            tab.click()
            self.delay()
            scroller = dialog.find_element_by_css_selector(css_selectors.get('guest_scroller'))

            in_tab = 0
            while True:
                guests = dialog.find_elements_by_css_selector(css_selectors.get('guest_list'))
                unseen = guests[in_tab:]
                if not unseen:
                    break

                for guest in unseen:
                    if self.stop_request:
                        return total
                    info = guest.find_element_by_xpath(xpath_selectors.get('event_friend_info'))
                    guest_name = info.text
                    guest_url = info.get_attribute('href')
                    picture = guest.find_element_by_css_selector(css_selectors.get('friend_image'))
                    image_url = picture.get_attribute('src')
                    in_tab += 1
                    total += 1
                    callback(label, guest_name, guest_url, image_url, total)

                # scroll the list itself to its end, then wait before re-reading it
                self.js('a = arguments[0]; a.scrollTo(0, a.scrollHeight);', scroller)
                self.delay()

        return total
Пример #8
0
 def crawl_albums(self, targeturl, callback):
     """Report every album listed on the target's albums page.

     Callback format: album_name, album_url, count

     Returns the number of albums handed to the callback. A stop request
     ends the crawl early and returns the number handled so far.
     """
     self.load(join_url(targeturl, page_references.get('albums')))
     album_links = self.driver.find_elements_by_css_selector(css_selectors.get('indiv_albums'))

     # read name and url of every album up front, before any callback runs
     album_infos = []
     for link in album_links:
         album_infos.append((link.text, link.get_attribute('href')))

     for count, (album_name, album_url) in enumerate(album_infos, start=1):
         if self.stop_request:
             return count - 1
         callback(album_name, album_url, count)
     return len(album_infos)
Пример #9
0
    def crawl_search_results(self, url, callback, limit=0):
        """Crawl the search results page at <url>.

        Callback format: result_name, result_url, result_image_url, count

        <limit> is the maximum number of results to report; zero means
        unlimited. Returns the number of results handed to the callback.
        """
        self.load(url)

        found = 0
        while True:
            results = self.driver.find_elements_by_css_selector(css_selectors.get('search_results'))
            # only the results that were not handled in a previous pass
            remaining = results[found:]
            if not remaining:
                break

            for result in remaining:
                # stop on request, or once a positive limit has been reached
                if self.stop_request or (limit > 0 and found >= limit):
                    return found
                picture = result.find_element_by_css_selector(css_selectors.get('search_pics'))
                imageurl = picture.get_attribute('src')
                # the second element matched by the link selector is the one used
                link = result.find_elements_by_css_selector(css_selectors.get('search_link'))[1]
                name = link.text
                found += 1
                callback(name, link.get_attribute('href'), imageurl, found)

            self.scroll_to_bottom(wait=True)

        return found
Пример #10
0
    def crawl_checkins(self, targeturl, callback):
        """Crawl the check-ins page of <targeturl>.

        Callback format: check_in_name, check_in_url, count

        Returns the number of check-ins handed to the callback. A stop request
        ends the crawl early and returns the count reached so far.
        """
        self.load(join_url(targeturl, page_references.get('checkins')))
        count = 0
        while True:
            # get check-ins, break if no more check-ins
            checkins = self.driver.find_elements_by_css_selector(css_selectors.get('checkins'))
            if len(checkins) <= count:
                break

            # extract check-in info; skip the entries already reported in a
            # previous pass so that each one reaches the callback exactly once
            # and count stays equal to the number of distinct check-ins
            for p in checkins[count:]:
                if self.stop_request:
                    return count

                count += 1
                callback(p.text, p.get_attribute('href'), count)

            self.scroll_to_bottom(wait=True)

        return count
Пример #11
0
    def crawl_groups(self, targeturl, callback):
        """Crawl the groups page of <targeturl>.

        Callback format: group_name, group_url, count

        Returns the number of groups handed to the callback. A stop request
        ends the crawl early and returns the count reached so far.
        """
        self.load(join_url(targeturl, page_references.get('groups_page')))
        count = 0
        while True:
            # get groups, break if no more groups
            groups = self.driver.find_elements_by_css_selector(css_selectors.get('groups'))
            if len(groups) <= count:
                break

            # extract group info; skip the entries already reported in a
            # previous pass so that each one reaches the callback exactly once
            # and count stays equal to the number of distinct groups
            for g in groups[count:]:
                if self.stop_request:
                    return count

                count += 1
                callback(g.text, g.get_attribute('href'), count)

            self.scroll_to_bottom(wait=True)

        return count
Пример #12
0
    def crawl_photos(self, targeturl, callback):
        """Crawl the photos page of <targeturl>.

        Callback format: photo_source_url, photo_description, photo_post_permalink, count

        The description is not the text that was posted with the photo. It is a
        description generated automatically by Facebook's algorithms, which can
        detect objects in the image; for example "trees, person, smiling".

        Returns the number of photos handed to the callback. A stop request
        ends the crawl early and returns the count reached so far.
        """
        self.load(join_url(targeturl, page_references.get('photos_page')))

        done = 0
        while True:
            photos = self.driver.find_elements_by_css_selector(css_selectors.get('photo_selector'))
            # only the photos that were not handled in a previous pass
            pending = photos[done:]
            if not pending:
                # no more photos
                break

            for photo in pending:
                if self.stop_request:
                    return done

                # url of the image
                source_url = photo.get_attribute('data-starred-src')

                # the anchor carries both the description and the permalink
                anchor = photo.find_element_by_css_selector('a')
                description = anchor.get_attribute('aria-label')
                permalink = anchor.get_attribute('href')

                done += 1
                callback(source_url, description, permalink, done)

            self.scroll_to_bottom(wait=True)

        return done