예제 #1
0
    def scrape_items(self):
        """
            Scrape tweet information

            Visits every "Timeline-item" element found through
            ``self.browser`` and stores the resulting list of dicts in
            ``self.data`` (nothing is returned).

            Keys written per tweet, in order: screen_name (``self.user``),
            poster_display_name, poster_screen_name ("@" stripped from both
            ends), text, text_html, id, inline_media (bool), retweets,
            favorites, posted_date (naive ``datetime``).

            The name, text, id and timestamp lookups are not guarded, so any
            exception they raise propagates to the caller. The media flag and
            the two counters are best-effort and fall back to False / 0.
        """
        # (result key, suffix of the button's jsaction attribute)
        counters = (("retweets", "retweet"), ("favorites", "heart"))

        items = []
        for element in self.browser.find_elements_by_class_name(
                "Timeline-item"):
            item = {}
            item["screen_name"] = self.user
            item["poster_display_name"] = element.find_element_by_class_name(
                "UserNames-displayName").text
            item["poster_screen_name"] = element.find_element_by_class_name(
                "UserNames-screenName").text.strip("@")

            # Look the text node up once; the plain text and the HTML both
            # come from the same element.
            tweet_text = element.find_element_by_class_name("TweetText")
            item["text"] = tweet_text.text
            item["text_html"] = tweet_text.get_attribute('innerHTML')

            item["id"] = element.find_element_by_class_name(
                "Tweet").get_attribute('data-tweet-id')

            # Only the presence of the media container matters; its text was
            # never used. ``Exception`` (not a bare except) keeps Ctrl-C and
            # SystemExit from being swallowed.
            try:
                element.find_element_by_class_name("InlineMedia-content")
            except Exception:
                item["inline_media"] = False
            else:
                item["inline_media"] = True

            # Both counters share one shape: find the action button, read its
            # count label, convert it. Any failure along the way means 0.
            for key, action in counters:
                try:
                    count_text = element.find_element_by_css_selector(
                        "button[jsaction='click:%s']" % action
                    ).find_element_by_class_name("TweetAction-count").text
                    item[key] = helper.prep_number(count_text)
                except Exception:
                    item[key] = 0

            timestamp = element.find_element_by_class_name(
                "Tweet-timestamp").find_element_by_tag_name(
                    "time").get_attribute("datetime")
            # Keep only the part before "+" (presumably the UTC offset --
            # confirm) so the value matches the format below.
            item["posted_date"] = datetime.datetime.strptime(
                timestamp.split("+")[0], '%Y-%m-%dT%H:%M:%S')

            items.append(item)
        self.data = items
예제 #2
0
    def scrape_data(self):
        """
            Scrape user data

            Reads the profile header through ``self.browser`` and stores a
            single dict in ``self.data`` (nothing is returned).

            Keys written, in order: display_name (first line of the header
            text), screen_name ("@" stripped from both ends), avatar (``src``
            attribute), bio, website, details_html, followers_nr,
            following_nr, last_active.

            display_name, screen_name, avatar and the two counters are
            mandatory: any exception from their lookups or from the ``int``
            conversion propagates. bio, website, details_html and last_active
            are best-effort and become None on failure.
        """
        # Imported locally under a private name so the parse below works
        # whether the module binds ``datetime`` to the module or the class.
        from datetime import datetime as _datetime

        browser = self.browser
        item = {}
        item["display_name"] = browser.find_element_by_class_name(
            "UserProfileHeader-displayName").text.split("\n")[0]
        item["screen_name"] = browser.find_element_by_class_name(
            "UserProfileHeader-screenName").text.strip("@")
        item["avatar"] = browser.find_element_by_class_name(
            "UserAvatar").get_attribute("src")

        # Optional profile fields. ``Exception`` (not a bare except) keeps
        # Ctrl-C and SystemExit from being swallowed.
        try:
            item["bio"] = browser.find_element_by_class_name(
                "UserProfileHeader-bio").text.replace("\n", " ")
        except Exception:
            item["bio"] = None
        try:
            item["website"] = browser.find_element_by_class_name(
                "UserProfileHeader-url").get_attribute("href")
        except Exception:
            item["website"] = None
        try:
            item["details_html"] = browser.find_element_by_class_name(
                "UserProfileHeader-details").get_attribute('innerHTML')
        except Exception:
            item["details_html"] = None

        followers_text = browser.find_element_by_class_name(
            "UserProfileHeader-stat--followers").find_element_by_class_name(
                "UserProfileHeader-statCount").text
        item["followers_nr"] = int(helper.prep_number(followers_text))

        following_text = browser.find_element_by_class_name(
            "UserProfileHeader-stat--following").find_element_by_class_name(
                "UserProfileHeader-statCount").text
        item["following_nr"] = int(helper.prep_number(following_text))

        try:
            # First timestamp on the page; an empty result list raises
            # IndexError and falls through to None.
            first_stamp = browser.find_elements_by_class_name(
                "Tweet-timestamp")[0].find_element_by_tag_name(
                    "time").get_attribute("datetime")
            # The time part is parsed with ":" separators. The previous
            # '%H.%M.%S' pattern could not match a colon-separated value, so
            # last_active always ended up None.
            # NOTE(review): assumes an ISO-8601-like value such as
            # "2016-01-31T12:34:56+0000" -- confirm against the page.
            item["last_active"] = _datetime.strptime(
                first_stamp.split("+")[0], '%Y-%m-%dT%H:%M:%S')
        except Exception:
            item["last_active"] = None

        self.data = item
예제 #3
0
 def scrape_items(self):
     """
         Scrape tweet information

         Visits every "Timeline-item" element found through ``self.browser``
         and stores the resulting list of dicts in ``self.data`` (nothing is
         returned).

         Keys written per tweet, in order: screen_name (``self.user``),
         poster_display_name, poster_screen_name ("@" stripped from both
         ends), text, text_html, id, inline_media (bool), retweets,
         favorites, posted_date (naive ``datetime``).

         The name, text, id and timestamp lookups are not guarded, so any
         exception they raise propagates to the caller. The media flag and
         the two counters are best-effort and fall back to False / 0.
     """
     # (result key, suffix of the button's jsaction attribute)
     counters = (("retweets", "retweet"), ("favorites", "heart"))

     items = []
     for element in self.browser.find_elements_by_class_name("Timeline-item"):
         item = {}
         item["screen_name"] = self.user
         item["poster_display_name"] = element.find_element_by_class_name("UserNames-displayName").text
         item["poster_screen_name"] = element.find_element_by_class_name("UserNames-screenName").text.strip("@")

         # Look the text node up once; the plain text and the HTML both come
         # from the same element.
         tweet_text = element.find_element_by_class_name("TweetText")
         item["text"] = tweet_text.text
         item["text_html"] = tweet_text.get_attribute('innerHTML')

         item["id"] = element.find_element_by_class_name("Tweet").get_attribute('data-tweet-id')

         # Only the presence of the media container matters; its text was
         # never used. ``Exception`` (not a bare except) keeps Ctrl-C and
         # SystemExit from being swallowed.
         try:
             element.find_element_by_class_name("InlineMedia-content")
         except Exception:
             item["inline_media"] = False
         else:
             item["inline_media"] = True

         # Both counters share one shape: find the action button, read its
         # count label, convert it. Any failure along the way means 0.
         for key, action in counters:
             try:
                 button = element.find_element_by_css_selector("button[jsaction='click:%s']" % action)
                 count_text = button.find_element_by_class_name("TweetAction-count").text
                 item[key] = helper.prep_number(count_text)
             except Exception:
                 item[key] = 0

         timestamp = element.find_element_by_class_name("Tweet-timestamp").find_element_by_tag_name("time").get_attribute("datetime")
         # Keep only the part before "+" (presumably the UTC offset --
         # confirm) so the value matches the format below.
         item["posted_date"] = datetime.datetime.strptime(timestamp.split("+")[0], '%Y-%m-%dT%H:%M:%S')

         items.append(item)
     self.data = items
예제 #4
0
 def scrape_data(self):
     """
         Scrape user data

         Reads the profile header through ``self.browser`` and stores a
         single dict in ``self.data`` (nothing is returned).

         Keys written, in order: display_name (first line of the header
         text), screen_name ("@" stripped from both ends), avatar (``src``
         attribute), bio, website, details_html, followers_nr, following_nr,
         last_active.

         display_name, screen_name, avatar and the two counters are
         mandatory: any exception from their lookups or from the ``int``
         conversion propagates. bio, website, details_html and last_active
         are best-effort and become None on failure.
     """
     # Imported locally under a private name so the parse below works whether
     # the module binds ``datetime`` to the module or the class.
     from datetime import datetime as _datetime

     browser = self.browser
     item = {}
     item["display_name"] = browser.find_element_by_class_name("UserProfileHeader-displayName").text.split("\n")[0]
     item["screen_name"] = browser.find_element_by_class_name("UserProfileHeader-screenName").text.strip("@")
     item["avatar"] = browser.find_element_by_class_name("UserAvatar").get_attribute("src")

     # Optional profile fields. ``Exception`` (not a bare except) keeps
     # Ctrl-C and SystemExit from being swallowed.
     try:
         item["bio"] = browser.find_element_by_class_name("UserProfileHeader-bio").text.replace("\n", " ")
     except Exception:
         item["bio"] = None
     try:
         item["website"] = browser.find_element_by_class_name("UserProfileHeader-url").get_attribute("href")
     except Exception:
         item["website"] = None
     try:
         item["details_html"] = browser.find_element_by_class_name("UserProfileHeader-details").get_attribute('innerHTML')
     except Exception:
         item["details_html"] = None

     followers_text = browser.find_element_by_class_name("UserProfileHeader-stat--followers").find_element_by_class_name("UserProfileHeader-statCount").text
     item["followers_nr"] = int(helper.prep_number(followers_text))

     following_text = browser.find_element_by_class_name("UserProfileHeader-stat--following").find_element_by_class_name("UserProfileHeader-statCount").text
     item["following_nr"] = int(helper.prep_number(following_text))

     try:
         # First timestamp on the page; an empty result list raises
         # IndexError and falls through to None.
         first_stamp = browser.find_elements_by_class_name("Tweet-timestamp")[0].find_element_by_tag_name("time").get_attribute("datetime")
         # The time part is parsed with ":" separators. The previous
         # '%H.%M.%S' pattern could not match a colon-separated value, so
         # last_active always ended up None.
         # NOTE(review): assumes an ISO-8601-like value such as
         # "2016-01-31T12:34:56+0000" -- confirm against the page.
         item["last_active"] = _datetime.strptime(first_stamp.split("+")[0], '%Y-%m-%dT%H:%M:%S')
     except Exception:
         item["last_active"] = None

     self.data = item