def scrape_items(self):
    """
    Scrape tweet information.

    Walks every ``Timeline-item`` element found through ``self.browser``
    and builds one dict per tweet. The resulting list is stored on
    ``self.data``; nothing is returned.

    Each dict carries the keys ``screen_name`` (copied from
    ``self.user``), ``poster_display_name``, ``poster_screen_name``,
    ``text``, ``text_html``, ``id``, ``inline_media``, ``retweets``,
    ``favorites`` and ``posted_date`` (a naive ``datetime``).

    Name, text, id and timestamp lookups are not guarded, so any error
    raised while reading them propagates to the caller. Inline media and
    the two counters are best effort and fall back to ``False`` / ``0``.
    """

    def action_count(element, css_selector):
        # Best effort: a tweet without the counter (or with a value that
        # cannot be converted) is reported as 0 instead of aborting the
        # whole scrape. ``except Exception`` rather than a bare except so
        # KeyboardInterrupt / SystemExit still get through.
        try:
            raw = element.find_element_by_css_selector(
                css_selector
            ).find_element_by_class_name("TweetAction-count").text
            # NOTE(review): helper.prep_number presumably normalises the
            # displayed count text into a number - confirm in helper.
            return helper.prep_number(raw)
        except Exception:
            return 0

    items = []
    for element in self.browser.find_elements_by_class_name("Timeline-item"):
        item = {}
        item["screen_name"] = self.user
        item["poster_display_name"] = element.find_element_by_class_name(
            "UserNames-displayName").text
        item["poster_screen_name"] = element.find_element_by_class_name(
            "UserNames-screenName").text.strip("@")

        tweet_text = element.find_element_by_class_name("TweetText")
        item["text"] = tweet_text.text
        item["text_html"] = tweet_text.get_attribute("innerHTML")
        item["id"] = element.find_element_by_class_name(
            "Tweet").get_attribute("data-tweet-id")

        # Only the presence of the media container matters, so its text
        # is not read.
        try:
            element.find_element_by_class_name("InlineMedia-content")
            item["inline_media"] = True
        except Exception:
            item["inline_media"] = False

        item["retweets"] = action_count(
            element, "button[jsaction='click:retweet']")
        item["favorites"] = action_count(
            element, "button[jsaction='click:heart']")

        stamp = element.find_element_by_class_name(
            "Tweet-timestamp").find_element_by_tag_name(
            "time").get_attribute("datetime")
        # Everything from the first "+" on (the UTC offset) is dropped
        # before parsing, which yields a naive datetime.
        item["posted_date"] = datetime.datetime.strptime(
            stamp.split("+")[0], "%Y-%m-%dT%H:%M:%S")

        items.append(item)
    self.data = items
def scrape_data(self):
    """
    Scrape user data.

    Reads the profile header shown in ``self.browser`` and stores a single
    dict on ``self.data``; nothing is returned.

    Keys written: ``display_name``, ``screen_name``, ``avatar``, ``bio``,
    ``website``, ``details_html``, ``followers_nr``, ``following_nr`` and
    ``last_active``.

    Display name, screen name, avatar and the two counters are mandatory:
    any error raised while reading or converting them propagates to the
    caller. ``bio``, ``website``, ``details_html`` and ``last_active`` are
    best effort and become ``None`` when they cannot be read.
    """
    # Imported locally so the timestamp parsing below does not depend on
    # how the enclosing module imports ``datetime``.
    import datetime as _dt

    browser = self.browser

    def stat_count(css_class):
        raw = browser.find_element_by_class_name(
            css_class
        ).find_element_by_class_name("UserProfileHeader-statCount").text
        # NOTE(review): helper.prep_number presumably normalises the
        # displayed count text into something int() accepts - confirm.
        return int(helper.prep_number(raw))

    item = {}
    # Only the first line of the display-name element's text is kept.
    item["display_name"] = browser.find_element_by_class_name(
        "UserProfileHeader-displayName").text.split("\n")[0]
    item["screen_name"] = browser.find_element_by_class_name(
        "UserProfileHeader-screenName").text.strip("@")
    item["avatar"] = browser.find_element_by_class_name(
        "UserAvatar").get_attribute("src")

    # Optional fields: ``except Exception`` (not a bare except) so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    try:
        item["bio"] = browser.find_element_by_class_name(
            "UserProfileHeader-bio").text.replace("\n", " ")
    except Exception:
        item["bio"] = None

    try:
        item["website"] = browser.find_element_by_class_name(
            "UserProfileHeader-url").get_attribute("href")
    except Exception:
        item["website"] = None

    try:
        item["details_html"] = browser.find_element_by_class_name(
            "UserProfileHeader-details").get_attribute("innerHTML")
    except Exception:
        item["details_html"] = None

    item["followers_nr"] = stat_count("UserProfileHeader-stat--followers")
    item["following_nr"] = stat_count("UserProfileHeader-stat--following")

    # The first timestamp on the page is used as the last-activity time.
    # The previous code called ``datetime.strptime`` with an
    # ``%H.%M.%S`` format; the resulting error was swallowed, so this
    # field was always None. The tweet scraper parses the same attribute
    # with ``%H:%M:%S``, which is used here too.
    try:
        stamp = browser.find_elements_by_class_name(
            "Tweet-timestamp")[0].find_element_by_tag_name(
            "time").get_attribute("datetime")
        # Everything from the first "+" on (the UTC offset) is dropped,
        # which yields a naive datetime.
        item["last_active"] = _dt.datetime.strptime(
            stamp.split("+")[0], "%Y-%m-%dT%H:%M:%S")
    except Exception:
        # Also covers a page with no timestamps (IndexError on [0]).
        item["last_active"] = None

    self.data = item
def scrape_items(self):
    """
    Scrape tweet information.

    Iterates over the ``Timeline-item`` elements returned by
    ``self.browser`` and collects one dict per tweet into a list that is
    assigned to ``self.data``. The method returns nothing.

    Keys per tweet: ``screen_name`` (taken from ``self.user``),
    ``poster_display_name``, ``poster_screen_name``, ``text``,
    ``text_html``, ``id``, ``inline_media``, ``retweets``, ``favorites``
    and ``posted_date`` (a naive ``datetime``).

    Errors while reading names, text, id or timestamp are not caught and
    reach the caller. Inline media and the retweet / favorite counters
    are best effort, defaulting to ``False`` and ``0``.
    """

    def read_counter(tweet, button_selector):
        # A missing or unconvertible counter yields 0 rather than
        # failing the whole scrape. Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        try:
            button = tweet.find_element_by_css_selector(button_selector)
            shown = button.find_element_by_class_name(
                "TweetAction-count").text
            # NOTE(review): helper.prep_number presumably turns the
            # displayed count text into a number - confirm in helper.
            return helper.prep_number(shown)
        except Exception:
            return 0

    collected = []
    timeline = self.browser.find_elements_by_class_name("Timeline-item")
    for element in timeline:
        body = element.find_element_by_class_name("TweetText")
        time_tag = element.find_element_by_class_name(
            "Tweet-timestamp").find_element_by_tag_name("time")

        # Presence of the media container is all that is recorded.
        try:
            element.find_element_by_class_name("InlineMedia-content")
            has_media = True
        except Exception:
            has_media = False

        # The UTC offset (everything from the first "+") is cut off
        # before parsing, so the result is a naive datetime.
        posted = datetime.datetime.strptime(
            time_tag.get_attribute("datetime").split("+")[0],
            "%Y-%m-%dT%H:%M:%S")

        collected.append({
            "screen_name": self.user,
            "poster_display_name": element.find_element_by_class_name(
                "UserNames-displayName").text,
            "poster_screen_name": element.find_element_by_class_name(
                "UserNames-screenName").text.strip("@"),
            "text": body.text,
            "text_html": body.get_attribute("innerHTML"),
            "id": element.find_element_by_class_name(
                "Tweet").get_attribute("data-tweet-id"),
            "inline_media": has_media,
            "retweets": read_counter(
                element, "button[jsaction='click:retweet']"),
            "favorites": read_counter(
                element, "button[jsaction='click:heart']"),
            "posted_date": posted,
        })
    self.data = collected
def scrape_data(self):
    """
    Scrape user data.

    Collects the profile header fields exposed by ``self.browser`` into
    one dict and assigns it to ``self.data``. The method returns nothing.

    Keys written: ``display_name``, ``screen_name``, ``avatar``, ``bio``,
    ``website``, ``details_html``, ``followers_nr``, ``following_nr`` and
    ``last_active``.

    Errors while reading display name, screen name, avatar or either
    counter are not caught and reach the caller. ``bio``, ``website``,
    ``details_html`` and ``last_active`` fall back to ``None`` when they
    cannot be read.
    """
    # Function-scope import: keeps the timestamp parsing independent of
    # the way the surrounding module imports ``datetime``.
    import datetime as _dt

    page = self.browser
    item = {}

    # Only the first line of the display-name element's text is kept.
    name_node = page.find_element_by_class_name(
        "UserProfileHeader-displayName")
    item["display_name"] = name_node.text.split("\n")[0]

    handle_node = page.find_element_by_class_name(
        "UserProfileHeader-screenName")
    item["screen_name"] = handle_node.text.strip("@")

    item["avatar"] = page.find_element_by_class_name(
        "UserAvatar").get_attribute("src")

    # Optional fields below catch Exception rather than using a bare
    # except, so KeyboardInterrupt / SystemExit are not swallowed.
    try:
        bio_node = page.find_element_by_class_name("UserProfileHeader-bio")
        item["bio"] = bio_node.text.replace("\n", " ")
    except Exception:
        item["bio"] = None

    try:
        url_node = page.find_element_by_class_name("UserProfileHeader-url")
        item["website"] = url_node.get_attribute("href")
    except Exception:
        item["website"] = None

    try:
        details_node = page.find_element_by_class_name(
            "UserProfileHeader-details")
        item["details_html"] = details_node.get_attribute("innerHTML")
    except Exception:
        item["details_html"] = None

    # NOTE(review): helper.prep_number presumably normalises the
    # displayed count text into something int() accepts - confirm.
    for key, css_class in (
        ("followers_nr", "UserProfileHeader-stat--followers"),
        ("following_nr", "UserProfileHeader-stat--following"),
    ):
        shown = page.find_element_by_class_name(
            css_class
        ).find_element_by_class_name("UserProfileHeader-statCount").text
        item[key] = int(helper.prep_number(shown))

    # The first timestamp on the page is taken as the last-activity
    # time. Earlier this called ``datetime.strptime`` with ``%H.%M.%S``;
    # the error was silently swallowed and the field was always None.
    # The tweet scraper parses the same attribute with ``%H:%M:%S``.
    try:
        first_stamp = page.find_elements_by_class_name("Tweet-timestamp")[0]
        raw = first_stamp.find_element_by_tag_name(
            "time").get_attribute("datetime")
        # The UTC offset (everything from the first "+") is cut off, so
        # the parsed value is a naive datetime.
        item["last_active"] = _dt.datetime.strptime(
            raw.split("+")[0], "%Y-%m-%dT%H:%M:%S")
    except Exception:
        # Includes the IndexError raised when the page has no timestamps.
        item["last_active"] = None

    self.data = item