def message_clean(self):
    """Return ``self.message`` with its URLs removed, computing it only once.

    The cleaned text is stored on the instance as ``_message_clean`` the
    first time it is requested; later calls return that stored value.
    The URLs reported by ``remove_urls`` are discarded.
    """
    try:
        cleaned = self._message_clean
    except AttributeError:
        # First access: nothing cached yet, so strip the URLs now and
        # remember the result on the instance.
        cleaned, _found_urls = remove_urls(self.message)
        self._message_clean = cleaned
    return cleaned
def _extract_urls_and_media(self):
    """Build ``self._text_clean``: ``self.text`` with URL/media links removed.

    Reads ``self.text`` and ``self._entities`` (a mapping that may hold
    ``'urls'`` and ``'media'`` lists; each item may carry an ``'indices'``
    pair giving the start and end offsets of the link inside ``self.text``).
    Writes the result to ``self._text_clean`` and returns ``None``.

    Steps:
      1. Cut every span listed in the entities out of the text.
      2. If no span was found, or the text ends with an ellipsis (U+2026 or
         three dots), additionally run ``remove_urls`` on the text.
      3. Drop a truncated URL scheme fragment left at the very end of the
         text and trim surrounding whitespace.
    """
    self._text_clean = self.text

    # Gather the (start, end) span of every URL and media entity.
    # A set is used so a span listed twice (e.g. once under 'urls' and once
    # under 'media') is cut only once; entities without a usable 'indices'
    # pair are skipped instead of raising IndexError.
    spans = set()
    for key in ('urls', 'media'):
        for entity in self._entities.get(key) or ():
            bounds = entity.get('indices') or ()
            if len(bounds) >= 2:
                spans.add((bounds[0], bounds[1]))

    # Cut from the rightmost span to the leftmost one so that the offsets of
    # the spans still to be processed stay valid.
    for start, end in sorted(spans, reverse=True):
        self._text_clean = self._text_clean[:start] + self._text_clean[end:]

    # Oldest tweets have no `urls` list in the `entities` dictionary: the
    # links are just hard coded in `self.text`, so any tweet without spans is
    # treated as old-style and cleaned with `remove_urls`.
    # A tweet ending with the ellipsis character (or '...' for old-style
    # tweets) is a truncated retweet. If the URL at its end was truncated,
    # the indices may only cover the ellipsis itself, e.g. (139, 140), and
    # not the URL, so this case goes through `remove_urls` as well.
    if not spans or self.text[-1:] == u'\u2026' or self.text[-3:] == '...':
        # `remove_urls` is expected to return a (text, urls) pair; the URLs
        # are not used here.
        self._text_clean, _found_urls = remove_urls(self._text_clean)

    # Clean out any sort of 'http:/ ..' left at the end of a retweet, as in:
    # "RT @googlemaps: Thx to the previewers who helped us build the
    # #newGoogleMaps. Beginning today it rolls out to users around the world
    # http:/<ellipsis>"
    # The `\b` keeps the pattern from eating the tail of ordinary words that
    # end in "ht" (e.g. "night"). Backslashes are doubled so the literal has
    # no invalid escape sequences while `\u2026` stays a real character.
    # NOTE(review): the original formatting was lost, so it is unclear
    # whether this final cleanup used to run only inside the `if` above;
    # it is applied unconditionally here -- confirm.
    leftover = u'\\s*\\bht(t|tps?|tps?:|tps?:/|tps?://)?\\s*(\\.|\u2026)*$'
    self._text_clean = re.sub(leftover, '', self._text_clean.strip()).strip()