import re

import socialUrlUtils  # project-local URL helper


def cleanMessage(message):
    # Remove @mentions.
    message = re.sub("@[a-zA-Z0-9]+", "", message)
    # Remove links.
    links = socialUrlUtils.urlsInText(message)
    # links.reverse()
    for link in links:
        message = message.replace(link, "")
    # Return.
    return message.strip()
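# socialUrlUtils is a project-local module that isn't shown here. A minimal
# sketch of what its urlsInText() helper presumably does (regex-based URL
# extraction); this is an assumption for illustration, not the project's
# actual implementation:
import re

_URL_RE = re.compile(r"https?://\S+")

def urlsInText(text):
    # Return every http(s) URL found in the text, in order of appearance.
    return _URL_RE.findall(text)

# e.g. urlsInText("see http://t.co/abc and https://ex.org/x")
# -> ['http://t.co/abc', 'https://ex.org/x']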
def cleanMessage(message):
    # Remove Twitter-specific syntax.
    message = message.replace('@', '')
    message = message.replace('#', '')
    # Remove links.
    links = socialUrlUtils.urlsInText(message)
    # links.reverse()
    for link in links:
        message = message.replace(link, '')
    # Return.
    return message.strip()
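# Note the behavioral difference between the two cleanMessage() variants: the
# first removes entire @mentions, while this one strips only the '@' and '#'
# characters, keeping the mention and hashtag text. A hypothetical example
# (assuming urlsInText() finds the t.co link):
#
#   first variant: "@bob read #news http://t.co/x" -> "read #news"
#   this variant:  "@bob read #news http://t.co/x" -> "bob read news"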
def associatedUrls(network, post):
    # Collect every URL associated with a post on the given network.
    if network == 'facebook':
        links = []
        if 'link' in post:
            links += [post['link']]
        if 'message' in post:
            links += socialUrlUtils.urlsInText(post['message'])
        return links
    elif network == 'twitter':
        return [u['expanded_url'] for u in post['entities']['urls']]
    return []
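# Hypothetical usage sketch (payloads invented for illustration; field names
# follow the Facebook Graph API and Twitter API shapes used above, and the
# first result assumes urlsInText() finds the URL embedded in the message):
fb_post = {'link': 'http://example.org/a',
           'message': 'Read more at http://example.org/b'}
tw_post = {'entities': {'urls': [{'expanded_url': 'http://example.org/c'}]}}

associatedUrls('facebook', fb_post)  # -> ['http://example.org/a', 'http://example.org/b']
associatedUrls('twitter', tw_post)   # -> ['http://example.org/c']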
# Get all relevant Facebook information. The path constants (dirPath_*), org,
# the fbStartDate/fbEndDate bounds, and fbStatusTypeToMedium are defined
# earlier in the full script.
import json
import os

from dateutil.parser import parse as dp  # `dp` aliases dateutil's datetime parser

service = 'facebook'
posts = json.load(open(dirPath_consolidatedFb + [f for f in os.listdir(dirPath_consolidatedFb) if f.startswith(org)][-1]))
bitly = json.load(open((dirPath_bitly % service) + org + '.json'))
sentiment = json.load(open((dirPath_sentiment % service) + org + '.json'))
urls = json.load(open((dirPath_urls % service) + org + '.json'))

# Traverse posts.
for post in posts:
    # Get some initial information.
    expandedUrls = []
    if 'link' in post:
        expandedUrls.append(post['link'])
    if 'message' in post:
        expandedUrls += socialUrlUtils.urlsInText(post['message'])
    created_utc = dp(post['created_time'])
    # Ensure the date is in the proper range; skip everything outside it.
    if created_utc < fbStartDate or created_utc >= fbEndDate:
        continue
    # Container to store stats that will eventually become a data frame.
    thisPost = {'service': service,
                'id': post['id']}
    # Post features.
    thisPost['text'] = post.get('message')
    # Map Facebook's status_type to a medium label, defaulting to 'other'.
    thisPost['medium'] = fbStatusTypeToMedium.get(post.get('status_type')) or 'other'
    # Post datetime features.
    thisPost['date_utc'] = str(created_utc.date())
    thisPost['day_of_week_utc'] = created_utc.weekday()
    thisPost['weekend_utc'] = thisPost['day_of_week_utc'] >= 5  # Saturday (5) or Sunday (6)
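# Sketch of the date filter above: created_time is parsed by dp() and a post
# is kept only if it falls in the half-open window [fbStartDate, fbEndDate).
# The dates below are invented for illustration:
from dateutil.parser import parse as dp

fbStartDate = dp('2015-01-01T00:00:00+0000')
fbEndDate = dp('2016-01-01T00:00:00+0000')
created_utc = dp('2015-06-15T12:30:00+0000')  # Graph API-style timestamp

fbStartDate <= created_utc < fbEndDate  # -> True, so the post is kept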
# Get posts (for URLs). dirPath_consolidated, network, org, lastDate, and the
# recentPostsSkipped counter are defined earlier in the full script.
import json
import os

import dateutil.parser

postFilename = [f for f in os.listdir(dirPath_consolidated % network) if f.startswith(org)][-1]
posts = json.load(open((dirPath_consolidated % network) + postFilename))

# Traverse posts and find URLs.
urls = set()
if network == 'facebook':
    for p in posts:
        # Verify date.
        if dateutil.parser.parse(p['created_time']) < lastDate:
            # Find URLs to add.
            if 'link' in p:
                urls.add(p['link'])
            if 'message' in p:
                for url in socialUrlUtils.urlsInText(p['message']):
                    urls.add(url)
        else:
            recentPostsSkipped += 1
elif network == 'twitter':
    for p in posts:
        # Verify date.
        if dateutil.parser.parse(p['created_at']) < lastDate:
            # Find URLs to add.
            for url in p['entities']['urls']:
                urls.add(url['expanded_url'])
        else:
            recentPostsSkipped += 1

# Traverse all URLs. The enumerate() pattern below is for debugging: it lets
# us break after a certain number of URLs if necessary.
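# The enumerate() pattern referenced above, sketched out: cap how many URLs a
# test run processes. debugUrlLimit is a hypothetical name for illustration:
debugUrlLimit = 10
for i, url in enumerate(urls):
    if i >= debugUrlLimit:
        break  # stop early while debugging
    # ... process url ...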