def find_posts(token, user, graph_api_edge, min_date=None, max_date=None, retrieved_time=None):
    """Retrieve Post resources from the Graph API.

    Serialized Post documents for *user* are loaded first; the /feed edge is
    then fetched and each returned post is created or refreshed in the store.

    Args:
        token: OAuth token forwarded to graph_api_get / find_user.
        user: User document whose serialized Posts are looked up first.
        graph_api_edge (str): The URL of the /feed edge within the Graph API.
        min_date (datetime): The minimum date after which the Posts are published.
        max_date (datetime): The maximum date before which the Posts are published.
            Supply both bounds or neither; supplying only one raises Exception.
        retrieved_time (datetime): Timestamp recorded on each serialized Post.
            Defaults to the current UTC time at call time.

    Returns:
        list. A list of Post resources.

    Raises:
        Exception: if more than one Post is serialized for a Graph API id, or
            if only one of min_date/max_date is supplied.
    """
    # BUG FIX: the original default `retrieved_time=datetime.utcnow()` was
    # evaluated once at import time, so every call shared a stale timestamp.
    if retrieved_time is None:
        retrieved_time = datetime.utcnow()

    # Attempt to retrieve the serialized Posts first.
    # BUG FIX: materialize as a list — the original kept the BaseQuerySet,
    # which does not support the .append() calls below.
    post_objects = Post.objects(user=user)
    posts = list(post_objects) if len(post_objects) > 0 else []

    # Retrieve the posts from the Graph API
    posts_response = graph_api_get(token, graph_api_edge)
    posts_data = posts_response.data
    if 'data' in posts_data:
        for post_data in posts_data['data']:
            # Format the name of the place within the post
            if 'place' in post_data:
                post_data['place_name'] = post_data['place']['name']
            else:
                post_data['place_name'] = ''

            # Create the user link within the post
            post_data['user_url'] = '#' if 'from' in post_data else ''

            # Strip non-ASCII characters.
            # NOTE(review): .encode('ascii', 'ignore') returns bytes on
            # Python 3 — this mirrors the original (Python 2-era) behavior.
            for text_field in ('story', 'message'):
                if text_field in post_data:
                    post_data[text_field] = post_data[text_field].encode('ascii', 'ignore')
                else:
                    post_data[text_field] = ''

            # Collapse the likes edge to a count
            if 'likes' in post_data:
                post_data['likes'] = len(post_data['likes']['data'])
            else:
                post_data['likes'] = 0

            post_data['link'] = post_data['link'] if 'link' in post_data else ''
            post_data['created_time_formatted'] = datetime.strptime(
                post_data['created_time'], '%Y-%m-%dT%H:%M:%S+0000')

            # Retrieve any already-serialized Post for this Graph API id
            post_objects = Post.objects(facebook_id=post_data['id'])
            user = find_user(token, post_data['from']['id'], retrieved_time=retrieved_time)
            place = None

            if len(post_objects) > 1:
                raise Exception("More than one Post serialized for %s" % post_data['id'])
            elif len(post_objects) < 1:
                post_object = Post(facebook_id=post_data['id'],
                                   message=post_data['message'],
                                   story=post_data['story'],
                                   created_time_formatted=post_data['created_time_formatted'],
                                   place_name=post_data['place_name'],
                                   place=place,
                                   likes=post_data['likes'],
                                   link=post_data['link'],
                                   retrieved_time=retrieved_time,
                                   user=user)
                post_object.save()
            else:
                post_object = post_objects.first()
                # BUG FIX: the original assigned every field to itself
                # (post_object.message = post_object.message, ...), so the
                # refreshed Graph API data was never persisted.
                post_object.message = post_data['message']
                post_object.story = post_data['story']
                post_object.created_time_formatted = post_data['created_time_formatted']
                post_object.place_name = post_data['place_name']
                post_object.likes = post_data['likes']
                post_object.link = post_data['link']
                post_object.user = user
                post_object.save()

            if min_date is None and max_date is None:
                posts.append(post_object)
            elif min_date and max_date:
                if min_date <= post_object.created_time_formatted <= max_date:
                    posts.append(post_object)
            else:
                # BUG FIX: the original passed the tuple as a second Exception
                # argument instead of %-formatting, and called .strftime on a
                # None bound, raising AttributeError instead of this message.
                raise Exception("Failed to parse the date range between %s and %s" % (
                    min_date.strftime('%Y-%m-%dT%H:%M:%S+0000') if min_date else None,
                    max_date.strftime('%Y-%m-%dT%H:%M:%S+0000') if max_date else None))

    # BaseQuerySets cannot be pickled — always return a plain list
    return list(posts)
def download():
    """Download an export of Graph API data for any given User.

    Admin-only Flask view: gathers serialized Users, their friends, Posts,
    comments and shares (optionally filtered by ?min_date/?max_date and
    ?user query args), writes them to an xlsx workbook under tmp/, and
    returns it as an attachment. Non-admins and unauthenticated callers are
    redirected to the index page.
    """
    downloaded_resources = []
    users = []
    posts = []
    friendships = []
    publishings = []
    comments = []
    # Retrieve all shares left for posts by friends
    sharedposts = []
    # TODO: retrieve all users who liked the post by this friend
    likes = []
    # TODO: retrieve all reactions to the post by this friend
    reactions = []
    user = None

    token = get_facebook_oauth_token()
    if token:
        user = facebook.get('/me')
        if not is_admin(user.data['id']):
            return redirect(url_for('index'))

        min_date_str = request.args.get('min_date', None)
        max_date_str = request.args.get('max_date', None)

        # Filter for posts: both bounds given, else default to all history
        if min_date_str is not None and max_date_str is not None:
            # Drop the trailing milliseconds+Z (e.g. ".123Z") before parsing
            min_date_str = re.sub(r'\.\d{3}Z$', '', min_date_str)
            max_date_str = re.sub(r'\.\d{3}Z$', '', max_date_str)
            min_date = datetime.strptime(min_date_str, '%Y-%m-%dT%H:%M:%S')
            max_date = datetime.strptime(max_date_str, '%Y-%m-%dT%H:%M:%S')
        else:
            min_date = datetime(1, 1, 1)
            max_date = datetime.utcnow()

        graph_api_user = request.args.get('user', None)
        if graph_api_user is not None:
            # NOTE(review): this queries by the logged-in user's id, not the
            # requested `graph_api_user` — looks unintended; confirm whether
            # User.objects(facebook_id=graph_api_user) was meant.
            graph_users = User.objects(facebook_id=user.data['id'])
        else:
            # Retrieve all serialized User resources
            graph_users = User.objects()

        post_users = {}
        for this_user in graph_users:
            if this_user.facebook_id not in downloaded_resources:
                users.append(this_user)
                downloaded_resources.append(this_user.facebook_id)

            # Retrieve the friends
            for friend in this_user.friends:
                if friend.facebook_id not in downloaded_resources:
                    # Add the friend as a user
                    users.append(friend)
                    downloaded_resources.append(friend.facebook_id)
                # Add the friendship
                friendships.append({'User A ID': this_user.facebook_id,
                                    'User B ID': friend.facebook_id})

            # Retrieve this user's serialized posts
            user_posts = Post.objects(user=this_user)
            for post in user_posts:
                if post.facebook_id not in downloaded_resources:
                    posts.append(post)
                    downloaded_resources.append(post.facebook_id)
                    if post.user is not None:
                        post_users[post.user.facebook_id] = post.user
                        # Add the publishings
                        publishings.append({'User ID': post.user.facebook_id,
                                            'Post ID': post.facebook_id})
                    # Graph API version 2.6 or later required
                    post_comments = find_comments(token, post, min_date, max_date)
                    comments.extend(post_comments)
                    post_sharedposts = find_sharedposts(post, min_date, max_date)
                    sharedposts.extend(post_sharedposts)
                    # Temporarily concatenate all comments and shares.
                    # BUG FIX: the original extended `comments` with the
                    # always-empty outer `sharedposts` list, discarding the
                    # find_sharedposts() result just computed.
                    comments.extend(post_sharedposts)
    else:
        return redirect(url_for('index'))

    workbook = xlsxwriter.Workbook('tmp/socialmemories.xlsx')
    users_worksheet = workbook.add_worksheet('Users')
    posts_worksheet = workbook.add_worksheet('Posts')
    friendships_worksheet = workbook.add_worksheet('Friendships')
    publishings_worksheet = workbook.add_worksheet('Publishings')
    comments_worksheet = workbook.add_worksheet('Comments')
    denorm_posts_worksheet = workbook.add_worksheet('Denormalized Posts')

    # Create the gender classifier from the NLTK names corpus.
    # Naive Bayes is performant (if not that accurate).
    human_names = ([(name, 'male') for name in names.words('male.txt')] +
                   [(name, 'female') for name in names.words('female.txt')])
    features = [({'name': name}, gender) for (name, gender) in human_names]
    # First 500 examples are held out; the remainder trains the classifier
    training_set = features[500:]
    test_set = features[:500]
    classifier = NaiveBayesClassifier.train(training_set)

    export_users(csv, users, users_worksheet, classifier)
    export_posts(csv, posts, posts_worksheet, denorm_posts_worksheet, post_users, classifier)
    export_comments(csv, comments, comments_worksheet)
    export_friendships(csv, friendships, friendships_worksheet)
    export_publishings(csv, publishings, publishings_worksheet)
    workbook.close()

    return send_file('tmp/socialmemories.xlsx', as_attachment=True)