def __init__(self, populate_for_first_four_weeks):
    """Populate this instance's lookup maps via ``unpickle``.

    Every map except ``blog_topic_distribution_map`` is requested under its
    own name followed by
    ``get_pickle_file_suffix(populate_for_first_four_weeks)``. These are the
    same names that ``populate_data_structures`` passes to ``pickle``.

    populate_for_first_four_weeks: forwarded unchanged to
        ``get_pickle_file_suffix`` when building each pickle name.
    """

    def load(map_name):
        # The suffix is resolved once per map, right before the map is read.
        return unpickle(
            map_name + get_pickle_file_suffix(populate_for_first_four_weeks))

    # Post counts per blog and per blog/author pair.
    self.blog_post_count_map = load('blog_post_count_map')
    self.blog_author__post_count_map = load('blog_author__post_count_map')

    # Like relationships between users and blogs.
    self.user_likes_map = load('user_likes_map')
    self.user_liked_blogs_map = load('user_liked_blogs_map')
    self.blog_liked_users_map = load('blog_liked_users_map')

    # Tag / category counts, by blog and by user.
    self.tag_blog_count_map = load('tag_blog_count_map')
    self.category_blog_count_map = load('category_blog_count_map')
    self.tag_user_count_map = load('tag_user_count_map')
    self.category_user_count_map = load('category_user_count_map')

    # The topic distribution map is requested under its bare name; no
    # suffix is appended for it.
    self.blog_topic_distribution_map = unpickle('blog_topic_distribution_map')
def __init__(self, populate_for_first_four_weeks):
    """Populate this instance's lookup maps via ``unpickle``.

    Every map except ``blog_topic_distribution_map`` is requested under its
    own name followed by
    ``get_pickle_file_suffix(populate_for_first_four_weeks)``. These are the
    same names that ``populate_data_structures`` passes to ``pickle``.

    populate_for_first_four_weeks: forwarded unchanged to
        ``get_pickle_file_suffix`` when building each pickle name.
    """

    def load(map_name):
        # The suffix is resolved once per map, right before the map is read.
        return unpickle(
            map_name + get_pickle_file_suffix(populate_for_first_four_weeks))

    # Post counts per blog and per blog/author pair.
    self.blog_post_count_map = load('blog_post_count_map')
    self.blog_author__post_count_map = load('blog_author__post_count_map')

    # Like relationships between users and blogs.
    self.user_likes_map = load('user_likes_map')
    self.user_liked_blogs_map = load('user_liked_blogs_map')
    self.blog_liked_users_map = load('blog_liked_users_map')

    # Tag / category counts, by blog and by user.
    self.tag_blog_count_map = load('tag_blog_count_map')
    self.category_blog_count_map = load('category_blog_count_map')
    self.tag_user_count_map = load('tag_user_count_map')
    self.category_user_count_map = load('category_user_count_map')

    # The topic distribution map is requested under its bare name; no
    # suffix is appended for it.
    self.blog_topic_distribution_map = unpickle('blog_topic_distribution_map')
def populate_data_structures(populate_for_first_four_weeks):
    """Build the lookup maps needed for the training CSV and pickle them.

    Reads ``trainPosts_loc`` line by line, parsing each line as one JSON
    document (stopping once ``lines_to_read`` lines have been consumed,
    unless ``lines_to_read`` is -1). Per-blog, per-author, per-user, per-tag
    and per-category statistics are accumulated, and every map is then
    handed to ``pickle`` under its own name followed by
    ``get_pickle_file_suffix(populate_for_first_four_weeks)``.

    populate_for_first_four_weeks: when truthy, posts whose ``date_gmt`` day
        compares ``>=`` to ``wk_5_start_date`` are skipped.
    """
    logger.info('started populating data structures...')

    # blog -> number of posts
    blog_post_count_map = defaultdict(int)
    # blog + '_' + author -> number of posts
    blog_author__post_count_map = defaultdict(int)
    # uid -> list of (blog, post_id, author) tuples, one per liked post
    user_likes_map = defaultdict(list)
    # uid -> set of blogs this user has liked
    user_liked_blogs_map = defaultdict(set)
    # blog -> set of uids that liked at least one post of the blog
    blog_liked_users_map = defaultdict(set)

    # The four maps below are plain dicts updated only through
    # increment_by_one(map, outer_key, inner_key); presumably that helper
    # keeps a nested {outer_key: {inner_key: count}} layout -- confirm
    # against its definition.
    # NOTE(review): plain dicts are used rather than
    # defaultdict(lambda: defaultdict(int)), presumably so the maps stay
    # picklable -- confirm.
    # tag -> {blog: number of posts in the blog carrying the tag}
    tag_blog_count_map = {}
    # category -> {blog: number of posts in the blog in the category}
    category_blog_count_map = {}
    # tag -> {uid: number of liked posts carrying the tag}
    tag_user_count_map = {}
    # category -> {uid: number of liked posts in the category}
    category_user_count_map = {}

    with open(trainPosts_loc, 'r') as posts_file:
        for line_idx, raw_line in enumerate(posts_file):
            # lines_to_read == -1 means "no limit".
            if lines_to_read != -1 and line_idx >= lines_to_read:
                break

            post = json.loads(raw_line)
            blog = post['blog']
            post_id = post['post_id']
            author = post['author']
            tags = post['tags']
            categories = post['categories']

            # Reduce the timestamp to its calendar day before comparing.
            # Assumes wk_5_start_date is a 'YYYY-MM-DD' string, so the
            # comparison below is lexicographic -- TODO confirm.
            post_day = datetime.strptime(
                post['date_gmt'], '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
            if populate_for_first_four_weeks and post_day >= wk_5_start_date:
                continue

            # Per-post counters.
            blog_post_count_map[blog] += 1
            blog_author__post_count_map[blog + '_' + author] += 1
            for tag in tags:
                increment_by_one(tag_blog_count_map, tag, blog)
            for category in categories:
                increment_by_one(category_blog_count_map, category, blog)

            # Per-like counters: each like is attributed to every tag and
            # category of the liked post.
            for like in post['likes']:
                uid = like['uid']
                user_likes_map[uid].append((blog, post_id, author))
                user_liked_blogs_map[uid].add(blog)
                blog_liked_users_map[blog].add(uid)
                for tag in tags:
                    increment_by_one(tag_user_count_map, tag, uid)
                for category in categories:
                    increment_by_one(category_user_count_map, category, uid)

    logger.info('finished populating data structures')

    # Persist every map under "<map name><suffix>", in a fixed order.
    maps_to_pickle = (
        (blog_post_count_map, 'blog_post_count_map'),
        (blog_author__post_count_map, 'blog_author__post_count_map'),
        (user_likes_map, 'user_likes_map'),
        (user_liked_blogs_map, 'user_liked_blogs_map'),
        (blog_liked_users_map, 'blog_liked_users_map'),
        (tag_blog_count_map, 'tag_blog_count_map'),
        (category_blog_count_map, 'category_blog_count_map'),
        (tag_user_count_map, 'tag_user_count_map'),
        (category_user_count_map, 'category_user_count_map'),
    )
    for mapping, map_name in maps_to_pickle:
        pickle(
            mapping,
            map_name + get_pickle_file_suffix(populate_for_first_four_weeks))