from collections import defaultdict
from datetime import datetime
import json
import logging

# testPosts_loc and the pickle helper come from the project's config module (see the
# model-builder imports below); lines_to_read (-1 reads every line) is assumed to be
# exported there as well.
from v1.config_and_pickle import testPosts_loc, lines_to_read, pickle

logger = logging.getLogger(__name__)  # this excerpt does not show the original logger setup


def populate_data_structures():
    logger.info('started populating data structures...')
    # key = blog
    # value = list of (post_id, author, tags, categories, date_struct) tuples
    test_blog_post_tuples_map = defaultdict(list)
    with open(testPosts_loc, 'r') as f:
        for line_number, line_text in enumerate(f):
            if lines_to_read != -1 and line_number >= lines_to_read:
                break
            blog_json = json.loads(line_text)
            blog = blog_json['blog']
            post_id = blog_json['post_id']
            author = blog_json['author']
            tags = blog_json['tags']
            categories = blog_json['categories']
            date_struct = datetime.strptime(blog_json['date_gmt'], '%Y-%m-%d %H:%M:%S')
            test_blog_post_tuples_map[blog].append((post_id, author, tags, categories, date_struct))
    logger.info('finished populating data structures')
    pickle(test_blog_post_tuples_map, 'test_blog_post_tuples_map')
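# For reference, each line of testPosts_loc is a standalone JSON object; the fields
# read above (plus the 'likes' field used by the training builder below) suggest a
# record shaped roughly like the following. Values are illustrative, not real data:
#
#   {"blog": "1000", "post_id": "42", "author": "7",
#    "tags": ["recipes"], "categories": ["food"],
#    "date_gmt": "2012-04-01 12:00:00", "likes": [{"uid": "123456"}]}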
################################################################################################################################################

from collections import defaultdict
import json
import logging

from gensim.models.ldamodel import LdaModel

from v1.config_and_pickle import testPosts_loc, MyFilesIterator, trainPosts_loc, MyCorpus, build_word_id_map, normalize_content_stats, topic_count, pickle, trainPostsThin_loc

################################################################################################################################################

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('LDA_model_builder')

################################################################################################################################################

logger.info('building word_id_map...')
word_id_map = build_word_id_map([trainPosts_loc, testPosts_loc])
pickle(word_id_map, 'word_id_map')
normalize_content_stats()

train_and_test_corpus = MyCorpus([trainPosts_loc, testPosts_loc], word_id_map)

logger.info('training LDA model...')
# id2word is a mapping from word ids (integers) to words (strings). It is used to
# determine the vocabulary size, as well as for debugging and topic printing.
lda = LdaModel(train_and_test_corpus, id2word=word_id_map, num_topics=topic_count, update_every=1, chunksize=10000, passes=1)
pickle(lda, 'lda')

# Print the 'topn' most probable words for (randomly selected) 'topics' number of
# topics. Set topics=-1 to print all topics.
lda.show_topics(topics=topic_count, topn=10)

################################################################################################################################################

# key = blog + '_' + post_id
# value = a list of (topic_id, topic_probability) 2-tuples
blog_topic_distribution_map = {}

# key = uid (user id)
# value = list of (blog, post_id) tuples
train_user_likes_map = defaultdict(list)
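# blog_topic_distribution_map is only declared in this excerpt; the code that fills it
# is not shown. As a minimal sketch of what that step could look like: gensim's
# LdaModel infers the topic mixture of a single bag-of-words vector via lda[bow],
# which returns a list of (topic_id, topic_probability) tuples. Assuming word_id_map
# is a gensim Dictionary (it is passed as id2word above) and `tokens` holds one
# post's tokens:
#
#   bow = word_id_map.doc2bow(tokens)
#   blog_topic_distribution_map[blog + '_' + post_id] = lda[bow]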
################################################################################################################################################

from datetime import datetime

# lines_to_read and wk_5_start_date (the 'YYYY-MM-DD' string on which week 5 of the
# training window starts) are assumed to come from the config module as well.
from v1.config_and_pickle import lines_to_read, wk_5_start_date


def populate_data_structures(populate_for_first_four_weeks):
    # Populate the data structures required for building the training CSV.
    logger.info('started populating data structures...')

    # key = blog
    # value = number of posts
    blog_post_count_map = defaultdict(int)

    # key = blog_author
    # value = number of posts
    blog_author__post_count_map = defaultdict(int)

    # key = uid (user id)
    # value = list of (blog, post_id, author) tuples
    user_likes_map = defaultdict(list)

    # key = uid (user id)
    # value = set of blogs liked by this user
    user_liked_blogs_map = defaultdict(set)

    # key = blog
    # value = set of users (uid) who have liked at least one post from the blog
    blog_liked_users_map = defaultdict(set)

    # The four nested count maps below are plain dicts updated via increment_by_one
    # rather than defaultdict(lambda: defaultdict(int)), since lambdas cannot be
    # pickled and all of these maps are pickled at the end of this function.

    # key = tag
    # value = dict: key = blog, value = # of posts in this blog for this tag
    tag_blog_count_map = {}

    # key = category
    # value = dict: key = blog, value = # of posts in this blog for this category
    category_blog_count_map = {}

    # key = tag
    # value = dict: key = user (uid), value = # of posts for this tag that this user has liked
    tag_user_count_map = {}

    # key = category
    # value = dict: key = user (uid), value = # of posts for this category that this user has liked
    category_user_count_map = {}

    with open(trainPosts_loc, 'r') as f:
        for line_number, line_text in enumerate(f):
            if lines_to_read != -1 and line_number >= lines_to_read:
                break
            blog_json = json.loads(line_text)
            blog = blog_json['blog']
            post_id = blog_json['post_id']
            author = blog_json['author']
            tags = blog_json['tags']
            categories = blog_json['categories']
            date_struct = datetime.strptime(blog_json['date_gmt'], '%Y-%m-%d %H:%M:%S')
            date_string = date_struct.strftime('%Y-%m-%d')
            # When building the first-four-weeks variant, skip posts from week 5 onwards.
            if populate_for_first_four_weeks and date_string >= wk_5_start_date:
                continue
            blog_post_count_map[blog] += 1
            blog_author__post_count_map[blog + '_' + author] += 1
            for tag in tags:
                increment_by_one(tag_blog_count_map, tag, blog)
            for category in categories:
                increment_by_one(category_blog_count_map, category, blog)
            for like in blog_json['likes']:
                uid = like['uid']
                user_likes_map[uid].append((blog, post_id, author))
                user_liked_blogs_map[uid].add(blog)
                blog_liked_users_map[blog].add(uid)
                for tag in tags:
                    increment_by_one(tag_user_count_map, tag, uid)
                for category in categories:
                    increment_by_one(category_user_count_map, category, uid)
    logger.info('finished populating data structures')

    suffix = get_pickle_file_suffix(populate_for_first_four_weeks)
    pickle(blog_post_count_map, 'blog_post_count_map' + suffix)
    pickle(blog_author__post_count_map, 'blog_author__post_count_map' + suffix)
    pickle(user_likes_map, 'user_likes_map' + suffix)
    pickle(user_liked_blogs_map, 'user_liked_blogs_map' + suffix)
    pickle(blog_liked_users_map, 'blog_liked_users_map' + suffix)
    pickle(tag_blog_count_map, 'tag_blog_count_map' + suffix)
    pickle(category_blog_count_map, 'category_blog_count_map' + suffix)
    pickle(tag_user_count_map, 'tag_user_count_map' + suffix)
    pickle(category_user_count_map, 'category_user_count_map' + suffix)
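# increment_by_one and get_pickle_file_suffix are called above but defined elsewhere
# in the project. The following are minimal sketches of the behaviour the call sites
# imply, not the original implementations:

def increment_by_one(outer_map, outer_key, inner_key):
    # Nested-counter update on plain dicts, i.e. outer_map[outer_key][inner_key] += 1
    # with missing keys treated as 0.
    inner_map = outer_map.setdefault(outer_key, {})
    inner_map[inner_key] = inner_map.get(inner_key, 0) + 1


def get_pickle_file_suffix(populate_for_first_four_weeks):
    # Keeps the first-four-weeks pickles from overwriting the full-window ones; the
    # actual suffix string used by the project is a guess.
    return '_first_four_weeks' if populate_for_first_four_weeks else ''


# The builder is presumably run once per window, e.g.:
#   populate_data_structures(populate_for_first_four_weeks=True)   # weeks 1-4 only
#   populate_data_structures(populate_for_first_four_weeks=False)  # full training window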