def extract_users(self, calendar_df_iterator):
    """
        PUBLIC: extract_users
        ---------------------
        Builds a dataframe with one row per user representation by
        streaming over calendar dataframes.

        Args:
            calendar_df_iterator: a zero-argument callable (not an
                already-created iterator) - it is *called* here and its
                result is iterated. Each item must be a dataframe with
                a 'user' column.

        Returns:
            The dataframe built from the values of
            self.user_representations; also stored as self.users_df.

        Side effects:
            - adds an entry to self.user_representations for every
              user id not seen before (via init_user_representation)
            - calls self.update_user_representation once per row
            - deletes self.user_representations before returning, so
              the attribute must be re-created before calling this
              method a second time
    """
    print_header("EXTRACTING USERS")

    #==========[ ITERATE OVER ALL DFS ]==========
    for cdf in calendar_df_iterator():
        print_status("Extract users", "next df")

        #=====[ Step 1: sort by user ]=====
        print_inner_status("extract_users", "sorting by user id")
        # DataFrame.sort was removed in pandas 0.20; prefer sort_values
        # and only fall back to the legacy method on very old releases.
        if hasattr(cdf, 'sort_values'):
            cdf = cdf.sort_values('user')
        else:
            cdf = cdf.sort('user')

        #=====[ Step 2: init user representations ]=====
        print_inner_status("extract_users", "initializing user representations")
        for uid in cdf['user'].unique():
            if uid not in self.user_representations:
                self.user_representations[uid] = self.init_user_representation(uid)

        #=====[ Step 3: update the user representations ]=====
        print_inner_status("extract_users", "updating user representations")
        # The result of apply is discarded on purpose: the per-row call is
        # used only for its effect (presumably it updates
        # self.user_representations in place - confirm in
        # update_user_representation).
        cdf.apply(self.update_user_representation, axis=1)

    #=====[ Step 4: convert to df, delete irrelevant stuff ]=====
    print_inner_status("extract_users", "converting to dataframe")
    # Materialise the values first: on Python 3 dict.values() is a view,
    # which some pandas versions refuse as constructor input.
    self.users_df = pd.DataFrame(list(self.user_representations.values()))
    del self.user_representations
    return self.users_df
def get_corpus_dictionary(self):
    """
        PRIVATE: get_corpus_dictionary
        ------------------------------
        Builds a gensim dictionary and bag-of-words corpus from every
        activity dataframe yielded by
        self.storage_delegate.iter_activity_dfs().

        One text is produced per dataframe row by self.extract_text
        (described in the original notes as name || words).

        Returns:
            (corpus, dictionary) where dictionary is a
            gensim.corpora.Dictionary over all texts and corpus is a
            list holding dictionary.doc2bow(text) for each text, in
            the order the rows were visited.
    """
    #=====[ Step 1: collect one text per row of each activity df ]=====
    print_status("get_corpus", "assembling texts")
    all_texts = []
    for activity_df in self.storage_delegate.iter_activity_dfs():
        print_inner_status("assembling texts", "next df")
        row_texts = list(activity_df.apply(self.extract_text, axis=1))
        all_texts.extend(row_texts)

    #=====[ Step 2: build the dictionary ]=====
    print_status("get_corpus", "assembling dictionary")
    dictionary = gensim.corpora.Dictionary(all_texts)

    #=====[ Step 3: convert each text to bag-of-words ]=====
    print_status("get_corpus", "assembling corpus")
    corpus = list(map(dictionary.doc2bow, all_texts))

    return corpus, dictionary