예제 #1
0
	def extract_users (self, calendar_df_iterator):
		"""
			Builds a dataframe of user representations from a stream of
			calendar dataframes and returns it.

			Args:
				calendar_df_iterator: a zero-argument callable. It is
					called once and its result is iterated to obtain the
					calendar dataframes. Each dataframe is indexed by its
					'user' column.

			Returns:
				self.users_df, a pd.DataFrame built from the values of
				self.user_representations.

			Side effects:
				- adds an entry to self.user_representations (via
				  self.init_user_representation) for every user id not
				  already present
				- passes every row of every dataframe, sorted by 'user',
				  to self.update_user_representation
				- stores the result in self.users_df
				- deletes self.user_representations, so that attribute
				  must be set again before this method is called a
				  second time
		"""
		print_header ("EXTRACTING USERS")
		#==========[ ITERATE OVER ALL DFS	]==========
		for cdf in calendar_df_iterator ():
			print_status ("Extract users", "next df")

			#=====[ Step 1: sort by user	]=====
			print_inner_status ("extract_users", "sorting by user id")
			# DataFrame.sort was removed in pandas 0.20; prefer sort_values
			# and fall back to sort only on versions that predate it.
			if hasattr (cdf, 'sort_values'):
				cdf = cdf.sort_values ('user')
			else:
				cdf = cdf.sort ('user')

			#=====[ Step 2: init user representations	]=====
			print_inner_status ("extract_users", "initializing user representations")
			for uid in cdf['user'].unique ():
				if uid not in self.user_representations:
					self.user_representations[uid] = self.init_user_representation(uid)

			#=====[ Step 3: update the user representations	]=====
			print_inner_status ("extract_users", "updating user representations")
			cdf.apply (self.update_user_representation, axis = 1)

		#=====[ Step 4: convert to df, delete irrelevant stuff	]=====
		print_inner_status ("extract_users", "converting to dataframe")
		# Materialize as a list: on Python 3 dict.values() is a view, which
		# some pandas versions reject as DataFrame constructor data.
		self.users_df = pd.DataFrame(list(self.user_representations.values()))
		del self.user_representations
		return self.users_df
예제 #2
0
파일: SpotOn.py 프로젝트: jayhack/SpotOn
	def get_corpus_dictionary (self):
		"""
			PRIVATE: get_corpus_dictionary
			------------------------------
			Builds a gensim dictionary and the matching corpus from every
			activity dataframe yielded by
			self.storage_delegate.iter_activity_dfs.

			One text is produced per dataframe row by applying
			self.extract_text row-wise (axis=1). The original note on
			this method describes each text as "name || words"; that is
			determined by extract_text, which is not shown here.

			Returns: (corpus, dictionary), where corpus[i] is
			dictionary.doc2bow of the i-th text.
		"""
		#=====[ Step 1: gather the texts from every activity dataframe	]=====
		print_status ("get_corpus", "assembling texts")
		all_texts = []
		for activity_df in self.storage_delegate.iter_activity_dfs ():
			print_inner_status ("assembling texts", "next df")
			row_texts = activity_df.apply (self.extract_text, axis=1)
			all_texts.extend (list (row_texts))

		#=====[ Step 2: build the dictionary over all texts	]=====
		print_status ("get_corpus", "assembling dictionary")
		vocab = gensim.corpora.Dictionary (all_texts)

		#=====[ Step 3: run every text through doc2bow	]=====
		print_status ("get_corpus", "assembling corpus")
		bows = []
		for text in all_texts:
			bows.append (vocab.doc2bow (text))

		return bows, vocab