예제 #1
0
	def extractTopicWordWeights( self, model_path ):
		"""
		Load the topic-word weights file and store it on self.model as a dense matrix.
		
		The file '<model_path>/<ImportMallet.TOPIC_WORD_WEIGHTS>' is read through
		UnicodeReader; every row must unpack to (topic, word, value), where topic is
		parsed as an int and value as a float.
		
		Results written to self.model:
		  term_topic_matrix -- list of rows, one per term (sorted), each row holding
		                       one weight per topic (sorted by topic number)
		  term_index        -- sorted list of the distinct words seen in the file
		  topic_index       -- labels 'Topic <n>' in the same order as the columns
		
		A (topic, word) pair that does not appear in the file gets weight 0.0, so a
		genuinely sparse input no longer raises KeyError while building the matrix.
		"""
		# topic number -> { word -> weight }
		data = {}
		
		# Read in content of file (sparse matrix representation)
		filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
		with open( filename, 'r' ) as f:
			for (topic, word, value) in UnicodeReader( f ):
				topic = int(topic)
				data.setdefault( topic, {} )[ word ] = float(value)
		
		# Get list of terms and topic indexes straight from the nested dict,
		# rather than accumulating one list entry per input row.
		topic_index = sorted( data )
		term_index = sorted( set( word for weights in data.values() for word in weights ) )
		
		# Build dense matrix representation (rows = terms, columns = topics)
		matrix = [
			[ data[ topic ].get( term, 0.0 ) for topic in topic_index ]
			for term in term_index
		]
		
		# Generate topic labels
		topic_str_index = [ 'Topic {}'.format(d) for d in topic_index ]
		
		self.model.term_topic_matrix = matrix
		self.model.term_index = term_index
		self.model.topic_index = topic_str_index
예제 #2
0
 def read( self ):
     """
     Populate self.data from the tokens file.

     The file path is self.path concatenated with TokensAPI.TOKENS. Each row
     yielded by UnicodeReader must unpack to (document id, token string); the
     token string is split on single spaces and stored under the document id.
     Any previous content of self.data is discarded.
     """
     tokensByDoc = {}
     self.data = tokensByDoc
     tokensPath = self.path + TokensAPI.TOKENS
     with open( tokensPath, 'r' ) as handle:
         for row in UnicodeReader( handle ):
             ( docID, docTokens ) = row
             tokensByDoc[ docID ] = docTokens.split( ' ' )
예제 #3
0
def ReadAsSparseMatrix( filename ):
	"""
	Read a three-column file into a dict keyed by (first key, second key).
	
	Each row yielded by UnicodeReader must unpack to exactly three fields; the
	third is converted to float. If a key pair occurs more than once, the
	last occurrence wins.
	"""
	with open( filename, 'r' ) as handle:
		return {
			( firstKey, secondKey ): float( cell )
			for ( firstKey, secondKey, cell ) in UnicodeReader( handle )
		}
예제 #4
0
def ReadAsSparseVector( filename ):
	"""
	Read a two-column file into a dict mapping key -> float value.
	
	Each row yielded by UnicodeReader must unpack to exactly two fields; the
	second is converted to float. A repeated key keeps its last value.
	"""
	with open( filename, 'r' ) as handle:
		rows = UnicodeReader( handle )
		return dict( ( name, float( cell ) ) for ( name, cell ) in rows )
예제 #5
0
def ReadAsMatrix( filename ):
	"""
	Read a file of numeric rows into a dense matrix (list of lists of float).
	
	Each row yielded by UnicodeReader becomes one inner list, with every cell
	converted to float. Rows are built with a list comprehension instead of
	map(), because map() returns a one-shot iterator on Python 3 and would leave
	the matrix holding iterators rather than lists.
	"""
	with open( filename, 'r' ) as f:
		return [ [ float( cell ) for cell in line ] for line in UnicodeReader( f ) ]
예제 #6
0
    def readCsvAsMatrixStr(self, model_path, filename):
        """
        Return the content of a CSV file as a matrix (list of rows).

        The file read is '<model_path>/<filename>'. Each row of the result
        corresponds to one record produced by UnicodeReader with ',' as the
        delimiter, and each cell to one comma-separated value of that record.
        Values are returned as read; no numeric conversion is applied.
        """
        csv_path = '{}/{}'.format(model_path, filename)
        with open(csv_path, 'r') as handle:
            return list(UnicodeReader(handle, delimiter=','))