예제 #1
0
    def _makeTextMatrix(self, inputFile, stopwordFile):
        """Build a text matrix with TmgSimple and dump it to two text files.

        The word list returned by ``get_words(sort=True)`` is written one
        word per line to ``attributFile``.  The matrix is then fetched in 40
        consecutive slices of 1000 and written to ``dataFile`` with
        ``np.savetxt`` using the integer format ``'%i'``.

        Args:
            inputFile: Not read by this method.
                NOTE(review): the matrix is built from ``formattedDatabase``
                instead -- confirm whether ``inputFile`` was meant here.
            stopwordFile: Passed to TmgSimple as ``stopwords_filename``.

        ``formattedDatabase``, ``attributFile`` and ``dataFile`` are not
        defined inside this method; they must be resolvable from an
        enclosing scope when it runs.
        """
        # Generate text matrix with TmgSimple from 02450
        textMatrix = TmgSimple(filename=formattedDatabase,
                               stopwords_filename=stopwordFile)

        attributeNames = textMatrix.get_words(sort=True)

        # Context managers guarantee each output file is flushed and closed,
        # even when a write raises part-way through.
        with open(attributFile, 'w') as attFile:
            attFile.writelines(word + '\n' for word in attributeNames)

        with open(dataFile, 'w') as datFile:
            # Slice i covers [i*1000, (i+1)*1000); presumably these are
            # document (row) ranges of the matrix -- TODO confirm against
            # TmgSimple.get_matrix.
            for i in range(40):
                chunk = textMatrix.get_matrix(i*1000, (i+1)*1000, sort=True)
                np.savetxt(datFile, chunk, fmt='%i')
예제 #2
0
# exercise 3.1.4
import numpy as np
from tmgsimple import TmgSimple

# Build the text matrix through TmgSimple, supplying a stop-word file and
# enabling the stem option.
tm = TmgSimple(
    filename='../Data/textDocs.txt',
    stopwords_filename='../Data/stopWords.txt',
    stem=True,
)

# Pull the matrix and its word list out of the TmgSimple object
# (both requested with sort=True).
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Show the word list first, then the matrix.
print(attributeNames)
print(X)

# Completion marker for the exercise.
print('Ran Exercise 3.1.4')
예제 #3
0
# Six-word list; presumably the hand-picked bag of words for the exercise
# (it is not referenced by the code visible here).
bagOfWords = [
    'matrix',
    'Google',
    'ranking',
    'web',
    'webpage',
    'rank',
]

"""
3.1.2
"""
import numpy as np
from tmgsimple import TmgSimple
from similarity import similarity

# Generate text matrix with help of simple class TmgSimple
# (no stop-word file is supplied for this run).
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt')

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNamesWithOutStop = tm.get_words(sort=True)

# Display the result.  print(...) with a single argument produces the same
# output on Python 2 and Python 3, whereas the bare `print x` statement form
# is a SyntaxError on Python 3.
print(attributeNamesWithOutStop)
print(X)

"""
3.1.3
With stopwords
"""
print('Now with stopwords !!!')
# Rebuild the text matrix, this time handing TmgSimple a stop-word file.
tm = TmgSimple(
    filename='../02450Toolbox_Python/Data/textDocs.txt',
    stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt',
)

# Pull the matrix and its word list out of the new TmgSimple object.
X = tm.get_matrix(sort=True)
attributeNamesWithStop = tm.get_words(sort=True)
# exercise 2.1.4

import numpy as np
from tmgsimple import TmgSimple

# Generate text matrix with help of simple class TmgSimple, supplying a
# stop-word file and enabling the stem option.
tm = TmgSimple(
    filename='../Data/textDocs.txt',
    stopwords_filename='../Data/stopWords.txt',
    stem=True,
)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result.  The single-argument print(...) form prints the same
# text on Python 2 and Python 3; the bare `print x` statement is a
# SyntaxError on Python 3.
print(attributeNames)
print(X)