# exercise 3.1.4
# Feeds textDocs.txt to the toolbox's TmgSimple helper together with a
# stop-word list and stem=True, then prints the extracted words and matrix.
import numpy as np
from tmgsimple import TmgSimple

# Input files (relative paths).
docs_path = '../Data/textDocs.txt'
stopwords_path = '../Data/stopWords.txt'

# Generate the text matrix; a stop-word file is supplied and stemming is on.
tm = TmgSimple(
    filename=docs_path,
    stopwords_filename=stopwords_path,
    stem=True,
)

# Extract the variables representing the data. Both getters receive
# sort=True (presumably so the word list and the matrix columns share one
# ordering -- confirm against tmgsimple).
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result, then report completion.
print(attributeNames)
print(X)
print('Ran Exercise 3.1.4')
# exercise 3.1.2
# Feeds textDocs.txt to the toolbox's TmgSimple helper with only a filename
# (no stop-word or stemming arguments), then prints the extracted words and
# matrix.
from tmgsimple import TmgSimple

# Input file (relative path).
docs_path = '../Data/textDocs.txt'

# Generate the text matrix with TmgSimple's remaining arguments left at
# their defaults.
tm = TmgSimple(filename=docs_path)

# Extract the variables representing the data; both getters receive sort=True.
X = tm.get_matrix(sort=True)
attributeNames = tm.get_words(sort=True)

# Display the result.
print(attributeNames)
print(X)
# NOTE(review): the next paragraph is the tail of a task description whose
# beginning lies before this excerpt; it is kept verbatim as a comment.
#
# ... document at least contains 2 of your key words, i.e. the document-term
# matrix should have approximately 10 columns and each row of the matrix must
# at least contain 2 non-zero entries.
bagOfWords = ['matrix', 'Google', 'ranking', 'web', 'webpage', 'rank']

# --- 3.1.2 ---
import numpy as np
from tmgsimple import TmgSimple
from similarity import similarity

# Generate text matrix with help of simple class TmgSimple (filename only,
# no stop-word list).
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt')

# Extract variables representing data; both getters receive sort=True.
X = tm.get_matrix(sort=True)
attributeNamesWithOutStop = tm.get_words(sort=True)

# Display the result. print() is called as a function so the script parses
# on Python 3 and matches the print('...') call used further down.
print(attributeNamesWithOutStop)
print(X)

# --- 3.1.3: with stopwords ---
print('Now with stopwords !!!')
# Re-create the helper, this time also passing the stop-word file.
tm = TmgSimple(
    filename='../02450Toolbox_Python/Data/textDocs.txt',
    stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt',
)
# exercise 2.1.2
# Feeds textDocs.txt to TmgSimple with a stop-word list, stem=True and
# min_term_length=5, then prints the extracted words and matrix.
from tmgsimple import TmgSimple
import tmgsimple

# help(tmgsimple)  # uncomment to browse the toolbox module's documentation

# NOTE(review): these are absolute paths into one particular user's Windows
# profile; the script only runs unchanged on that machine. Values are kept
# exactly as they were (adjacent literals are concatenated by the parser).
fn = ('C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining'
      '\\02450Toolbox_Python\\Data\\textDocs.txt')
stopwords = ('C:\\Users\\Bahram\\PycharmProjects\\Machine-Learning-and-Data-Mining'
             '\\02450Toolbox_Python\\Data\\stopWords.txt')

# Generate the text matrix with help of simple class TmgSimple.
tm = TmgSimple(
    filename=fn,
    stopwords_filename=stopwords,
    stem=True,
    min_term_length=5,
)

# Extract variables representing data; both getters receive sort=True.
attributeNames = tm.get_words(sort=True)
x = tm.get_matrix(sort=True)

# Display the result. print() is called as a function so the script parses
# on Python 3; with a single argument the output is the same on Python 2.
print(attributeNames)
print(x)