Exemplo n.º 1
0
                             strip_accents='unicode',
                             norm='l2',
                             sublinear_tf=True)
# Fit the vectorizer on the first 2000 documents and keep the resulting
# document-term matrix (toarray() below implies it is a sparse matrix).
tfRawMatrix = vectorizer.fit_transform(lines[0:2000])
print(tfRawMatrix)
print("Data dimensions: {}".format(tfRawMatrix.shape))

# Dense copy of the document-term matrix. toarray() already returns a NumPy
# ndarray, so wrapping it in np.array() again would only make a second copy.
tfdtm = tfRawMatrix.toarray()
print(tfdtm)

# Vocabulary indexed by column. get_feature_names() was removed in
# scikit-learn 1.2; use get_feature_names_out() when it exists and fall back
# to the old method on installs that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    tfVocab = np.asarray(vectorizer.get_feature_names_out())
else:
    tfVocab = np.asarray(vectorizer.get_feature_names())
print(tfVocab[79])

# Per-term document frequency: the number of documents in which each term has
# a non-zero weight. The vectorizer exposes no _document_frequency() method
# (in scikit-learn that name is a private module-level helper), so derive the
# counts from the dense matrix instead.
tfDocFreq = np.count_nonzero(tfdtm, axis=0)

# Weight of term column 204 in document 1 (only displayed when run
# interactively).
tfdtm[1, 204]

# Question: why is the TF-IDF score for index 79 (i.e. "flatline") equal to 1?
'''Performing the count vectorization, which is the same as building the bag of words'''
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the same 2000 documents. binary=False keeps the
# raw per-document term counts rather than 0/1 presence flags; min_df=0.006
# drops terms that appear in fewer than 0.6% of the documents.
count_vect = CountVectorizer(min_df=0.006,
                             stop_words=stopwordList,
                             strip_accents='unicode',
                             binary=False)
rawdtm = count_vect.fit_transform(lines[0:2000])

# get_feature_names() was removed in scikit-learn 1.2. Prefer the replacement
# when available, converting back to a plain list so `vocab` keeps the type
# the old API returned; fall back to the old method on older installs.
if hasattr(count_vect, "get_feature_names_out"):
    vocab = count_vect.get_feature_names_out().tolist()
else:
    vocab = count_vect.get_feature_names()

# Convert the document-term matrix to a regular (dense) array.
dtm = rawdtm.toarray()