strip_accents='unicode', norm='l2', sublinear_tf=True) tfRawMatrix = vectorizer.fit_transform(lines[0:2000]) tfRawMatrix print(tfRawMatrix) print("Data dimensions: {}".format(tfRawMatrix.shape)) vectorizer.get_feature_names() tfdtm = tfRawMatrix.toarray() #convert the dtm to numpy array tfdtm = np.array(tfdtm) print(tfdtm) tfVocab = np.array(vectorizer.get_feature_names()) print(tfVocab[79]) vectorizer._document_frequency() tfdtm[1, 204] #?how come the tfIdf score for the idex 79 i.e. flatline =1 '''Performign the count vectorization which is same as finding the bag of words''' from sklearn.feature_extraction.text import CountVectorizer count_vect = CountVectorizer(min_df=0.006, stop_words=stopwordList, strip_accents='unicode', binary=False) rawdtm = count_vect.fit_transform(lines[0:2000]) vocab = count_vect.get_feature_names() #convert the dtm to regular array dtm = rawdtm.toarray()