def wordcloud(datafile):
    """Render one word-cloud image per distinct value of the "Category" column.

    All "Content" values that share a category are joined into a single text,
    a word cloud is generated from it and saved as
    ``../wordcloud/wordcloud_<category>.jpg``.

    :param datafile: table exposing ``iterrows()`` whose rows carry the
        "Category" and "Content" fields (used like a pandas DataFrame).
    :return: None
    """
    # Remove stop words, the most common words in a language.  The shared
    # STOPWORDS collection is extended in place, exactly as before.
    vectorizer = CountVectorizer(stop_words='english')
    STOPWORDS.update(vectorizer.get_stop_words())
    STOPWORDS.add("said")

    pony_mask = np.array(Image.open("../pinkyB.jpg"))
    wc = WordCloud(background_color="black", max_words=2000,
                   mask=pony_mask, stopwords=STOPWORDS)

    # Collect the texts per category in lists and join once at the end:
    # repeated ``+=`` on strings is quadratic, and joining without a
    # separator fused the last word of one article with the first word of
    # the next one.
    texts_by_category = {}
    for _, row in datafile.iterrows():
        texts_by_category.setdefault(row["Category"], []).append(
            str(row["Content"]))

    # ``items()`` works on both Python 2 and 3 (``iteritems()`` is 2-only).
    for category, texts in texts_by_category.items():
        wc.generate(" ".join(texts))
        image = wc.to_image()
        image.save("../wordcloud/wordcloud_" + category + ".jpg")
    return
def test_countvectorizer_stop_words():
    """Check get_stop_words() for the built-in list, bad names and a custom list."""
    cv = CountVectorizer()

    # The "english" shortcut resolves to the built-in stop word collection.
    cv.set_params(stop_words="english")
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)

    # Any other string is not a known built-in list and must be rejected.
    cv.set_params(stop_words="_bad_str_stop_")
    assert_raises(ValueError, cv.get_stop_words)
    cv.set_params(stop_words="_bad_unicode_stop_")
    assert_raises(ValueError, cv.get_stop_words)

    # A custom collection is compared by content only: normalising both
    # sides to sets keeps the assertion valid whether the vectorizer hands
    # the list back unchanged or converts it to a (frozen)set.
    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert_equal(set(cv.get_stop_words()), set(stoplist))
def test_countvectorizer_stop_words():
    """Exercise get_stop_words() with the built-in list, invalid names and a custom list."""
    vectorizer = CountVectorizer()

    # Built-in English list.
    vectorizer.set_params(stop_words='english')
    assert_equal(vectorizer.get_stop_words(), ENGLISH_STOP_WORDS)

    # Strings that do not name a built-in list must raise ValueError.
    for unknown_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=unknown_name)
        assert_raises(ValueError, vectorizer.get_stop_words)

    # A user-supplied list is expected back as a set with the same members.
    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert_equal(vectorizer.get_stop_words(), set(custom_words))
def test_countvectorizer_stop_words():
    """Check get_stop_words() for the built-in list, bad names and a custom list."""
    cv = CountVectorizer()

    # The 'english' shortcut resolves to the built-in stop word collection.
    cv.set_params(stop_words='english')
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)

    # Any other string is not a known built-in list and must be rejected.
    cv.set_params(stop_words='_bad_str_stop_')
    assert_raises(ValueError, cv.get_stop_words)
    cv.set_params(stop_words='_bad_unicode_stop_')
    assert_raises(ValueError, cv.get_stop_words)

    # A custom collection is compared by content only: normalising both
    # sides to sets keeps the assertion valid whether the vectorizer hands
    # the list back unchanged or converts it to a (frozen)set.
    stoplist = ['some', 'other', 'words']
    cv.set_params(stop_words=stoplist)
    assert_equal(set(cv.get_stop_words()), set(stoplist))
def initialize(texts_train, texts_test):
    """Preprocess both text collections and turn them into bag-of-words corpora.

    :param texts_train: indexable collection of training texts (positions
        ``0 .. len-1`` are read).
    :param texts_test: indexable collection of test texts, read the same way.
    :return: tuple ``(dictionary, bow_corpus, test_corpus)`` where the
        dictionary is built from the training texts only and both corpora
        are the ``doc2bow`` encodings against that dictionary.
    """
    # ``preprocess`` receives the stop words as a dict mapping word -> 1.
    english_stop_words = CountVectorizer(stop_words="english").get_stop_words()
    stop = dict.fromkeys(english_stop_words, 1)

    def _preprocess_all(texts, stemmer):
        # Same positional access as before: texts[0], texts[1], ...
        return [preprocess(texts[pos], stop, stemmer)
                for pos in range(len(texts))]

    print("Preprocessing training file")
    ps = PorterStemmer()
    texts_train_p = _preprocess_all(texts_train, ps)

    print("Preprocessing test file")
    texts_test_p = _preprocess_all(texts_test, ps)

    print("Generating dictionary")
    dictionary = gensim.corpora.Dictionary(texts_train_p)
    dictionary.filter_extremes(no_below=15, no_above=0.5)

    bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts_train_p]
    test_corpus = [dictionary.doc2bow(tokens) for tokens in texts_test_p]
    return dictionary, bow_corpus, test_corpus
def count_words(text):
    """Count the occurrences of every token in ``text``.

    The text is treated as a single-document corpus and tokenized by
    ``CountVectorizer`` with the pattern ``\\w+``.

    :param text: the string to analyse.
    :return: dict mapping each token (as ``str``) to its count, taken from
        the first (only) row of the fitted count matrix.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    vectorizer = CountVectorizer(token_pattern=r'\w+')
    counts = vectorizer.fit_transform([text]).toarray()[0]
    tokens = vectorizer.get_feature_names()
    return {str(token): count for token, count in zip(tokens, counts)}
def count_words(text):
    """Count the occurrences of every token in ``text``.

    The text is treated as a single-document corpus and tokenized by
    ``CountVectorizer`` with the pattern ``\\w+``.

    :param text: the string to analyse.
    :return: dict mapping each token (as ``str``) to its count, taken from
        the first (only) row of the fitted count matrix.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    vectorizer = CountVectorizer(token_pattern=r'\w+')
    counts = vectorizer.fit_transform([text]).toarray()[0]
    tokens = vectorizer.get_feature_names()
    return {str(token): count for token, count in zip(tokens, counts)}
def Common_Vectorizer_usage(): from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer(min_df=1) corpus = [ 'This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?', ] analyze = vectorizer.build_analyzer() print analyze("This is a text document to analyze.") print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze'] X=vectorizer.fit_transform(corpus) print vectorizer.get_feature_names() print vectorizer.vocabulary_ #.get('document') print vectorizer.transform(['Something completely new.']).toarray() print list(X) #bigram======================================================== bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1) analyze = bigram_vectorizer.build_analyzer() print analyze('Bi-grams are cool!') X_2 = bigram_vectorizer.fit_transform(corpus).toarray() print X_2 feature_index = bigram_vectorizer.vocabulary_.get('is this') print X_2[:, feature_index] #marui test print '\n\nmarui test=====================' def t_preprocessor(s): return ','.join([x.lower() for x in s.split(' ')]) stop_words1=['is','a','this'] #is ok: frozenset(['a', 'this', 'is']) stop_words2={'is':0,'a':1,'this':2} #is ok: convert to frozenset(['a', 'this', 'is']) cv = CountVectorizer(preprocessor=t_preprocessor,stop_words=stop_words2) params=cv.get_params() print 'get_params()',type(params),'---------------' for k in params: print k,'\t',params[k] print 'get_params end--------------' print '\nget_stop_words=',cv.get_stop_words() cv.fit(corpus) print cv.get_feature_names() print cv.transform(corpus).toarray() print '\n测试preprocesser, result:\t',cv.build_preprocessor()('this is a document') print '\n测试tokenizer,result',cv.build_tokenizer()('this is a document') print '\n测试tokenizer2,result',cv.build_tokenizer()('th-is is a document') print '\n测试tokenizer2,result',cv.build_tokenizer()('th_is is a document') 
print '\n测试tokenizer2,result',cv.build_tokenizer()('th&is is a document') """
def test_count_vectorizer_stopwords():
    """CountVectorizer with English stop words yields the expected vocabulary and counts."""
    expected_features = [
        'ate', 'got', 'hens', 'kings', 'men', 'sleep', 'tired', 'went', 'zzz',
    ]
    expected_vals = np.array([
        [0, 0, 0, 1, 1, 0, 0, 0, 0],  # "all the kings men"
        [1, 0, 1, 1, 0, 0, 0, 0, 0],  # "ate all the kings hens"
        [0, 1, 0, 0, 0, 1, 1, 1, 1],  # "until they all got tired and went to sleep zzz"
    ])

    vectorizer = CountVectorizer(stop_words="english")
    assert "all" in vectorizer.get_stop_words()

    counts = vectorizer.fit_transform(DOCUMENTS)
    assert vectorizer.get_feature_names() == expected_features
    assert np.array_equal(counts.toarray(), expected_vals)
def feature(corpus, n_clusters=19, n_top_terms=10):
    """Vectorize ``corpus`` with TF-IDF, cluster it with k-means and print the top terms.

    sklearn's TF-IDF support mainly relies on two classes, CountVectorizer
    and TfidfTransformer.  CountVectorizer.fit_transform turns the words of
    the texts into a term-frequency matrix whose element [i][j] is the
    number of times word j occurs in text i; get_feature_names() lists all
    words of the vocabulary and toarray() shows the matrix.
    TfidfTransformer.fit_transform computes the tf-idf values from it.

    :param corpus: iterable of documents accepted by CountVectorizer.
    :param n_clusters: number of k-means clusters.  The default of 19 is the
        value the original author settled on after a (since removed) scan
        over k = 2..20 that compared ``km.inertia_`` / ``km.score``.
    :param n_top_terms: how many of the highest-weighted terms to print for
        each cluster centre.
    :return: None; all results are printed.
    """
    vectorizer = CountVectorizer()    # words -> term-frequency matrix
    transformer = TfidfTransformer()  # term frequencies -> tf-idf weights
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    print('%%%%%%%%%')
    print(tf_idf)
    print('%%%%%%%%%')

    word = vectorizer.get_feature_names()  # every word of the bag-of-words model
    print('&&&&&&&&&')
    print(word)
    print('&&&&&&&&&')

    print('$$$$$$$$$')
    weight = tf_idf.toarray()  # dense tf-idf matrix: [i][j] is the weight of word j in text i
    print(weight)
    print('$$$$$$$$$')

    # Only the training part of the split is clustered.
    train_x, _ = train_test_split(tf_idf, test_size=0.2)

    km = KMeans(n_clusters=n_clusters)
    km.fit(train_x)

    # Per cluster: vocabulary indices ordered from largest to smallest
    # centre coordinate.
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    print(vectorizer.get_stop_words())
    for i in range(n_clusters):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :n_top_terms]:
            print(' %s' % word[ind], end='')
        print()
def test_countvectorizer_stop_words():
    """Exercise get_stop_words() with the built-in list, invalid names and a custom list."""
    vectorizer = CountVectorizer()

    # Built-in English list.
    vectorizer.set_params(stop_words='english')
    assert vectorizer.get_stop_words() == ENGLISH_STOP_WORDS

    # Strings that do not name a built-in list must raise ValueError.
    for unknown_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=unknown_name)
        with pytest.raises(ValueError):
            vectorizer.get_stop_words()

    # A user-supplied list is expected back as a set with the same members.
    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert vectorizer.get_stop_words() == set(custom_words)
def main():
    """Tokenize the abstracts in ``input_file`` and pickle three derived structures.

    Reads the JSON document list from the module-level ``input_file`` and
    writes, with pickle:

    * ``documents_file``: the documents whose abstract is non-empty after
      tokenization, each with ``'abstract'`` replaced by its word list;
    * ``doc_counter_list_file``: one ``Counter`` of non-stop-words per
      kept document;
    * ``word_appear_doc_dict_file``: ``{word: [document index, ...]}`` for
      every non-stop-word.
    """
    # load stop words: sklearn's English list united with NLTK's
    vectorizer = CountVectorizer(stop_words='english')
    stop_word_set = vectorizer.get_stop_words()
    stop_word_set = stop_word_set.union(set(stopwords.words('english')))

    # Context manager so the input handle is closed deterministically.
    with open(input_file, "r") as f:
        D_raw = json.load(f)

    # D: [{'entity':entity_string,'abstract':[word]}]
    D = []
    for doc in D_raw:
        if not doc['abstract']:
            continue
        # Lower-case, replace everything but a-z with blanks, split on
        # whitespace; documents left without any word are dropped.
        doc['abstract'] = re.sub(
            "[^a-z]", " ", doc['abstract'].strip().lower()).split()
        if doc['abstract']:
            D.append(doc)
    with open(documents_file, "wb") as f:
        pickle.dump(D, f)

    # make each document a counter
    doc_counter_list = [
        Counter(w for w in doc['abstract'] if w not in stop_word_set)
        for doc in D
    ]
    with open(doc_counter_list_file, "wb") as f:
        pickle.dump(doc_counter_list, f)

    # for each word, record which documents it appears
    # {'word': [document]}
    word_appear_doc_dict = dict()
    for d, doc in enumerate(D):
        for word in doc['abstract']:
            if word in stop_word_set:
                continue
            docs = word_appear_doc_dict.setdefault(word, [])
            # Document indices only grow, so a duplicate can only be the
            # last entry; this replaces a linear ``d in docs`` scan.
            if not docs or docs[-1] != d:
                docs.append(d)
    with open(word_appear_doc_dict_file, "wb") as f:
        pickle.dump(word_appear_doc_dict, f)
def BoF(data, custom_tokenizer=None, tfidf=False, tokenize=True,
        strip_accents=None, tokenizer=None, stop_words=None,
        token_pattern=r'(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0,
        min_df=1, max_features=None):
    """Fit a bag-of-words vectorizer on ``data`` and return the transformed data.

    Vectorizer choice: ``CountVectorizer`` with ``custom_tokenizer`` when
    one is given, otherwise ``TfidfVectorizer`` when ``tfidf`` is true,
    otherwise a plain ``CountVectorizer``.

    :param data: iterable of raw documents.
    :param custom_tokenizer: optional tokenizer callable; takes precedence
        over ``tfidf``.
    :param tfidf: use ``TfidfVectorizer`` (ignored when ``custom_tokenizer``
        is set).
    :param tokenize: must be true; no vectorizer is built otherwise.
    :param strip_accents: accepted for interface compatibility, not used.
    :param tokenizer: accepted for interface compatibility, not used.
    :param token_pattern: accepted for interface compatibility, not used.
        The default is now a raw string: in a normal literal ``\\b`` is a
        backspace character, not a regex word boundary.
    :param stop_words, ngram_range, max_df, min_df, max_features: forwarded
        unchanged to the vectorizer.
    :return: tuple ``(X, bag)``: the transformed data and the fitted
        vectorizer.
    :raises ValueError: if ``tokenize`` is false.
    """
    # NOTE(review): with tokenize=False the original reached
    # ``bag.transform`` with ``bag`` never assigned; fail with a clear
    # message instead.  Confirm no caller relies on tokenize=False.
    if not tokenize:
        raise ValueError(
            "BoF only supports tokenize=True: no vectorizer is built otherwise")

    # Options shared by all three vectorizer variants.
    shared = dict(min_df=min_df, max_df=max_df, max_features=max_features,
                  ngram_range=ngram_range, stop_words=stop_words)
    if custom_tokenizer:
        bag = CountVectorizer(tokenizer=custom_tokenizer, **shared).fit(data)
    elif tfidf:
        bag = TfidfVectorizer(**shared).fit(data)
    else:
        bag = CountVectorizer(**shared).fit(data)

    X = bag.transform(data)
    print(
        "Vocabulary size: {}, training set size: {} samples * {} features".
        format(len(bag.get_feature_names()), X.shape[0], X.shape[1]))
    print('# of tokens automatically excluded from the vocabulary:',
          len(bag.stop_words_))
    stopwords_eff = bag.get_stop_words()
    if stopwords_eff:
        print('# of stopwords that were effectively excluded :',
              len(stopwords_eff))
    return X, bag
def tokenize_sentence(self, sentence, pre_process_sentence=None) -> np.ndarray:
    """Return the words of ``sentence`` as an array, one entry per occurrence.

    The sentence is vectorized as a one-document corpus; every vocabulary
    word is then emitted as many times as it was counted.

    :param sentence: the text to tokenize.
    :param pre_process_sentence: optional preprocessor callable; when
        ``None`` the instance's ``pre_process_corpus`` is used.
    :return: ``np.ndarray`` of words, or an array holding a single empty
        string when nothing was tokenized (a notice is printed then).
    """
    preprocessor = (self.pre_process_corpus
                    if pre_process_sentence is None
                    else pre_process_sentence)
    vectorizer = CountVectorizer(
        token_pattern=self.token_pattern['token_pattern'],
        preprocessor=preprocessor,
        stop_words='english' if self.args['stopwords'] else None,
    )

    words_out = []
    # Vectorize only when at least one word is not a stop word.
    worth_vectorizing = (
        not self.is_all_stop_words(sentence, vectorizer.get_stop_words())
        and len(sentence) != 0
    )
    if worth_vectorizing:
        bow = vectorizer.fit_transform([sentence])
        vocabulary = vectorizer.get_feature_names()
        # Assumes the stored counts in bow.data line up positionally with
        # the vocabulary; TODO confirm.
        for position, count in enumerate(bow.data):
            words_out.extend([vocabulary[position]] * count.item())

    if not words_out:
        print('tokenized is empty')
        words_out.append('')
    return np.asarray(words_out)
def get_label(self, corpus, n_cluster=5):
    """
    Cluster the TF-IDF vectors of ``corpus`` with k-means and print each cluster's top terms.

    According to the original author's matplotlib plot, the best number of
    clusters is 5, hence the default.

    :param corpus: iterable of documents accepted by CountVectorizer
    :param n_cluster: number of k-means clusters
    :return: None; the stop words and the ten highest-weighted terms of
        every cluster centre are printed
    """
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    term_counts = vectorizer.fit_transform(corpus)
    tf_idf = transformer.fit_transform(term_counts)

    # Only the training part of the split is clustered.
    train_x, _ = train_test_split(tf_idf, test_size=0.2)
    km = KMeans(n_clusters=n_cluster)
    km.fit(train_x)

    # Per cluster: vocabulary indices from largest to smallest centre value.
    ranked_terms = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    print(vectorizer.get_stop_words())
    for cluster_id in range(n_cluster):
        print("Cluster %d:" % cluster_id, end='')
        for term_idx in ranked_terms[cluster_id, :10]:
            print(' %s' % terms[term_idx], end='')
        print('\n')
>>> X.toarray()[0] array([1, 1, 1, 1, 1, 0, 1], dtype=int64) >>> X.toarray()[1,2] 1 >>> from sklearn.datasets import fetch_20newsgroups >>> categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'] >>> twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) >>> from sklearn.feature_extraction.text import CountVectorizer >>> vectorizer = CountVectorizer() >>> train_counts = vectorizer.fit_transform(twenty_train.data) >>> vectorizer.vocabulary_.get('algorithm') 4690 >>> len(vectorizer.get_feature_names()) 35788 >>> vectorizer = CountVectorizer(stop_words='english') >>> sorted(vectorizer.get_stop_words())[:20] ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst'] >>> import nltk >>> s = nltk.stem.SnowballStemmer('english') >>> s.stem("cats") 'cat' >>> nltk.download() showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml True >>> from nltk.tokenize import word_tokenize >>> text = word_tokenize("And now for something completely different") >>> text ['And', 'now', 'for', 'something', 'completely', 'different'] >>> nltk.pos_tag(text) [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')] >>> from sklearn.feature_extraction.text import CountVectorizer
def __call__(self, articles):
    """Tokenize ``articles`` and return the list of lemmatized tokens.

    Each token from ``word_tokenize`` is lemmatized with ``self.wnl`` and
    stripped of anything matching the e-mail-like pattern; tokens that end
    up empty are dropped.

    The previous version overwrote ``t`` on every iteration and then
    iterated over the *characters* of the lemma, so it returned only the
    characters of the last token instead of a token list.

    :param articles: the document text to tokenize.
    :return: list of lemmatized token strings.
    """
    tokens = []
    for token in word_tokenize(articles):
        lemma = re.sub(r'\S*@\S*\s?', '', self.wnl.lemmatize(token))
        if lemma:
            tokens.append(lemma)
    return tokens


categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

count_vect_1 = CountVectorizer()  # vocabulary 1
count_vect_2 = CountVectorizer(tokenizer=LemmaTokenizer(
))  # keeps words of 3 or more characters) # vocabulary 2

# Fit both vectorizers on the training documents, then compare the
# vocabulary sizes.
count_vect_1.fit_transform(twenty_train.data)
count_vect_2.fit_transform(twenty_train.data)
print(len(set(count_vect_1.get_feature_names())))
print(count_vect_1.get_stop_words())
print(len(count_vect_2.get_feature_names()))
capabilities. === Post 2 with dist=0.92: Most imaging databases safe images permanently. === Post 3 with dist=0.77: Imaging databases store data. === Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data. Best post is 3 with dist=0.77 ''' ### Removing less important words # called stop words, appear everywhere, carry little information vectorizer = CountVectorizer(min_df=1, stop_words='english') # usual stop words in english sorted(vectorizer.get_stop_words())[0:20] ''' ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', ''' # 18 words now len(vectorizer.get_feature_names()) best_post(dist_norm) ''' === Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff. === Post 1 with dist=0.86: Imaging databases provide storage capabilities. === Post 2 with dist=0.86: Most imaging databases safe images
d = dist_norm(post_vec, new_post_vec) print(i, d, post) if d<best_dist: best_dist = d best_i = i print print(best_i) print(best_dist) print(posts[best_i]) vect_stop = CountVectorizer(min_df=1, stop_words='english') print(vect_stop.get_stop_words()) eng_stremmer = nltk.stem.SnowballStemmer('english') class StemmedCountVectorizer(CountVectorizer): def build_analyzer(self): analyzer = super(StemmedCountVectorizer, self).build_analyzer() return lambda doc: (eng_stremmer.stem(w) for w in analyzer(doc)) vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english') x_train = vectorizer.fit_transform(posts) print(vectorizer.get_feature_names()) print(x_train.toarray()) class StemmedTfidVectorizer(TfidfVectorizer):
# Linear scan over the vectorized posts: keep the index and distance of the
# post closest to new_post, skipping the post identical to new_post itself.
# (Python 2 syntax; best_dist / best_i are initialised outside this excerpt.)
for i in range(0, num_samples):
    post = z[i]
    if(post==new_post):
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print " Post %i with dist= %.2f: %s" % (i, d, post)
    if d < best_dist:
        best_dist = d
        best_i = i
print "Best post is %i with dist= %.2f" % (best_i, best_dist)

# Show the first 20 (alphabetically) built-in English stop words.
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
print sorted(vectorizer.get_stop_words())[:20]

# With stemmer
# Reset the search state and repeat the same scan.
# NOTE(review): X_train / new_post_vec are not recomputed in this excerpt;
# confirm they are rebuilt with the stemming vectorizer before this loop.
best_doc = None
best_dist = sys.maxint
best_i = None
for i in range(0, num_samples):
    post = z[i]
    if(post==new_post):
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print " Post %i with dist= %.2f: %s" % (i, d, post)
    if d < best_dist:
        best_dist = d
        best_i = i
####### LDA ########
# Use tf features for LDA.
print("Extracting tf features for LDA...")

# WITH FUNKY STOP WORDS
# The built-in English stop words are extended with corpus-specific terms.
# A missing comma after 'intellectual' used to glue it to the next literal,
# producing the bogus stop word 'intellectualgraduate'.
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=1, max_features=n_features,
    stop_words=text.ENGLISH_STOP_WORDS.union([
        'cooper', 'students', 'faculty', 'free', 'value', 'critical',
        'thinking', 'education', 'work', 'school', 'skills', 'experience',
        'union', 'learning', 'think', 'tuition', 'professors', 'time',
        'student', 'learn', 'small', 'community', 'ability', 'learned',
        'problem', 'solving', 'life', 'art', 'ideas', 'body', 'institution',
        'quality', 'engineering', 'environment', 'career', 'peers', 'strong',
        'different', 'debt', 'creative', 'rigor', 'rigorous', 'diverse',
        'working', 'classes', 'people', 'exposure', 'focus', 'good', 'helped',
        'great', 'class', 'did', 'like', 'world', 'new', 'technical',
        'prepared', 'scholarship', 'hard', 'years', 'taught', 'way', 'unique',
        'critically', 'freedom', 'program', 'allowed', 'challenging', 'lot',
        'able', 'having', 'academic', 'professional', 'valued', 'classmates',
        'ethic', 'real', 'field', 'high', 'study', 'architecture',
        'undergraduate', 'opportunity', 'valuable', 'problems', 'nyc',
        'research', 'design', 'really', 'diversity', 'commitment',
        'intelligent', 'intellectual', 'graduate', 'dedication', 'access',
        'passionate', 'culture', 'appreciate', 'amazing', 'better',
        'experiences', 'understanding', 'opportunities', 'artists',
        'foundation', 'major', 'degree', 'course', 'difficult', 'smart',
        'institutions', 'graduate', 'intellectual', 'merit', 'city',
        'development', 'lab', 'schools', 'pursue', 'teaching', 'job',
        'succeed', 'arts', 'values', 'explore', 'communication', 'attended',
        'college', 'knowledge', 'practical', 'colleagues', 'teamwork',
        'group', 'future', 'resources', 'information', 'provide', 'engaged',
        'approach', 'fundamentals', 'practice', 'dr', 'curriculum',
        'educational', 'studies', 'artist', 'emphasis', 'tough',
        'reputation', 'teachers', 'disciplines', 'engaging', 'talent',
        'challenges',
        'material', 'dedicated', 'excellent', 'support', 'unparalleled',
        'challenged', 'truly', 'important', 'independent', 'best',
        'interaction', 'didn', 've', 'talented', 'professor', 'leadership',
        'teach', 'courses', 'projects', 'extremely', 'focused', 'helpful',
        'independence', 'analytical', 'engagement', 'general', 'challenge',
        'presentations', 'humanities', 'cu', 'perspective', 'computer',
        'interdisciplinary', 'grad', 'especially', 'generally', 'humanities',
        'incredible', 'brilliant', 'don', 'presentation', 'village',
        'particularly', 'engineers', 'highly', 'importance', 'staff', 'civic',
        'skill', 'demanding', 'artistic', 'atmosphere', 'graduating',
        'fostered', 'fact', 'artistic', 'cost', 'writing', 'connections',
        'critique', 'studio', 'discourse', 'instilled', 'thinker',
        'curiosity', 'graduated', 'long', 'paid', 'coursework', 'background',
        'provided', 'received', 'committed', 'higher', 'engineer', 'mentors',
        'teacher', 'creativity', 'grateful', 'mission', 'breadth', 'status',
        'collaboration', 'paying', 'shop', 'excellence', 'appreciation',
        'programs', 'wealth', 'graduation', 'facilities', 'studios',
        'undergrad', 'techniques', 'interviews', 'creatively', 'competitive',
        'project', 'resume', 'invaluable',
    ]))

#NORMAL BORING STOP WORDS
#tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words = text.ENGLISH_STOP_WORDS)

tf = tf_vectorizer.fit_transform(data)
lda_stop_words = tf_vectorizer.get_stop_words()

print("Fitting LDA models with tf features, "
      "n_samples=", n_samples, " and n_features=", n_features)
# NOTE(review): newer scikit-learn releases name this argument
# ``n_components``; confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

#outputs
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
#print("\nStop words:")
"""if all(a[i] in "0123456789" for i in range(len(a))): print("The string is an integer.")""" y="0123456789" for i in xt: for j in range(len(i)): if(i[j] not in y): s.append(i) break d = int(i) k = num2words(d) s.append(k) break print(s) #4 expanding abbrevation te='USA and GB are ...' abbrevs={'USA':'United States','GB':'Great Britain'} for ab in abbrevs: te= te.replace(ab,abbrevs[ab]) print(te) #remove stop words from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer(min_df=1, stop_words='english') stop=list(sorted(vectorizer.get_stop_words())) a="Artificial intelligence is the intelligence exhibited by machine" pu = " ".join(set([ch for ch in a.split(" ") if ch not in stop])) print(pu) #remove punctuation
# split my data into train and test sets from sklearn.cross_validation import train_test_split train, test = train_test_split(lyrics, test_size=0.2, random_state=42) print train.shape; print test.shape # In[79]: #============================================================================== # Process description fields of train set #============================================================================== # tokenize the text using countvectoriser from sklearn.feature_extraction.text import CountVectorizer count_vect = CountVectorizer(lowercase=True, stop_words='english', strip_accents='unicode') print count_vect.get_stop_words() # if wanting to use n-grams count_vect = CountVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True, stop_words='english', strip_accents='unicode') # In[80]: train # In[81]: # fit the count vectoriser X_train_counts = count_vect.fit_transform(train.lyrics) X_train_counts.shape
# Vectorize the new post with the already-fitted vectorizer and find the
# training post with the smallest distance to it.
new_post_vec = vectorizer.transform([new_post])
best_dist = sys.maxsize
best_i = None
# What follows is a simple linear scan that keeps the best (smallest
# distance) candidate.
for i in range(0, x_train.shape[0]):
    curr_dist = dist_norm(new_post_vec, x_train.getrow(i))
    print('post_num=%d,post_content=%s,dist=%.2f' % (i, posts[i], curr_dist))
    if curr_dist < best_dist:
        best_dist = curr_dist
        best_i = i
print('best_post_num=%d,post_content=%s,best_dist=%.2f' % (best_i, posts[best_i], best_dist))

#PART2
# stop_words: drop the high-frequency words that show up in every post.
# They contribute little to telling posts apart and should not carry the
# same weight as specific terms; the option is passed directly to the
# vectorizer above.
# This prints common English words.
print(sorted(vectorizer.get_stop_words())[0:20])

#PART3
# Weights of related words: "image" and "images" (or "information") are
# treated as completely separate, equally weighted words in the setup
# above, which is clearly unreasonable.
# nltk (Natural Language Toolkit) is a commonly used Python package for
# natural language processing.
# Stemming: the stem is the word from which the other forms are derived.
s = nltk.stem.SnowballStemmer('english')  # use the extended stemming rules (this call selects the rule set)
print(s.stem('tools'))  # result is "tool"

#PART4
# How to tune the parameters: different words should get different
# weights, so how should they be set?
# The usual assumption: the more often a word occurs in this document and
# the less often across all documents, the more representative it is.
# This is the TF-IDF formula; see the notes for the exact expression.
from sklearn.feature_extraction.text import TfidfVectorizer  # inherits from CountVectorizer; can be used directly to compute TF-IDF
# Build uni-/bi-gram count features from the "Comment" column, re-weight
# them with TF-IDF and create the Naive Bayes classifier.
print(hotel.shape)
print(X_train.shape)
print(y_train.shape)

# Unigrams and bigrams, English stop words removed, max_df=0.9.
count = CountVectorizer(stop_words='english',
                        tokenizer=None,
                        ngram_range=(1, 2),
                        min_df=1,
                        max_df=0.9)
temp = count.fit_transform(
    X_train['Comment'].values.astype('str'))  # word count for recurrent words
print(count.get_feature_names())
print("Stop Words:")
print(count.get_stop_words())
print(temp.shape)
#print("temp: " + temp)

tdif = TfidfTransformer(norm='l1')
temp2 = tdif.fit_transform(temp)  # Give words different Weights
#print(temp2)

# nb = GaussianNB()
mn = MultinomialNB()
# Must convert to dense matrix for GaussianNB
# X = temp2.todense()
# nb.fit(X, hotel['Rating'])
# Linear scan over the vectorized posts: keep the index and distance of the
# post closest to new_post, skipping the post identical to new_post itself.
# (best_dist / best_i are initialised outside this excerpt.)
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

vectorizer = CountVectorizer(
    min_df=1,
    stop_words='english')  # stop_words: when set, many words such as "most", "a", "about" are not counted
# Bare expressions below only display a value in an interactive session.
sorted(vectorizer.get_stop_words())[:50]
# Roughly how many are there? 318 (original author's note)
len(vectorizer.get_stop_words())

# De-duplicating words with the same meaning; needs an extra package to be
# downloaded... apparently not after all.
# The proper name for this is stemming.
from nltk import stem
english_stemmer = stem.SnowballStemmer('english')  # many stemmers exist; use Snowball for English
english_stemmer.stem('imaging')
english_stemmer.stem('image')
english_stemmer.stem('imagine')
english_stemmer.stem('buys')
english_stemmer.stem('buying')
english_stemmer.stem('bought')
# - **Why:** They don't tell you much about your text # show vectorizer options vect # - **stop_words:** string {'english'}, list, or None (default) # - If 'english', a built-in stop word list for English is used. # - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. # - If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms. # remove English stop words vect = CountVectorizer(stop_words='english') tokenize_test(vect) # set of stop words print vect.get_stop_words() # ## Part 5: Other CountVectorizer Options # - **max_features:** int or None, default=None # - If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. # remove English stop words and only keep 100 features vect = CountVectorizer(stop_words='english', max_features=100) tokenize_test(vect) # all 100 features print vect.get_feature_names() # include 1-grams and 2-grams, and limit the number of features vect = CountVectorizer(ngram_range=(1, 2), max_features=100000) tokenize_test(vect)
from sklearn.feature_extraction.text import CountVectorizer
from termcolor import colored
import os

# -------start part different from i3covert_rawtext_to_bagOfwords.py ------
# Same vectorizer as in the sibling script, but with the built-in English
# stop words removed; show the first 20 of them.
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print(colored(sorted(vectorizer.get_stop_words())[0:20], 'blue'))
# ------- end part different from i3covert_rawtext_to_bagOfwords.py ------

print('dir(vectorizer)=', dir(vectorizer), '\nvectorizer=', vectorizer, '\n')

# Two-sentence warm-up corpus: print its vocabulary and the transposed
# count matrix.
content = ["How to format my hard disk", "Hard disk format problems "]
X = vectorizer.fit_transform(content)
print('vectorizer.get_feature_names()=', vectorizer.get_feature_names())
print(X.toarray().transpose())
print(colored('*' * 25, 'red'))

from i2utils import DATA_DIR

TOY_DIR = os.path.join(DATA_DIR, "toy")

# Read every file of the toy directory.  Each file is opened in a ``with``
# block so its handle is closed right away instead of being left to the
# garbage collector.
posts = []
for toy_name in os.listdir(TOY_DIR):
    with open(os.path.join(TOY_DIR, toy_name)) as toy_file:
        posts.append(toy_file.read())
print('posts:', posts)

# Re-fit the vectorizer on the toy posts and report the matrix shape.
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #featues: %d" % (num_samples, num_features))
print('vectorizer.get_feature_names()=', vectorizer.get_feature_names())
# show vectorizer options vect # - **stop_words:** string {'english'}, list, or None (default) # - If 'english', a built-in stop word list for English is used. # - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. # - If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms. # remove English stop words vect = CountVectorizer(stop_words='english') tokenize_test(vect) # set of stop words print vect.get_stop_words() # ## Part 4: Other CountVectorizer Options # - **max_features:** int or None, default=None # - If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. # remove English stop words and only keep 100 features vect = CountVectorizer(stop_words='english', max_features=100) tokenize_test(vect) # all 100 features print vect.get_feature_names()
# Python 2 script excerpt: collect the feature lists of the unigram, bigram
# and combined vectorizers and compute TF-IDF matrices for each.
# vectorizer_bigram, X_unigram, X_bigram and the *_unigram feature
# variables are defined outside this excerpt.
number_of_features_bigram = len(vectorizer_bigram.get_feature_names())
list_of_features_bigram = vectorizer_bigram.get_feature_names()

X = vectorizer.fit_transform(corpus)
#corpus_array = X.toarray()
number_of_features = len(vectorizer.get_feature_names())
list_of_features = vectorizer.get_feature_names()
# The two values just computed are immediately replaced by the
# unigram + bigram totals.
number_of_features = number_of_features_unigram + number_of_features_bigram
list_of_features = list_of_features_unigram + list_of_features_bigram

print "list_of_features:"
print list_of_features
print "list of features:%d" % number_of_features
print "#######vectorizer stop words############"
print vectorizer.get_stop_words()
print "#######vocabulary########"
print vectorizer.vocabulary_

# NOTE(review): transformer_unigram / transformer_bigram are not used again
# in this excerpt; the tfidf_unigram / tfidf_bigram objects below do the
# per-model work.
transformer_unigram = TfidfTransformer(norm='', smooth_idf=True)
transformer_bigram = TfidfTransformer(norm='', smooth_idf=True)
transformer = TfidfTransformer(norm='', smooth_idf=True)
tfidf = transformer.fit_transform(X.toarray())
tfidf_array = tfidf.toarray()

tfidf_unigram = TfidfTransformer(norm='', smooth_idf=True)
tfidf_bigram = TfidfTransformer(norm='', smooth_idf=True)
tfidf_unigram_array = tfidf_unigram.fit_transform(X_unigram)
tfidf_bigram_array = tfidf_bigram.fit_transform(X_bigram)
# Python 2 script excerpt: split the samples by index, fit count and TF-IDF
# models on the training part only and apply them to the other part.
train_indices = indices[:train_count]
validate_indices = indices[train_count:]
train_samples = samples.iloc[train_indices]
validate_samples = samples.iloc[validate_indices]
print "train sample count {}, validate sample count {}".format(
    len(train_samples), len(validate_samples))

# get tf-idf vector
print 'fitting train samples'
count_vect = CountVectorizer(stop_words='english', max_df=1.0)
count_vect.fit(train_samples['Text'])
print count_vect.get_feature_names()
print count_vect.get_stop_words()

X_train_counts = count_vect.transform(train_samples['Text'])
# TEST selects which held-out set is transformed: the external test samples
# or the validation split made above.
if TEST:
    X_test_counts = count_vect.transform(test_samples['Text'])
else:
    X_validate_counts = count_vect.transform(validate_samples['Text'])

# The TF-IDF weights are learned from the training counts only.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
if TEST:
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
else:
    X_validate_tfidf = tfidf_transformer.transform(X_validate_counts)
print 'feature size {}'.format(X_train_tfidf.shape)
# 단어 사전 mat = vectorizer.fit(sentences) print(type(mat)) print(mat.vocabulary_) # 알파벳 순으로 index가 붙는다 print(sorted(mat.vocabulary_.items())) # 토큰 features = vectorizer.get_feature_names() print(type(features)) print(features) print('불용어') print(vectorizer.get_stop_words()) sentence = [sentences[0]] print('sentence: ', sentence) myarray = vectorizer.transform(sentence).toarray() print(type(myarray)) print('myarray: ', myarray) """ sentence: ['우리 아버지 여자 친구 이름은 홍길동 홍길동'] 단어 사전: {'여자': 0, '이름은': 1, '홍길동': 2} myarray: [[1 1 2]] -> sentence에 단어 사전의 토큰 중 '여자' 1번, '이름은' 1번, '홍길동' 2번 포함되어 있다는 것을 ndarray로 리턴 """
# Interactive-session style excerpt: bare expressions only display a value
# when run in a REPL or notebook.
import pandas as pd
# Show the matrix X (fitted outside this excerpt) as a labelled table.
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
corpus

vec = TfidfVectorizer()
# NOTE(review): ``sample`` is not defined in this excerpt; confirm it
# exists at this point.
X = vec.fit_transform(sample)
vec.get_feature_names()
# Re-fit on the four-sentence corpus defined above.
X =vec.fit_transform(corpus)
vec.get_feature_names()
pd.DataFrame(X.toarray(),columns = vec.get_feature_names())
# No stop_words option was passed to this vectorizer.
vec.get_stop_words()


def savePrint():
    """Print a fixed greeting."""
    print("Hello fro VIM")
class GeneralCountVectorizer(_GeneralBaseVectorizer):
    """Count vectorizer that optionally maps its input through the base class first.

    Wraps a ``CountVectorizer`` (``self.vec``).  When the base class reports
    clusters (``self._is_clusters``), the input is passed through
    ``self._map`` before being handed to the wrapped vectorizer.
    """

    def __init__(self,
                 clusters=None,
                 vocabulary=None,
                 max_features=None,
                 ngram_range=(1, 1),
                 stop_words=None,
                 token_pattern=r"(?u)\b\w[\w']+\b",
                 analyzer='word',
                 max_df=1.0,
                 min_df=1,
                 dim=100):
        """Create the wrapped ``CountVectorizer``.

        :param clusters: cluster mapping handed to the base class; ``None``
            (the default) means an empty mapping.
        :param vocabulary: handed to the base class; the wrapped vectorizer
            uses ``self.vocabulary``.
        :param dim: handed to the base class; used as the column count of
            the empty result in :meth:`transform`.
        :param max_features, ngram_range, stop_words, token_pattern,
            analyzer, max_df, min_df: forwarded to ``CountVectorizer``.
        """
        # A fresh dict per instance: a ``{}`` default would be one object
        # shared by every instance created without ``clusters``.
        if clusters is None:
            clusters = {}

        # base constructor
        super(GeneralCountVectorizer, self).__init__(clusters, vocabulary, dim)

        # count vectorizer
        self.vec = CountVectorizer(max_features=max_features,
                                   ngram_range=ngram_range,
                                   stop_words=stop_words,
                                   token_pattern=token_pattern,
                                   analyzer=analyzer,
                                   vocabulary=self.vocabulary,
                                   max_df=max_df,
                                   min_df=min_df)

    def __repr__(self):
        """Return ``ClassName(<params>)`` built from :meth:`get_params`."""
        class_name = self.__class__.__name__
        return '%s(%s)' % (
            class_name,
            _pprint(
                self.get_params(deep=False),
                offset=len(class_name),
            ),
        )

    def get_params(self, deep=False):
        """Return the parameter dict, currently always empty.

        NOTE(review): the intended implementation was disabled by the
        original author and is kept here for reference:
            out = {**self.base_params, **self.vec.get_params(deep=deep)}
        """
        out = {}
        return out

    def get_feature_names(self):
        """Return the feature names of the wrapped vectorizer."""
        return self.vec.get_feature_names()

    def get_stop_words(self):
        """Return the stop words of the wrapped vectorizer."""
        return self.vec.get_stop_words()

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on ``X`` (mapped first if clusters are set).

        :param y: ignored.
        :return: ``self``.
        """
        X_mapped = self._map(X) if self._is_clusters else X
        self.vec.fit(X_mapped)
        return self

    def transform(self, X):
        """Transform ``X`` with the wrapped vectorizer.

        :return: the wrapped vectorizer's output, or an empty
            ``(0, self.dim)`` array when ``X`` has no elements.
        """
        if not get_len(X):
            # X is an empty of elements, return an empty np.ndarray
            return np.empty((0, self.dim))

        X_mapped = self._map(X) if self._is_clusters else X
        return self.vec.transform(X_mapped)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its transformation."""
        self.fit(X, y=y)
        return self.transform(X)

    def inverse_transform(self, X):
        """Delegate to the wrapped vectorizer's ``inverse_transform``."""
        return self.vec.inverse_transform(X)
class Tfidf:
    """Compute similarity measures between speeches and paragraphs.

    The work is done in several steps:
      - tokenisation (and optionally lemmatisation) of every document
        (speeches and paragraphs);
      - construction of the vocabulary, counting every word;
      - computation of several values: df, idf, tf, tfidf;
      - cosine measures between the tfidf of speeches and of paragraphs;
      - additional information about the computed measures.
    """

    def __init__(self, paragraphe, speech):
        """Store the input documents.

        Args:
            paragraphe: list of paragraph texts.
            speech: list of speech texts.

        ``self.set`` holds all texts: paragraphs first, then speeches.
        """
        self.paragraphe = paragraphe
        self.speech = speech
        self.set = list(self.paragraphe)  # copy: the caller's list is not extended
        self.set.extend(self.speech)

    def count(self, traitement=None):
        """Build the vocabulary and count the words of every document.

        Args:
            traitement: when equal to ``"lemmatize"``, the tokenizer from
                ``stem`` (``stem.LemmaTokenizer``) is used; otherwise the
                default tokenizer of sklearn's ``CountVectorizer`` is used
                (no lemmatisation).

        Results:
            self.tfidf_matrix: sparse matrix holding, for each document and
                each word, the number of occurrences of the word.
            self.vocabulary: the vectorizer's ``vocabulary_`` mapping.
            self.stop_words: the value of the vectorizer's
                ``get_stop_words()``.
        """
        if traitement == "lemmatize":
            self.tokenizer = stem.LemmaTokenizer()
        else:
            self.tokenizer = None
        self.tfidf_vectorizer = CountVectorizer(strip_accents='unicode',
                                                stop_words='english',
                                                tokenizer=self.tokenizer)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.set)
        self.vocabulary = self.tfidf_vectorizer.vocabulary_
        self.stop_words = self.tfidf_vectorizer.get_stop_words()

    def do_tf(self, ponderation):
        """Compute the (context-augmented) term frequency.

        The tf of a word in a document is its number of occurrences in that
        document, i.e. the content of ``self.tfidf_matrix``. This method can
        also take context into account: the counts of the neighbouring
        documents are added with a weight.

        Args:
            ponderation: weights applied symmetrically around the current
                document, which itself always has weight 1. For example
                ``[0.8, 0.2]`` gives
                ``tf[i] + 0.8*tf[i+1] + 0.8*tf[i-1] + 0.2*tf[i-2] + 0.2*tf[i+2]``.
                Pass ``[]`` to ignore context.

        Neighbours are only taken inside the same group (paragraphs with
        paragraphs, speeches with speeches). The result is in ``self.tf``.
        """
        self.tf = sp.lil_matrix(self.tfidf_matrix, dtype=float)
        n_terms = len(self.vocabulary)
        id_set = 0  # row of the current document in self.set / the matrices
        for s in (self.paragraphe, self.speech):
            for j in range(len(s)):
                for i in range(n_terms):
                    for k, p in enumerate(ponderation):
                        offset = k + 1
                        if j - offset >= 0:
                            self.tf[id_set, i] += p * float(self.tfidf_matrix[id_set - offset, i])
                        if j + offset < len(s):
                            self.tf[id_set, i] += p * float(self.tfidf_matrix[id_set + offset, i])
                id_set += 1

    # tfidf computation
    def do_df(self):
        """Compute the document frequency.

        The df of a word is the number of documents in which it appears: for
        each document we check whether the word's count is non-zero.
        ``self.df`` is a ``Counter`` keyed by vocabulary index.
        """
        self.df = Counter()
        n_docs = len(self.set)
        for i in self.vocabulary.values():
            for j in range(n_docs):
                if self.tfidf_matrix[j, i] != 0:
                    self.df[i] += 1

    def do_idf(self):
        """Compute the idf: ``log10(number of documents / df)`` per word.

        ``self.idf`` is a list positioned by vocabulary index, because
        ``do_tfidf`` reads it as ``self.idf[i]``. Building it from
        ``self.df.values()`` would follow the counter's own ordering, which
        need not match the indices.
        """
        n_docs = len(self.set)
        self.idf = [numpy.log10(n_docs / float(self.df[i]))
                    for i in range(len(self.vocabulary))]

    def do_tfidf(self):
        """Compute the tfidf: ``tf * idf`` for each word and document.

        ``self.tfidf`` is a list (one entry per document) of lists (one
        value per vocabulary index).
        """
        n_terms = len(self.vocabulary)
        self.tfidf = [[0.] * n_terms for _ in range(len(self.set))]
        for j in range(len(self.set)):
            for i in self.vocabulary.values():
                self.tfidf[j][i] = (self.tf[j, i] * self.idf[i])

    # Computation variants
    def do_idf_variante(self):
        """Alternative idf: ``log((n_documents + 1) / (df + 1)) + 1``.

        According to the original author this is the variant sklearn uses
        when no norm is requested (``norm=None``). The list is positioned by
        vocabulary index, like ``do_idf``.
        """
        n_docs = len(self.set)
        self.idf = [numpy.log((n_docs + 1.0) / float(self.df[i] + 1.0)) + 1.0
                    for i in range(len(self.vocabulary))]

    def do_idf_original(self):
        """Compute the idf directly with sklearn's ``TfidfTransformer``.

        The original author states the result equals ``do_idf_variante``;
        computing it by hand gives more control over the variant used.
        """
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(self.tfidf_matrix)
        self.idf = tfidf_transformer.idf_

    def do_tfidf_original(self):
        """Compute the tfidf directly with sklearn's ``TfidfVectorizer``.

        This takes a few lines (tokenisation, counting and intermediate
        computations included), but it does not handle context and offers no
        choice of computation variant.
        """
        tfidf_v = TfidfVectorizer(strip_accents='unicode', stop_words='english', norm=None)
        self.tfidf = tfidf_v.fit_transform(self.set)

    # Cosine measure
    def mesure(self):
        """Compute cosine similarities between speeches and paragraphs.

        Result:
            self.similarite: dict such that
                ``self.similarite[idSpeech][idParagraphe] = similarity``.
        """
        n_paragraphes = len(self.paragraphe)
        # rows: speeches (second part of self.set); columns: paragraphs
        cosine_liste = cosine_similarity(self.tfidf[n_paragraphes:],
                                         self.tfidf[:n_paragraphes])
        self.similarite = {}
        for i, cosine in enumerate(cosine_liste):
            for j, value in enumerate(cosine):
                self.similarite.setdefault(i, {})[j] = value

    # Information
    def do_infoMesure(self):
        """Compute statistics about the similarity measures, per speech.

        Fills ``self.moyenne`` (mean), ``self.ecartType`` (population
        standard deviation, ``sqrt(sum((v - mean)**2) / n)``) and
        ``self.percentZero`` (percentage of similarities equal to zero).
        """
        self.moyenne = {}
        self.ecartType = {}
        self.percentZero = {}
        for id_speech, speech in self.similarite.items():
            valeurs = list(speech.values())
            taille = float(len(valeurs))
            moyenne = sum(valeurs, 0.) / taille
            nb_zero = sum(1 for v in valeurs if v == 0)
            self.moyenne[id_speech] = moyenne
            self.percentZero[id_speech] = float(nb_zero * 100) / taille
            # Divide by n *inside* the square root: sqrt(sum) / n is not a
            # standard deviation.
            variance = sum(((v - moyenne) ** 2 for v in valeurs), 0.) / taille
            self.ecartType[id_speech] = numpy.sqrt(variance)

    def do_matchingWords(self):
        """Find the words shared by each speech/paragraph pair.

        For each pair, a word matches when the product of its tfidf values
        in the two documents is strictly positive; that product is stored as
        ``self.matchingWords[idSpeech][idParagraphe][idWord]``. Note that
        this per-word measure is not the same method as the per-document
        cosine similarity.
        """
        n_paragraphes = len(self.paragraphe)
        n_terms = len(self.vocabulary)
        self.matchingWords = {}
        for j, speech_row in enumerate(self.tfidf[n_paragraphes:]):
            self.matchingWords[j] = {}
            for i, paragraphe_row in enumerate(self.tfidf[:n_paragraphes]):
                communs = {}
                for w in range(n_terms):
                    value = speech_row[w] * paragraphe_row[w]
                    if value > 0.:
                        communs[w] = value
                self.matchingWords[j][i] = communs

    def do_match(self, n=None):
        """Sort the similarities of each speech and keep the n best.

        ``self.match[idSpeech]`` is a list of ``(idParagraphe, similarity)``
        pairs in ascending order of similarity (ties broken by paragraph
        id), so the best matches come last. When ``n`` is falsy, all pairs
        are kept.
        """
        self.match = {}
        for i in range(len(self.speech)):
            # kv is an (idParagraphe, similarity) pair
            tries = sorted(self.similarite[i].items(), key=lambda kv: (kv[1], kv[0]))
            self.match[i] = tries[-n:] if n else tries

    # Execution
    def go(self, ponderation, n=None, lemmatizer=None):
        """Run every step needed to compute the similarity measures.

        Args:
            ponderation: context weights (see ``do_tf``).
            n: number of best-matching paragraphs kept per speech (see
                ``do_match``).
            lemmatizer: value passed to ``count`` as ``traitement``.
        """
        self.count(lemmatizer)
        # Single-argument print(...) calls behave identically on Python 2 and 3.
        print("")
        print("tf\n")
        self.do_tf(ponderation)
        print("df\n")
        self.do_df()
        print("idf\n")
        self.do_idf()
        print("tfidf\n")
        self.do_tfidf()
        print("cosine\n")
        self.mesure()
        print("info\n")
        self.do_match(n)
        print("")
        self.do_infoMesure()
        self.do_matchingWords()
pass else: dict_int[idx] = item int_dict[item] = idx idx += 1 df['int_labels'] = df['label'].map(int_dict) print(df.loc[0]) y = df['int_labels'].values count_vectorizer = CountVectorizer(decode_error='ignore', stop_words='english', max_df=0.2) x = count_vectorizer.fit_transform(df['data']) print(count_vectorizer.get_stop_words()) tfidf = TfidfVectorizer(decode_error='ignore', stop_words='english', max_df=0.8) x = tfidf.fit_transform(df['data']) model = MultinomialNB() model.fit(x, y) data_vectorizer = CountVectorizer( vocabulary=count_vectorizer.get_feature_names()) print(count_vectorizer.get_feature_names()) xml_x = data_vectorizer.fit_transform(txt_array) data_tfidf = TfidfVectorizer(vocabulary=tfidf.get_feature_names())
N_TOPICS = 16

# Load the corpus and hold out a test split.
print('reading data...')
dataset = fetch_20newsgroups(shuffle=False,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
train_docs, test_docs = train_test_split(data_samples, random_state=42)

# Bag-of-words counts; the vocabulary is learned from the training split only.
print('priparing Count Vectorizer')
tf_vectorizer = CountVectorizer(max_df=1.0, stop_words='english')
X_train = tf_vectorizer.fit_transform(train_docs)
X_test = tf_vectorizer.transform(test_docs)
feature_names = tf_vectorizer.get_feature_names()
tf_vectorizer.get_stop_words()  # result is discarded

print('Splitting test documents...')
X_test_train, X_test_test = rowwise_train_test_split(X_test,
                                                     random_seed=114514)

# Fit the model and time it; the timed region includes the normalisation.
print('Start fitting sk-learn model...')
start = time()
vb_model = LDA_vb(n_components=N_TOPICS)
vb_model.fit(X_train)
# Divide each row of components_ by its own sum.
component_row_sums = vb_model.components_.sum(axis=1)[:, np.newaxis]
phi_vb = vb_model.components_ / component_row_sums
end = time()
print('done in {:.2f} seconds'.format(end - start))
# NOTE(review): the next two statements look like the tail of a search loop
# whose beginning is outside this excerpt (compare the complete loop further
# down); they are kept verbatim and in order -- confirm their nesting.
best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())


def dist_norm(v1, v2):
    """Return the norm of the difference of the two normalised vectors.

    Each argument is divided by its own norm before the difference is taken.
    Both must provide ``.toarray()``, as the rows returned by
    ``X_train.getrow(i)`` below do.
    """
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())


# Find the stored post closest to new_post, skipping exact duplicates.
best_doc = None
# sys.maxsize exists on both Python 2 and 3; sys.maxint is Python 2 only.
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    # Single-argument print(...) is valid on both Python 2 and 3.
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

vectorizer = CountVectorizer(min_df=1, stop_words='english')
print(sorted(vectorizer.get_stop_words())[0:10])
# - **Why:** They probably don't tell you much about your text

# show vectorizer parameters
vect

# - **stop_words:** string {'english'}, list, or None (default)
#     - If 'english', a built-in stop word list for English is used.
#     - If a list, that list is assumed to contain stop words, all of which
#       will be removed from the resulting tokens.
#     - If None, no stop words will be used.

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# examine the stop words
english_stop_words = sorted(vect.get_stop_words())
print(english_stop_words)

# - **max_df:** float in range [0.0, 1.0] or int, default=1.0
#     - When building the vocabulary, ignore terms that have a document
#       frequency strictly higher than the given threshold (corpus-specific
#       stop words).
#     - If float, the parameter represents a proportion of documents.
#     - If integer, the parameter represents an absolute count.

# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)
tokenize_test(vect)

# - **stop\_words\_:** Terms that were ignored because they either:
#     - occurred in too many documents (max_df)
#     - occurred in too few documents (min_df)
#     - were cut off by feature selection (max_features)
from sklearn.feature_extraction.text import CountVectorizer

# Four-sentence toy corpus.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Fit a count vectorizer, then look at what it learned. The bare expressions
# below have no effect when run as a script.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
vectorizer.vocabulary_
vectorizer.stop_words_
vectorizer.get_stop_words()
vectorizer.get_feature_names()

# Encode the same corpus with the fitted vocabulary.
X = vectorizer.transform(corpus)
type(X)
X.toarray()

from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus again, this time fitted with a tf-idf vectorizer.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
from sklearn.feature_extraction.text import CountVectorizer
from termcolor import colored
import os

# -------start part different from i3covert_rawtext_to_bagOfwords.py ------
vectorizer = CountVectorizer(min_df=1, stop_words='english')
# First 20 stop words in sorted order, printed in blue.
print(colored(sorted(vectorizer.get_stop_words())[0:20], 'blue'))
# ------- end part different from i3covert_rawtext_to_bagOfwords.py ------

print('dir(vectorizer)=',dir(vectorizer),'\nvectorizer=', vectorizer,'\n')

# Two-sentence example: fit, then print the features and the transposed
# count matrix.
content = ["How to format my hard disk", "Hard disk format problems "]
X = vectorizer.fit_transform(content)
print('vectorizer.get_feature_names()=',vectorizer.get_feature_names())
print(X.toarray().transpose())

print(colored('*'*25, 'red'))

from i2utils import DATA_DIR
TOY_DIR = os.path.join(DATA_DIR, "toy")

# Read every file in TOY_DIR. ``with`` closes each handle as soon as it has
# been read, instead of leaving open files for the garbage collector.
posts = []
for f in os.listdir(TOY_DIR):
    with open(os.path.join(TOY_DIR, f)) as post_file:
        posts.append(post_file.read())
print('posts:', posts)

# Refit the same vectorizer on the toy posts.
X_train = vectorizer.fit_transform(posts)