import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import Bunch
from yellowbrick.text import FreqDistVisualizer


def plot_term_frequency_no_stop(subset):
    ##############################################
    # Visualize stopwords removal
    ##############################################
    # `corpus` is a module-level reader exposing title_words()
    vectorizer = CountVectorizer(stop_words='english')
    docs = vectorizer.fit_transform(corpus.title_words(fileids=subset))
    features = vectorizer.get_feature_names()

    visualizer = FreqDistVisualizer(features)
    visualizer.fit(docs)
    visualizer.poof()

def plot_term_frequency(subset):
    ##############################################
    # Visualize frequency distribution of top 50 tokens
    ##############################################
    vectorizer = CountVectorizer()
    docs = vectorizer.fit_transform(corpus.title_words(fileids=subset))
    features = vectorizer.get_feature_names()

    visualizer = FreqDistVisualizer(features)
    visualizer.fit(docs)
    visualizer.poof()

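# Hypothetical usage of the two helpers above; a minimal sketch, not part of
# the original snippets. The module-level `corpus` reader and the fileid
# subset are assumptions: any reader exposing fileids() and
# title_words(fileids=...) would work.
subset = [fileid for fileid in corpus.fileids() if fileid.startswith('news/')]

plot_term_frequency(subset)          # raw top-50 token frequencies
plot_term_frequency_no_stop(subset)  # the same view with English stopwords removed
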
def hist_tokens_texts(cls, texts_dir, vectorizer, ext='txt'):
    # `cls` as the first argument suggests this is defined as a
    # @classmethod on a class in the surrounding project.
    # CorpusReader is the project's own reader, not the NLTK class.
    reader = CorpusReader(
        input_folder_name=texts_dir,
        doc_pattern=r'(.*?/).*\.' + ext,
        categ_pattern=r'(.*?)/.*\.' + ext,
        encoding='utf-8',
    )
    texts = list(reader.readfiles(fileids=reader.root_ids))

    docs = vectorizer.fit_transform(texts)
    features = vectorizer.get_feature_names()

    visualizer = FreqDistVisualizer(features=features, size=(1080, 720))
    visualizer.fit(docs)
    visualizer.show()

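# Hypothetical invocation; `TextStats` is a placeholder name for the class
# that owns this method, and the directory layout (one subfolder per
# category of plain-text files) is an assumption.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
TextStats.hist_tokens_texts('data/texts', vectorizer, ext='txt')
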
def freqdist(docs, outpath, corpus_kwargs=None, **kwargs):
    # Avoid a shared mutable default argument
    corpus_kwargs = corpus_kwargs or {}

    # Create a new figure and axes for the visualizer to draw on
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Vectorize the corpus
    vectorizer = CountVectorizer(**corpus_kwargs)
    docs = vectorizer.fit_transform(docs)
    features = vectorizer.get_feature_names()

    # Visualize the frequency distribution on the axes created above
    visualizer = FreqDistVisualizer(features=features, ax=ax, **kwargs)
    visualizer.fit(docs)
    visualizer.poof(outpath=outpath)

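# Hypothetical call; the documents, output path, and keyword values are
# illustrative. corpus_kwargs is forwarded to CountVectorizer and the
# remaining kwargs to FreqDistVisualizer.
documents = [
    "the quick brown fox jumps over the lazy dog",
    "a quick study of term frequency",
    "frequency distributions of brown corpus terms",
]
freqdist(
    documents,
    "freqdist.png",
    corpus_kwargs={"stop_words": "english"},
    n=25,  # plot the 25 most frequent terms
)
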
# Return the data bunch for use similar to the newsgroups example
# (tail of a load_corpus-style loader; the preceding body is not shown)
return Bunch(
    categories=categories,
    files=files,
    data=data,
    target=target,
)


# Visualize frequency distribution of top 50 tokens
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features)
visualizer.fit(docs)
visualizer.poof()

# Visualize stopwords removal
vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features)
visualizer.fit(docs)
visualizer.poof()

# Visualize different subcorpora
hobby_types = {}

# (tail of the load_corpus function invoked below)
return Bunch(
    categories=categories,
    files=files,
    data=data,
    target=target,
)


corpus = load_corpus('hobbies')

# Visualize frequency distribution of top 50 tokens
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features)
visualizer.fit(docs)
visualizer.poof()

# Visualize stopwords removal
vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names()

visualizer = FreqDistVisualizer(features)
visualizer.fit(docs)
visualizer.poof()

# Visualize different subcorpora
hobby_types = {}
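
# A plausible continuation for "Visualize different subcorpora": group the
# documents by category and draw one frequency plot per hobby. This is a
# sketch assuming the Bunch fields shown above (categories, data, target,
# with target holding a category label per document); it is not part of
# the original snippet.
for category in corpus.categories:
    hobby_types[category] = [
        text for text, target in zip(corpus.data, corpus.target)
        if target == category
    ]

for category, texts in hobby_types.items():
    vectorizer = CountVectorizer(stop_words='english')
    docs = vectorizer.fit_transform(texts)

    visualizer = FreqDistVisualizer(vectorizer.get_feature_names())
    visualizer.fit(docs)
    visualizer.poof()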