def plot_term_frequency_no_stop(subset):
    ##############################################
    # Visualize stopwords removal
    ##############################################
    vectorizer = CountVectorizer(stop_words='english')
    docs = vectorizer.fit_transform(corpus.title_words(fileids=subset))
    features = vectorizer.get_feature_names_out()

    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    visualizer.show()


def plot_term_frequency(subset):
    ##############################################
    # Visualize frequency distribution of top 50 tokens
    ##############################################
    vectorizer = CountVectorizer()
    docs = vectorizer.fit_transform(corpus.title_words(fileids=subset))
    features = vectorizer.get_feature_names_out()

    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    visualizer.show()
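The two helpers above assume a corpus object exposing title_words(fileids=...). For a quick, self-contained check of the same CountVectorizer + FreqDistVisualizer pattern, a minimal sketch with made-up sample strings (everything below is illustrative, not part of the original project) looks like this:

from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

# Made-up stand-in for corpus.title_words(...): any iterable of raw strings works.
sample_titles = [
    "getting started with text analysis in python",
    "visualizing token frequency with yellowbrick",
    "stopword removal and frequency distributions",
]

vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(sample_titles)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.show()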
Example #3
    @classmethod
    def hist_tokens_texts(cls, texts_dir, vectorizer, ext='txt'):
        # Read the raw text documents from texts_dir; the regex patterns treat
        # the top-level folder name as each document's category.
        reader = CorpusReader(input_folder_name=texts_dir,
                              doc_pattern=r'(.*?/).*\.' + ext,
                              categ_pattern=r'(.*?)/.*\.' + ext,
                              encoding='utf-8')
        texts = list(reader.readfiles(fileids=reader.root_ids))

        # Build the document-term matrix with the caller-supplied vectorizer.
        docs = vectorizer.fit_transform(texts)
        features = vectorizer.get_feature_names_out()

        # Plot the token frequency distribution at 1080x720 pixels.
        visualizer = FreqDistVisualizer(features=features, size=(1080, 720))
        visualizer.fit(docs)
        visualizer.show()
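CorpusReader here is project-specific, so as a rough sketch of the same technique without it, a pathlib walk over a folder of .txt files can stand in (the data/texts path and layout are hypothetical):

from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

# Hypothetical layout: data/texts/<category>/<document>.txt
texts = [path.read_text(encoding='utf-8') for path in Path('data/texts').rglob('*.txt')]

vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(texts)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features, size=(1080, 720))
visualizer.fit(docs)
visualizer.show()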
Example #4
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer


def freqdist(docs, outpath, corpus_kwargs=None, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Vectorize the corpus
    vectorizer = CountVectorizer(**(corpus_kwargs or {}))
    docs = vectorizer.fit_transform(docs)
    features = vectorizer.get_feature_names_out()

    # Visualize the frequency distribution on the axes created above,
    # saving the figure to outpath
    visualizer = FreqDistVisualizer(features=features, ax=ax, **kwargs)
    visualizer.fit(docs)
    visualizer.show(outpath=outpath)
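A hedged usage sketch for the helper above; the document list, output filename, and n value are made up for illustration:

# Hypothetical call: drop English stopwords and write the plot to disk
# instead of showing it; n=10 is forwarded to FreqDistVisualizer.
freqdist(
    ["the quick brown fox", "the lazy dog sleeps", "quick quick fox"],
    outpath="freqdist.png",
    corpus_kwargs={"stop_words": "english"},
    n=10,
)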
Example #5
File: freqdist.py  Project: TinaCloud/atap
    # Return the data bunch so it can be used like the sklearn newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


# Visualize frequency distribution of top 50 tokens
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.show()

# Visualize stopwords removal
vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.show()

# Visualize different subcorpora
hobby_types = {}
Example #6
File: freqdist.py  Project: yokeyong/atap
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


corpus = load_corpus('hobbies')

# Visualize frequency distribution of top 50 tokens
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.show()

# Visualize stopwords removal
vectorizer = CountVectorizer(stop_words='english')
docs = vectorizer.fit_transform(corpus.data)
features = vectorizer.get_feature_names_out()

visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.show()

# Visualize different subcorpora
hobby_types = {}
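Examples #5 and #6 both break off right after hobby_types = {}. As a purely illustrative continuation (not the original projects' code), one way to finish the subcorpora step is to bucket the documents by their target label from the Bunch returned above and draw one frequency plot per hobby category:

from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

# Illustrative only: group documents by category label, then plot each group.
for text, label in zip(corpus.data, corpus.target):
    hobby_types.setdefault(label, []).append(text)

for label, texts in hobby_types.items():
    vectorizer = CountVectorizer(stop_words='english')
    docs = vectorizer.fit_transform(texts)
    features = vectorizer.get_feature_names_out()

    visualizer = FreqDistVisualizer(features=features, title=label)
    visualizer.fit(docs)
    visualizer.show()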