def draw_cloud(dataframe, column):
    """Render the text values of *column* in *dataframe* as a word-cloud image.

    Parameters
    ----------
    dataframe : a DataFrame-like mapping; ``dataframe[column]`` must be an
        iterable of strings (e.g. processed titles).
    column : name/key of the text column to visualize.

    Returns
    -------
    A PIL image of the rendered word cloud (``WordCloud.to_image()``).
    """
    # Join the processed titles into a single comma-separated string.
    # str.join accepts any iterable, so no intermediate list() is needed.
    long_string = ','.join(dataframe[column])
    # Create a WordCloud object (WordCloud must be imported at module level).
    wordcloud = WordCloud(background_color="white", max_words=5000,
                          contour_width=6, contour_color='steelblue')
    # Generate the word cloud from the combined text.
    wordcloud.generate(long_string)
    # Return the rendered image for display/saving by the caller.
    return wordcloud.to_image()
# --- Example #2 ---
def create_word_cloud(x="result.png"):
    """Read Persian text from the file 'word_repeat_word_cloud', render it as
    a word cloud, display it, and save the image to *x*.

    Parameters
    ----------
    x : str
        Output image path (default "result.png").
    """
    # ``with`` guarantees the file is closed even if rendering raises;
    # the original only closed it on the success path.
    with open("word_repeat_word_cloud", encoding="utf8") as f:
        text = f.read()

    # Register all extra Persian stop words in a single call instead of 24
    # separate ``stopwords = add_stop_words([...])`` reassignments.
    # NOTE(review): if add_stop_words returns a fresh set per call (rather
    # than mutating a shared global), the original kept only the LAST word;
    # a single call with the full list is correct either way. Duplicate
    # entry 'کند' from the original has been dropped.
    extra_stop_words = [
        'نیست', 'هست', 'می‌کنیم', 'کردند', 'کنید', 'می‌کنند', 'کردم',
        'کردیم', 'داریم', 'کرده', 'کرد', 'می‌کند', 'می‌کنم', 'هستیم',
        'کردید', 'کنیم', 'کنند', 'باشیم', 'کند', 'می‌شود', 'می‌شویم',
        'می‌شوید', 'اینها',
    ]
    stopwords = add_stop_words(extra_stop_words)

    # Generate a word cloud image restricted to Persian tokens.
    wordcloud = PersianWordCloud(only_persian=True,
                                 max_words=300,
                                 margin=0,
                                 width=1000,
                                 height=1000,
                                 min_font_size=1,
                                 collocations=False,
                                 max_font_size=500,
                                 stopwords=stopwords,
                                 background_color="black").generate(text)
    # Display the generated image, then persist it to disk.
    image = wordcloud.to_image()
    image.show()
    image.save(x)
# --- Example #3 ---
def genwordcloud(fulltext):
    """Render *fulltext* as a word-cloud PNG and return the raw image bytes.

    Parameters
    ----------
    fulltext : str
        Text to feed to WordCloud.generate().

    Returns
    -------
    bytes | None
        PNG-encoded image on success; None if generation fails (the error
        is logged to stderr).
    """
    try:
        wordcloud = WordCloud(max_font_size=60,
                              max_words=30,
                              background_color="white",
                              collocations=False).generate(fulltext)
        image = wordcloud.to_image()
        # Encode to PNG in memory so the caller gets bytes, not a file.
        output = io.BytesIO()
        image.save(output, format="PNG")
        print(hostname,
              now(),
              '/wordcloud: Generated wordcloud image.',
              file=sys.stderr)
        return output.getvalue()
    except Exception:
        # ``except Exception`` instead of a bare ``except:`` so that
        # KeyboardInterrupt/SystemExit still propagate to the caller.
        print(hostname,
              now(),
              '/wordcloud: Error generating wordcloud image.',
              file=sys.stderr)
        return None
# --- Example #4 ---
# Import the wordcloud library
import wordcloud

# Join the different processed titles into one space-separated string.
# str.join accepts the column directly; no temporary separator variable needed.
long_string = " ".join(papers['title_processed'])
# print (long_string)

# Create a WordCloud object. Bind it to ``cloud`` rather than ``wordcloud``:
# the original rebound the module name, shadowing the library and making any
# later ``wordcloud.WordCloud()`` call fail.
cloud = wordcloud.WordCloud()

# Generate a word cloud from the combined titles.
cloud.generate(long_string)

# Visualize the word cloud (returns a PIL image; shown inline in notebooks).
cloud.to_image()
import matplotlib.pyplot as plt

plt.figure()
plt.imshow(cloud, interpolation="bilinear")
plt.show()

# ## 6.  Prepare the text for LDA analysis
#The main text analysis method that we will use is latent Dirichlet allocation (LDA).
# LDA is able to perform topic detection on large document sets, determining what the main 'topics' are in a large unlabeled set of texts.
# A 'topic' is a collection of words that tend to co-occur often.
# The hypothesis is that LDA might be able to clarify what the different topics in the research titles are.
# These topics can then be used as a starting point for further analysis.
#LDA does not work directly on text data. First, it is necessary to convert the documents into a simple vector representation.
# This representation will then be used by LDA to determine the topics.
# Each entry of a 'document vector' will correspond with the number of times a word occurred in the document.