Example No. 1
class YaseeStopWords:
    DEFAULT_STOPWORDS = STOPWORDS.union(UCIWC_DEFAULTSTOPWORDS)

    def __init__(self, stopwords: frozenset = None, replace: bool = False):
        if stopwords is None:
            self.stopwords = set(YaseeStopWords.DEFAULT_STOPWORDS)
        elif replace:
            self.stopwords = set(stopwords)
        else:
            self.stopwords = set(YaseeStopWords.DEFAULT_STOPWORDS)
            self.stopwords.update(stopwords)

    def getStopwords(self) -> frozenset:
        return frozenset(self.stopwords)

    def addStopwords(self, item=None):
        # Accepts a single word or any iterable of words
        # (the original `str or iter` annotation evaluated to just `str`).
        if item is None:
            return
        if isinstance(item, str):
            self.stopwords.add(item)
        else:
            self.stopwords.update(item)

    def __contains__(self, item):
        return item in self.stopwords

    def __iter__(self):
        return iter(self.stopwords)
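
A minimal usage sketch of the class above (UCIWC_DEFAULTSTOPWORDS comes from this project's imports; the words below are hypothetical):

# Hypothetical usage; the words are illustrative only.
sw = YaseeStopWords()                     # start from DEFAULT_STOPWORDS
sw.addStopwords("foo")                    # add a single word
sw.addStopwords(["bar", "baz"])           # add an iterable of words
assert "foo" in sw                        # __contains__
custom = YaseeStopWords(frozenset({"x"}), replace=True)  # ignore the defaults
print(custom.getStopwords())              # frozenset({'x'})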
Example No. 2
def plotWordCloud(results_list):
    actions_str = ""
    keywords_str = ""
    for result in results_list:
        if result["actions"]:
            actions_str += ' '.join(result["actions"]) + " "
        if result["keywords"]:
            keywords_str += ' '.join(result["keywords"]) + " "
    my_stopwords = {
        "try", "keep", "use", "want", "need", "know", "give", "help", "tell",
        "might", "cant", "say", "cause",  # note the comma: without it, "cause"
        "place"                           # and "place" concatenate into "causeplace"
    }

    wordcloud_actions = WordCloud(
        stopwords=STOPWORDS.union(my_stopwords)).generate(actions_str)
    wordcloud_keywords = WordCloud().generate(keywords_str)
    fig, axs = plt.subplots(1, 2, figsize=(20, 10))
    axs[0].imshow(wordcloud_actions)
    axs[0].set_title("Actions", fontsize=20)
    axs[0].axis("off")
    axs[1].imshow(wordcloud_keywords)
    axs[1].set_title("Keywords", fontsize=20)
    axs[1].axis("off")
    plt.show()
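
A usage sketch for the function above; the input shape (a list of dicts with "actions" and "keywords" lists) is inferred from the body, and the data here is hypothetical:

# Hypothetical input for illustration.
results = [
    {"actions": ["run", "jump"], "keywords": ["speed"]},
    {"actions": [], "keywords": ["agility", "speed"]},
]
plotWordCloud(results)  # renders two side-by-side word clouds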
Example No. 3
def word_cloud(data,
               my_stopwords,
               background_image,
               use_col='comment',
               max_fontsize=70,
               save_path='wordcloud.png'):
    """
    :return: 词云图
    """

    background_image = plt.imread(background_image)
    wc = WordCloud(
        background_color='white',  # 设置背景颜色
        mask=background_image,  # 设置遮罩图片,控制的是词云的形状,比如说松鼠形状的词云,云朵形状的等等,图清晰点比较好
        max_words=55,  # 设置最大现实的字数
        collocations=False,
        stopwords=STOPWORDS.union(set(my_stopwords)),  # 设置停用词
        font_path='MSYH.TTF',  # 设置字体格式,如不设置显示不了中文
        max_font_size=max_fontsize,  # 设置字体最大值,会自动根据图片大写调整,不同图的60看起来不一样
        # min_font_size=2,    # 设的比较大的话,小的就不显示了
        # random_state = 1800,  # 设置有多少种随机生成状态,即有多少种布局方案,横的竖的分布
        scale=20  # 越大计算越慢,图的大小,不如让底图大点清晰点来得快
    )

    text = ','.join(map(str, data[use_col]))
    wc.generate(text)
    image_colors = ImageColorGenerator(background_image)
    wc.recolor(color_func=image_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.savefig(save_path)
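
A minimal call sketch, assuming a DataFrame with a 'comment' column and a mask image on disk (both hypothetical):

# Hypothetical data and file names.
import pandas as pd

df = pd.DataFrame({'comment': ['great movie', 'not bad at all', 'great cast']})
word_cloud(df, my_stopwords=['bad'], background_image='mask.png',
           save_path='comments_wc.png')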
Example No. 4
def main():
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    file = options.filein
    mask_pic = options.mask_pic
    if not mask_pic:
        mask_pic = None
    stopwords = options.stopwords
    maxwords = options.maxwords
    font = options.font
    output = options.output
    max_font_size = options.max_font_size
    if not output:
        output = file + '.png'
    verbose = options.verbose
    global debug
    debug = options.debug
    global STOPWORDS
    if stopwords:
        with open(stopwords) as fh:
            STOPWORDS = STOPWORDS.union(line.strip() for line in fh)
    print(STOPWORDS)
    #-----------------------------------
    draw_wordcloud(txt=file,
                   output=output,
                   font=font,
                   max_font_size=max_font_size,
                   max_words=maxwords,
                   mask_pic=mask_pic,
                   stopwords=STOPWORDS)
Example No. 5
File: util.py Project: LoLei/ircbot
def get_stopwords() -> Set[str]:

    stopwords: Set[str] = set()

    # stop words from sklearn
    stopwords = stopwords.union(text.ENGLISH_STOP_WORDS)

    # stop words from wordcloud
    stopwords = WCSTOPWORDS.union(stopwords)

    # custom stopwords
    # from config and/or bot commands
    user_stopwords = CONFIG['stopwords'].split(',')
    stopwords.update(user_stopwords)

    # Adapt for how wordcloud and sklearn CountVectorizer handle stop words
    # Satisfy both
    preprocessed_stopwords = []
    for sw in stopwords:
        if '\'' not in sw:
            continue
        parts = sw.split('\'')
        preprocessed_stopwords.append(parts[0])
        preprocessed_stopwords.append(parts[1])

    stopwords.update(preprocessed_stopwords)

    return stopwords
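
The apostrophe handling above splits contractions so that both wordcloud's and CountVectorizer's tokenizations are covered; a quick sketch of the effect:

# Illustration only: "don't" yields both "don" and "t",
# so either tokenization of the contraction gets filtered.
word = "don't"
head, tail = word.split("'")[:2]
print(head, tail)  # don t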
Example No. 7
    def config_stopwords(self, more_stopwords=None):
        """
          (obj) -> None

          Configuring stopwords by adding more if required
        """

        if more_stopwords is not None:
            self.STOPWORDS = STOPWORDS.union(more_stopwords)
Example No. 8
def title_wordcloud(dataFrame):
    from wordcloud import WordCloud, STOPWORDS
    from PIL import Image
    #WordCloud Visualization
    text = " ".join(list(dataFrame['track_name']))
    STOPWORDS = STOPWORDS.union(["feat","Remix","Edit","Radio","Version","Mix","Remastered"])
    spotify_mask = np.array(Image.open(path.join( "spotify-logo.jpg")))
    wordcloud = WordCloud(width=2880, height=1800,background_color="white",
                          stopwords=STOPWORDS,mask = spotify_mask).generate(text)
    # Open a plot of the generated image.
    plt.figure( figsize=(10,6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("project3_wordcloud.png")
    plt.show()
Example No. 9
def plot_word2(text):
    wordcloud = WordCloud(
        stopwords=STOPWORDS.union(set(stwlist)),
        max_words=200,
        max_font_size=120,
        font_path="simsun.ttf",
        random_state=0,
    ).generate(text)
    # Display the generated image:
    fig, ax = plt.subplots()
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.show()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    # ax.show()
    st.pyplot(fig)
Example No. 10
def get_polarity_and_wordcloud(tweets, NoofTweets):
    """A function to calculate sentiments and to build wordcloud"""
    positive = 0
    negative = 0
    neutral = 0
    polarity = 0
    print()
    print('THE TWEETS ARE:')
    print()
    all_tweets_text = ""  # accumulate cleaned tweets for the word cloud
    for tweet in tweets:
        #print(tweet.text)
        tweet_tokenisation = ' '.join(
            re.sub(
                r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^RT | co | ",
                " ", str(tweet.text)).split()
        )  # re.sub/split returns a list; join converts it back into a string
        all_tweets_text += tweet_tokenisation + " "
        print(tweet.created_at)
        print(tweet_tokenisation)
        sentiment_analyser = TextBlob(tweet.text)
        polarity += sentiment_analyser.sentiment.polarity
        if sentiment_analyser.sentiment.polarity == 0:
            neutral += 1
        if sentiment_analyser.sentiment.polarity < 0:
            negative += 1
        if sentiment_analyser.sentiment.polarity > 0:
            positive += 1
    Positive = convert_percentage(positive, NoofTweets)
    Negative = convert_percentage(negative, NoofTweets)
    Neutral = convert_percentage(neutral, NoofTweets)
    print()
    print("WORDCLOUD:")
    print()
    more_stopwords = {'oh', 'will', 'hey', 'yet',
                      'RT'}  # Adding stopwords as a part of text preprocessing
    STOPWORDS_MOD = STOPWORDS.union(more_stopwords)
    cloud = WordCloud(width=1800,
                      height=1400,
                      background_color='black',
                      stopwords=STOPWORDS_MOD).generate(all_tweets_text)
    plt.imshow(cloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
    return Positive, Negative, Neutral
Example No. 11
def title_wordcloud(dataFrame):
    from wordcloud import WordCloud, STOPWORDS
    from PIL import Image
    #WordCloud Visualization
    text = " ".join(list(dataFrame['track_name']))
    STOPWORDS = STOPWORDS.union(
        ["feat", "Remix", "Edit", "Radio", "Version", "Mix", "Remastered"])
    spotify_mask = np.array(Image.open(path.join("spotify-logo.jpg")))
    wordcloud = WordCloud(width=2880,
                          height=1800,
                          background_color="white",
                          stopwords=STOPWORDS,
                          mask=spotify_mask).generate(text)
    # Open a plot of the generated image.
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("project3_wordcloud.png")
    plt.show()
Example No. 12
def main():
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    file = options.filein
    mask_pic = options.mask_pic
    if not mask_pic:
        mask_pic = None
    stopwords = options.stopwords
    maxwords = options.maxwords
    font = options.font
    output = options.output
    max_font_size = options.max_font_size
    if not output:
        output = file + '.png'
    verbose = options.verbose
    global debug
    debug = options.debug
    global STOPWORDS
    if stopwords:
        with open(stopwords) as fh:
            STOPWORDS = STOPWORDS.union(line.strip() for line in fh)
    print(STOPWORDS, file=sys.stderr)
    #-----------------------------------
    draw_wordcloud(txt=file,
                   output=output,
                   font=font,
                   max_font_size=max_font_size,
                   max_words=maxwords,
                   mask_pic=mask_pic,
                   stopwords=STOPWORDS)
    #-----------end close fh-----------
    ###--------multi-process------------------
    #pool = ThreadPool(5) # 5 represents thread_num
    #result = pool.map(func, iterable_object)
    #pool.close()
    #pool.join()
    ###--------multi-process------------------
    if verbose:
        print("--Successful %s" % strftime(timeformat, localtime()),
              file=sys.stderr)
Example No. 13
def plot_cloud(text):
    stopwords = STOPWORDS.union(common_words)
    wordcloud = WordCloud(
        background_color="white",
        width=2400,
        height=1200,
        stopwords=stopwords,
        max_words=300).generate(
            text)  #.recolor(color_func=grey_color_func, random_state=3)

    # Open a plot of the generated image.
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    # import IPython; IPython.embed()

    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    canvas = FigureCanvas(fig)
    png_output = BytesIO()
    canvas.print_png(png_output)

    return png_output.getvalue()
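
plot_cloud returns raw PNG bytes (handy for serving over HTTP); a minimal usage sketch, assuming the module-level common_words set is defined, with a hypothetical output path:

# Write the returned PNG bytes to disk.
png_bytes = plot_cloud("alpha beta beta gamma gamma gamma")
with open("cloud.png", "wb") as fh:
    fh.write(png_bytes)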
Example No. 14
def make_wordcloud(text,
                   outfile,
                   custom_sw=None,
                   reduction=None,
                   figure_size=(20, 10),
                   display=False):
    """    Generate a square wordcloud.
    
    """

    # Ensure output directory exists. If not, create.
    directory = '/'.join(outfile.split(
        '/')[:-1])  # split by '/'; remove last element (file); join by '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Join default and custom stopwords
    if custom_sw is not None:
        sw = STOPWORDS.union(custom_sw)

    # Reduce
    if reduction is not None:
        text = replace_strings(text, reduction)

    # Create wordcloud
    wordcloud = WordCloud(max_font_size=60,
                          stopwords=sw,
                          background_color='black').generate(text)

    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(outfile, bbox_inches='tight')

    if display:
        plt.show()

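A minimal call sketch for make_wordcloud; the output path and custom stop words are hypothetical:

# Hypothetical call.
make_wordcloud("the quick brown fox jumps over the lazy dog " * 50,
               outfile="out/cloud.png",
               custom_sw={"quick", "lazy"})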
Example No. 15
from pathlib import Path
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from clean import CleanText
nike_tweets = pd.read_csv(
    Path(__file__).absolute().parent.joinpath(
        '../dataset/5000-justdoit-tweets-dataset/justdoit_tweets_2018_09_07_2.csv'
    ))
nike_tweets = nike_tweets[['tweet_full_text']]
exclude = stopwords.words('english') + ['https']  # list.append returns None, so concatenate instead
tweet_string = []
cleaner = CleanText()
words_to_exclude = {'https'}

for t in nike_tweets.tweet_full_text:
    tweet_string.append(t)
tweet_string = pd.Series(tweet_string).str.cat(sep=' ')
whitelist = ["n't", "not", "no"]
print(tweet_string)
print(stopwords.words('english'))
wc = WordCloud(width=1600,
               height=800,
               max_font_size=200,
               ranks_only="frequency",
               stopwords=STOPWORDS.union(words_to_exclude),
               collocations=False).generate(tweet_string)
plt.figure(figsize=(12, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
Example No. 16
# Twitter API docs:
# https://dev.twitter.com/docs/api/1/get/search
#-----------------------------------------------------------------------
query = twitter.search.tweets(q = "modi", count=5000) #, until='2016-01-07')

#-----------------------------------------------------------------------
# How long did this query take?
#-----------------------------------------------------------------------
print ("Search complete (%.3f seconds)" % (query["search_metadata"]["completed_in"]))

#-----------------------------------------------------------------------
# Loop through each of the results, and print its content.
#-----------------------------------------------------------------------
#for result in query["statuses"]:
#	print ("(%s) @%s %s" % (result["created_at"], result["user"]["screen_name"], result["text"]))

# make a corpus from the list of tweets
status_list = [ result['text'] for result in query['statuses']]
corpus = ' '.join(status_list)

# read in the image and colors and plot the word cloud
img = Image.open("modi.jpg")
#img = img.resize((980,1080), Image.ANTIALIAS)
modi_coloring = np.array(img)
image_colors = ImageColorGenerator(modi_coloring)
#hcmask = scipy.ndimage.zoom(hcmask, 2, order=3)
STOPWORDS = STOPWORDS.union({"http","https","t","co","rt","since","towards","now","ok","okay","tag", "amp"})
#wc = WordCloud(background_color="white", max_words=2000, mask=hcmask, stopwords=STOPWORDS)
wc = WordCloud(font_path='cabin-sketch.bold.ttf', background_color="white", max_words=2000, mask=modi_coloring, color_func=image_colors, stopwords=STOPWORDS)
wc.generate(corpus)
wc.to_file("wc_color.png")
Example No. 17
import pandas as pd
import io
import os

texts = {}
df = pd.read_csv("train_set.csv", sep="\t")

categories = ['Business', 'Politics', 'Film', 'Football', 'Technology']
my_stop_words = [
    'will', 'one', 'two', 'four', 'new', 'now', 'day', 'year', 'month', 'week',
    'ago', 'late', 'little', 'many', 'said', 'last', 'time', 'first', 'second',
    'make', 'say', 'saying', 'may', 'maybe', 'long', 'short', 'use', 'says',
    'old', 'made', 'today', 'back', 'face', 'believe', 'around', 'become',
    'th', 'high'
]
stop_words = STOPWORDS.union(my_stop_words)

if not os.path.exists("Images"):
    os.makedirs("Images")

for i in categories:
    texts[i] = df.loc[df['Category'] == i, 'Content']  # .ix is removed in modern pandas; use .loc
    texts[i] = texts[i].to_string(header=False)
    texts[i] = texts[i].replace('\n', ' ').replace('\r', '')

    wordcloud = WordCloud(max_font_size=50,
                          min_font_size=2,
                          max_words=500,
                          stopwords=stop_words,
                          background_color="white",
                          relative_scaling=.4).generate(texts[i])
Example No. 18
#
# Learn Python and Data Science by Following Along (Saengneung Publishing, 2020)
# 9.7 Word clouds: showing information at a glance, p. 232
#
from wordcloud import WordCloud, STOPWORDS
# Build a word cloud with the stop words removed
s_words = STOPWORDS.union({'one', 'using', 'first', 'two', 'make', 'use'})
wordcloud = WordCloud(width=2000, height=1500,
                      stopwords=s_words).generate(text)
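
The snippet stops after generation; a typical display step (matplotlib assumed, as in the other examples) would be:

import matplotlib.pyplot as plt

plt.imshow(wordcloud)  # render the generated cloud
plt.axis('off')
plt.show()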
Example No. 19
                fontsize=14)
axes.set_xlabel("User ", fontsize=14)
fig.tight_layout()
plt.savefig("data/tweet_frequency_user_wise.jpg")

## 3. Get follower count for users
from datetime import datetime
dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')  # pd.datetime was removed in pandas 1.0
user_data = pd.read_csv(
    './data/tweet_users.csv', sep="\t"
)  #, parse_dates=['created_at'], date_parser=dateparse, dtype={'hashtags':str})
#print("Read the data. Its columns are:\n   " , re.sub("[ ]+",":", str(user_data.dtypes).replace("\n", ",\t") ) )
fig, axes = plt.subplots(figsize=(15, 5))
user_data.set_index('name')[['followers_count']].plot(ax=axes, kind='bar')
##plt.setp(axes[0].get_xticklabels(), visible=False)
plt.title("Visualizing Tweet frequency: Year wise, and User-wise", fontsize=16)
axes.set_xlabel("Twitter user name", fontsize=16)
fig.tight_layout()
plt.savefig("data/followers.jpg")

##4 . A word cloud to see the important words at play here
from wordcloud import WordCloud, STOPWORDS
text = "\n".join([ft for ft in data.fulltext])
wordcloud = WordCloud(relative_scaling=1.0,
                      stopwords=STOPWORDS.union(["https", "co",
                                                 "rt"])).generate(text)
fig, axes = plt.subplots(figsize=(15, 6))
plt.imshow(wordcloud)
plt.axis("off"),
plt.savefig("data/wordcloud.jpg")
#plt.show()
Example No. 20
for i in range(0, A.shape[0]):

    if A[i][4] == "Politics":
        text_p += " " + A[i][3]

    elif A[i][4] == "Film":
        text_f += " " + A[i][3]

    elif A[i][4] == "Football":
        text_ft += " " + A[i][3]

    elif A[i][4] == "Technology":
        text_t += " " + A[i][3]

    elif A[i][4] == "Business":
        text_b += " " + A[i][3]

my_additional_stop_words = STOPWORDS.union([
    'people', 'said', 'did', 'say', 'says', 'year', 'day', 'just', 'good',
    'come', 'make', 'going', 'having', 'like', 'need', 'given', 'got'
])

stopwords = ENGLISH_STOP_WORDS.union(my_additional_stop_words)

create_WordCloud(text_p, stopwords, "Politics")
create_WordCloud(text_f, stopwords, "Films")
create_WordCloud(text_ft, stopwords, "Football")
create_WordCloud(text_t, stopwords, "Technology")
create_WordCloud(text_b, stopwords, "Business")
Example No. 21
event_ = PSTAT.event_

# Construct corpus: to lower case, strip numeric
corpus = {}
events = [26, 27] #[16, 83]
for event in events:
    docs = keydev['events'].find(
        {'keydeveventtypeid': {'$eq': event}}, {'_id': 0})
    corpus[event] = [re.sub(r'\b\w*[\d]\w*\b', ' ', " ".join(
            d[k] for k in ['headline', 'situation'])).lower() for d in docs]
DataFrame({'description': [event_[event] for event in corpus.keys()],
           'count': [len(lines) for lines in corpus.values()]},
          index=corpus.keys())

# Tokenize, and remove stopwords
stop_words = STOPWORDS.union(['co', 'ltd', 'mr', 'mrs', 'inc', 'llc'])
for event, lines in corpus.items():
    corpus[event] = [[w for w in re.findall(r"\w\w+", line)
                      if w not in stop_words] for line in lines]
    
# Split shuffled into labelled training and test sets
train_data = []
test_data = []
split_frac = 0.9
for label, (event, lines) in enumerate(corpus.items()):
    np.random.shuffle(lines)
    n = int(split_frac * len(lines))   # split point of train and test sets
    train_data.extend([(label, corpus[event][p]) for p in range(n)])
    test_data.extend([(label, corpus[event][p]) for p in range(n, len(lines))])
N = len(train_data)
print('train/test:', N, [np.mean([label for label,_ in subset])
Example No. 22
File: P05.py Project: huni-KR/PKNU
from wordcloud import WordCloud, STOPWORDS
import wikipediaapi
import matplotlib.pyplot as plt

wiki = wikipediaapi.Wikipedia('en')
page = wiki.page('UNESCO')

# union() is not in-place, and bare string arguments would be unioned
# character-by-character, so pass a set and reassign the result.
STOPWORDS = STOPWORDS.union({'work', 'literature', 'call',
                             'October', 'State', 'de', 'General'})

wordcloud = WordCloud(font_path='font/NanumGothic.ttf',
                      stopwords=STOPWORDS, width=2000, height=2000).generate(page.summary)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud)
plt.show()
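
Worth noting: set.union accepts any iterables, and a bare string is an iterable of characters, so the original call would have added single letters. A quick demonstration:

# Strings are iterables of characters; wrap words in a set before union.
base = {'alpha'}
print(base.union('work'))    # {'alpha', 'w', 'o', 'r', 'k'}
print(base.union({'work'}))  # {'alpha', 'work'}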
Example No. 23
import scipy

tweet_file = open(username + "_tweets_file.bin", 'rb')
tweets = pickle.load(tweet_file)

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)

words = ''
for tweet in tweets:
    words += tweet.text + ' '  # add a separator so tweets don't run together

stopwords = {'https',"co","RT"}

wordcloud = WordCloud(
    stopwords=STOPWORDS.union(stopwords),
    background_color='black',
    max_words=500,
    width=7000,
    height=7000
).generate(words)

plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3))
plt.axis('off')
plt.savefig('./tweetcloud2.png', dpi=300)
plt.show()

"""Top Hashtags"""

hashtags_dict = {}
for tweet in tweets:
Example No. 24
def index():

    if request.method == 'POST':
        hashtag_name = request.form['hashtag']
        number = request.form['number']
        splitted_hashtags = [ht.strip() for ht in re.split(", ", hashtag_name)]
        if check_if_hashtags_are_valid(splitted_hashtags):
            results = []
            for tweet in tweepy.Cursor(api.search,
                                       q=splitted_hashtags,
                                       lang="en").items(int(number)):
                results.append(tweet)
            data_set = tweets_df(results)

            text = data_set["text"]
            for i in range(0, len(text)):
                txt = ' '.join(word for word in text[i].split()
                               if not word.startswith('https:'))
                data_set.at[i, 'text2'] = txt
                data_set.drop_duplicates('text2', inplace=True)
                data_set.reset_index(drop=True, inplace=True)
                data_set.drop('text', axis=1, inplace=True)
                data_set.rename(columns={'text2': 'text'}, inplace=True)

            # Join all the text from the 1000 tweets
            text_Combined = " ".join(text.values.astype(str))
            more_stopwords = {
                'https', 'RT', 'rt', 'CO', '@', 'el', 't', '&amp;', 'covid',
                'covid 19', hashtag_name, hashtag_name[1:], '#covid19', 'tco',
                'covid19', 'amp', '@drericding'
            }
            stopwords = STOPWORDS.union(more_stopwords)
            covid = " ".join([word for word in text_Combined.split()])
            wordcount = {}

            # Strip punctuation and lowercase before counting, so near-duplicate words collapse.
            for word in covid.lower().split():
                word = word.replace(".", "")
                word = word.replace(",", "")
                word = word.replace(":", "")
                word = word.replace("\"", "")
                word = word.replace("!", "")
                word = word.replace("“", "")
                word = word.replace("‘", "")
                word = word.replace("*", "")
                if word not in stopwords:
                    if word not in wordcount:
                        wordcount[word] = 1
                    else:
                        wordcount[word] += 1

            word_counter = collections.Counter(wordcount)

            # Create a data frame of the most common words
            lst = word_counter.most_common(100)
            df = pd.DataFrame(lst, columns=['Word', 'Count'])
            text1 = df["Word"]
            text_Combined = " ".join(text1.values.astype(str))
            covid = " ".join([word for word in text_Combined.split()])

            #Create a Word Cloud
            wc = WordCloud(background_color="White",
                           stopwords=STOPWORDS.union(more_stopwords),
                           width=600,
                           height=300,
                           relative_scaling=0,
                           max_words=50)
            wc.generate(covid)
            wc.to_file('static/temporary_files/fig100.png')
            full_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                         'fig100.png')

            return render_template("search.html", image=full_filename)
        else:
            return render_template("index.html")
    else:
        return render_template("index.html")
Example No. 25
def load_file2list(filename):
    with open(filename, 'r') as fn:
        flines = fn.readlines()
        datam = []
        for line in flines:

            temp = line.strip('\n').split(' ')[0]
            datam.append(temp)
    return datam


if __name__ == '__main__':

    text = open(args.text).read()
    font_path = args.font_path

    jieba_result = jieba_processing_txt(text)
    mask_list = load_file2list(args.stop_words)

    mask_word = STOPWORDS.union(mask_list)

    wc = WordCloud(font_path=font_path,
                   background_color="white",
                   max_words=400,
                   max_font_size=400,
                   width=2000,
                   height=1000,
                   stopwords=mask_word)
    wc.generate(jieba_result)
    wc.to_file(args.output_image)
Example No. 26
def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


words = ''
for tweet in tweets:
    words += tweet.text + ' '  # add a separator so tweets don't run together

stopwords = {'https', "co", "RT"}

wordcloud = WordCloud(stopwords=STOPWORDS.union(stopwords),
                      background_color='black',
                      max_words=500,
                      width=7000,
                      height=7000).generate(words)

plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3))
plt.axis('off')
plt.savefig('./tweetcloud2.png', dpi=300)
plt.show()
"""Top Hashtags"""

hashtags_dict = {}
for tweet in tweets:
    hashtags = tweet.entities.get('hashtags')
    for hashtag in hashtags:
Example No. 27
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import re
filename="/Users/chunmeiGao/Documents/Dataincubator/emailsubject.txt"

# Read the whole text.
text = open(filename).read()
print(text)
text=re.sub('Re:', '', text)
text=re.sub('RE:', '', text)
text=re.sub('FW:', '', text)
text=re.sub('Fwd:', '', text)
text=re.sub('Enron', '', text)
more_stopwords = {'X', 'Re', 'Fwd','ENRON','NA','FW'}
STOPWORDS = STOPWORDS.union(more_stopwords)
# Generate a word cloud image
wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

# Display the generated image:
# the matplotlib way:

plt.imshow(wordcloud)
plt.axis("off")

# take relative word frequencies into account, lower max_font_size
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Example No. 28
def plotly_wordcloud(text):
    colors = ["#000000", "#111111", "#101010", "#121212", "#212121", "#222222"]
    cmap = LinearSegmentedColormap.from_list("mycmap", colors)
    wc = WordCloud(
        stopwords=set(STOPWORDS.union(set(stwlist))),
        max_words=300,
        max_font_size=120,
        colormap=cmap,
        random_state=0,
    )
    wc.generate(text)

    word_list = []
    freq_list = []
    fontsize_list = []
    position_list = []
    orientation_list = []
    color_list = []

    for (word, freq), fontsize, position, orientation, color in wc.layout_:
        word_list.append(word)
        freq_list.append(freq)
        fontsize_list.append(fontsize)
        position_list.append(position)
        orientation_list.append(orientation)
        color_list.append(color)

    # get the positions
    x = []
    y = []
    for i in position_list:
        x.append(i[0])
        y.append(i[1])

    # get the relative occurrence frequencies
    new_freq_list = []
    for i in freq_list:
        new_freq_list.append((i * 150 + 8))
    # new_freq_list

    trace = go.Scatter(x=x,
                       y=y,
                       textfont=dict(size=new_freq_list, color=color_list),
                       hoverinfo='text',
                       hovertext=[
                           '{0}{1}'.format(w, f)
                           for w, f in zip(word_list, freq_list)
                       ],
                       mode='text',
                       text=word_list)

    layout = go.Layout({
        'xaxis': {
            'showgrid': False,
            'showticklabels': False,
            'zeroline': False
        },
        'yaxis': {
            'showgrid': False,
            'showticklabels': False,
            'zeroline': False
        }
    })

    fig = go.Figure(data=[trace], layout=layout)
    fig.update_layout(plot_bgcolor='#D3DFE2')
    return fig
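
A usage sketch for plotly_wordcloud; stwlist (the extra stop-word list) must be defined elsewhere in the source module, and the input text here is hypothetical:

# Hypothetical usage in a script or notebook.
fig = plotly_wordcloud("data science data cloud word cloud cloud")
fig.show()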
Example No. 29
    def stop_words_configs(self):
        """
        Configuring stopwords by adding more if required
        """
        more_stopwords = {'innojam', 'video', 'cebit2014'}
        self.STOPWORDS = STOPWORDS.union(more_stopwords)
Example No. 30
import numpy as np
import pandas as pd
import collections
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

other_stopwords_to_remove = ['abracadabra', 'etc']
STOPWORDS = STOPWORDS.union(set(other_stopwords_to_remove))
stopwords = set(STOPWORDS)

data = pd.read_csv("file.csv")
text = data[data['Name'] == 'DDT']["comments"]  # filter rows first, then take the comments column
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      max_words=2000,
                      stopwords=stopwords,
                      min_font_size=10).generate(' '.join(text.astype(str)))

#Arguments of WordCloud
#['self', 'font_path', 'width', 'height', 'margin', 'ranks_only', 'prefer_horizontal', 'mask', 'scale', 'color_func', 'max_words', 'min_font_size', 'stopwords', 'random_state', 'background_color', 'max_font_size', 'font_step', 'mode', 'relative_scaling', 'regexp', 'collocations', 'colormap', 'normalize_plurals']


plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Example No. 31
def get_stopwords():
    # Create stopword list:
    return STOPWORDS.union(
        set([
            "a", "actualmente", "adelante", "además", "afirmó", "agregó",
            "ahí", "ahora", "cc", "this", "pa", "a", "b", "c", "d", "e", "f",
            "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
            "t", "u", "v", "w", "x", "y", "z", "al", "algo", "algún", "algún",
            "alguna", "algunas", "alguno", "algunos", "alrededor", "ambos",
            "ampleamos", "añadió", "ante", "anterior", "antes", "apenas",
            "aproximadamente", "aquel", "aquellas", "aquellos", "aqui", "aquí",
            "arriba", "aseguró", "así", "atras", "aún", "aunque", "ayer",
            "bajo", "bastante", "bien", "buen", "buena", "buenas", "bueno",
            "buenos", "cada", "casi", "cerca", "cierta", "ciertas", "cierto",
            "ciertos", "cinco", "comentó", "como", "cómo", "con", "conocer",
            "conseguimos", "conseguir", "considera", "consideró", "consigo",
            "consigue", "consiguen", "consigues", "contra", "cosas", "creo",
            "cual", "cuales", "cualquier", "cuando", "cuanto", "cuatro",
            "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben",
            "debido", "decir", "dejó", "del", "demás", "dentro", "desde",
            "después", "dice", "dicen", "dicho", "dieron", "diferente",
            "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante",
            "e", "ejemplo", "el", "de", "la", "el", "porfas", "t", "p", "d",
            "est", "él", "ella", "ellas", "ello", "ellos", "embargo",
            "empleais", "emplean", "emplear", "empleas", "empleo", "en",
            "encima", "encuentra", "entonces", "entre", "era", "eramos",
            "eran", "eras", "eres", "es", "esa", "esas", "ese", "eso", "esos",
            "esta", "ésta", "está", "estaba", "estaban", "estado", "estais",
            "estamos", "estan", "están", "estar", "estará", "estas", "éstas",
            "este", "éste", "esto", "estos", "éstos", "estoy", "estuvo", "ex",
            "existe", "existen", "explicó", "expresó", "fin", "fue", "fuera",
            "fueron", "fui", "fuimos", "gracias", "gran", "grandes", "gueno",
            "ha", "haber", "había", "habían", "habrá", "hace", "haceis",
            "hacemos", "hacen", "hacer", "hacerlo", "haces", "hacia",
            "haciendo", "hago", "han", "hasta", "hay", "haya", "he", "hecho",
            "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "incluso",
            "indicó", "informó", "intenta", "intentais", "intentamos",
            "intentan", "intentar", "intentas", "intento", "ir", "junto", "la",
            "lado", "largo", "las", "le", "les", "llegó", "lleva", "llevar",
            "lo", "los", "luego", "lugar", "manera", "manifestó", "más",
            "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi",
            "mientras", "mio", "misma", "mismas", "mismo", "mismos", "modo",
            "momento", "mucha", "muchas", "mucho", "muchos", "muy", "nada",
            "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno",
            "ningunos", "no", "nos", "nosotras", "nosotros", "nuestra",
            "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo",
            "nuevos", "nunca", "o", "ocho", "otra", "otras", "otro", "otros",
            "para", "parece", "parte", "partir", "pasada", "pasado", "pero",
            "pesar", "poca", "pocas", "poco", "pocos", "podeis", "podemos",
            "poder", "podrá", "podrán", "podria", "podría", "podriais",
            "podriamos", "podrian", "podrían", "podrias", "poner", "por",
            "porque", "por qué", "posible", "primer", "primera", "primero",
            "primeros", "principalmente", "propia", "propias", "propio",
            "propios", "próximo", "próximos", "pudo", "pueda", "puede",
            "pueden", "puedo", "pues", "que", "qué", "quedó", "queremos",
            "quien", "quién", "quienes", "quiere", "realizado", "realizar",
            "realizó", "respecto", "sabe", "sabeis", "sabemos", "saben",
            "saber", "sabes", "se", "sea", "sean", "según", "segunda",
            "segundo", "seis", "señaló", "ser", "será", "serán", "sería", "si",
            "sí", "sido", "siempre", "siendo", "siete", "sigue", "siguiente",
            "sin", "sino", "sobre", "sois", "sola", "solamente", "solas",
            "solo", "sólo", "solos", "somos", "son", "soy", "su", "sus", "tal",
            "también", "tampoco", "tan", "tanto", "tardes", "tarde", "tendrá",
            "tendrán", "teneis", "tenemos", "tener", "tenga", "tengo", "tenía",
            "tenido", "tercera", "tiempo", "tiene", "tienen", "toda", "todas",
            "todavía", "todo", "todos", "total", "trabaja", "trabajais",
            "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo",
            "tras", "trata", "través", "tres", "tuvo", "tuyo", "tu", "te",
            "pq", "mas", "qie", "us", "has", "ti", "ahi", "mis", "tus", "do",
            "X", "Ven", "mo", "Don", "dia", "PT", "sua", "q", "x", "i",
            "última", "últimas", "ultimo", "último", "últimos", "un", "una",
            "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar",
            "usas", "uso", "usted", "va", "vais", "valor", "vamos", "van",
            "varias", "varios", "vaya", "veces", "ver", "verdad", "verdadera",
            "verdadero", "vez", "vosotras", "n", "s", "of", "c", "the", "m",
            "qu", "to", "as", "is", "asi", "via", "sera", "tambien",
            "vosotros", "voy", "y", "ya", "yo"
        ])).union(set(stopwords.words('spanish')))
Example No. 32
    pessoa = sentenca[0]
    frase = "".join(sentenca[1:])

    falas.append(dict(pessoa=pessoa, frase=frase))


declarante = [
    fala['frase'].decode('utf-8')
    for fala in falas
    if fala['pessoa'] == 'Declarante'
]

declarante = "".join(declarante).lower()
declarante = declarante.replace(u"não sei", u"nãosei")

swords = ['que', 'eu', u'não', 'da', 'de', 'por', 'ele', u'você', u'está',
          'tem', 'um', 'uma', 'se', 'foi', u'lá', 'pra', 'para', 'vai',
          u'já', 'na', 'era', 'em', u'aí', 'minha', u'nós', 'os', 'as',
          'ou', 'essa', 'isso', 'como', 'aqui', 'pois', u'só', 'quando',
          u'então', 'muito', 'porque', 'acho', 'nem', 'mais', 'meu',
          'ser', 'estou', 'vou', 'coisa', 'tenho', 'tinha', 'ter', u'quem',
          'fui', 'mas', u'são', 'muita', 'mim', 'tudo', 'toda', 'todo',
          'deve', 'falar', 'eles', 'das']  # note the comma after u'quem': without it the two strings concatenated

STOPWORDS = STOPWORDS.union(swords)

wordcloud = WordCloud(width=800, height=400,
                      stopwords=STOPWORDS).generate(declarante)

wordcloud.to_file("wordcloud.png")
Example No. 33
# Now that we know more about lenders' locations, let's analyze the freeform text column *loan_because* and construct a word cloud to get insight into their motives for funding projects on Kiva.


import matplotlib as mpl 
from wordcloud import WordCloud, STOPWORDS
import imageio

heart_mask = imageio.imread('../input/poverty-indicators/heart_msk.jpg') #because displaying this wordcloud as a heart seems just about right :)

mpl.rcParams['figure.figsize']=(12.0,8.0)    #(6.0,4.0)
mpl.rcParams['font.size']=10                #10 

more_stopwords = {'org', 'default', 'aspx', 'stratfordrec','nhttp','Hi','also','now','much','username'}
STOPWORDS = STOPWORDS.union(more_stopwords)

lenders_reason = lenders[~pd.isnull(lenders['loan_because'])][['loan_because']]
lenders_reason_string = " ".join(lenders_reason.loan_because.values)

wordcloud = WordCloud(
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=3200, 
                      height=2000,
                      mask=heart_mask
            ).generate(lenders_reason_string)

plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./reason_wordcloud.png', dpi=900)
Example No. 34
def info(request):	
	if request.method == 'GET' and 'screen_name' in request.GET:
		scn = request.GET['screen_name']
		if scn == "":
			return render(request, 'tweets/login.html', {'message':'Enter a valid Twitter handle'})
		else:
			# print("start", datetime.now())
			STAT_PATH = os.path.join(settings.BASE_DIR, 'tweets/static/tweets/')

			handle = tweepy.OAuthHandler(settings.CONSUMER_KEY, settings.CONSUMER_SECRET)
			handle.set_access_token(settings.ACCESS_TOKEN, settings.ACCESS_TOKEN_SECRET)
			api = tweepy.API(handle)
			try:
				user = api.get_user(screen_name=scn)
			except tweepy.TweepError:
				return render(request, 'tweets/login.html', {'message':'Enter a valid Twitter handle'})


			"""
			Tweets vs Weekday graph 
			"""
			try:
				timeline = api.user_timeline(screen_name=scn, count=3200, include_rts=True)
				mid = timeline[-1].id - 1
				while True:
					tl = api.user_timeline(screen_name=scn, count=3200, include_rts=True, max_id=mid)
					if not len(tl):
						break
					timeline += tl
					mid = tl[-1].id - 1
			except (tweepy.TweepError, TypeError):  # 'or' here would only catch the first type
				return render(request, 'tweets/login.html', {'message':'Enter a valid Twitter handle'})

			y = [0]*7
			for tw in timeline:
				d = tw.created_at.strftime("%w")
				y[int(d)] += 1

			x = [0,1,2,3,4,5,6]
			xpoints = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
			plt.xticks(x, xpoints)
			plt.plot(x, y, 'b-')
			plt.xlabel('Days of week')
			plt.ylabel('No. of tweets')
			
			path_graph = STAT_PATH + 'graph.png'
			if os.path.isfile(path_graph):
				os.remove(path_graph)

			plt.savefig(path_graph, dpi=300, bbox_inches='tight')
			plt.clf()

			"""
			Tag-cloud
			"""
			# more stopwords
			file = open(STAT_PATH + 'stopwords.txt', 'r')
			more_stops = file.readlines()
			for i in range(len(more_stops)):
				more_stops[i] = more_stops[i].rstrip('\n')
			global STOPWORDS
			STOPWORDS = STOPWORDS.union(more_stops)

			words = []
			matrix = []
			for tw in timeline:
				matrix.append(tw.text.split())
				words = words + tw.text.split()
			
			long_tweet_stripd = ""
			for w in words:
				if w != 'RT' and not(w.startswith('http')) and not(w.startswith('@')) and not(w.startswith('#')) and not(w.lower() in STOPWORDS):
					long_tweet_stripd = " ".join([long_tweet_stripd, w.lower()])
					
			un_words = long_tweet_stripd.split()

			mask = imread(STAT_PATH + 'twitter_mask.png')

			wcloud = WordCloud(max_words=50, background_color='white', stopwords=STOPWORDS, mask=mask).generate(long_tweet_stripd)
			
			# print(long_tweet_stripd)

			path_wordcloud = STAT_PATH + 'wordcloud.png'
			if os.path.isfile(path_wordcloud):
				os.remove(path_wordcloud)

			plt.imshow(wcloud)
			plt.gca().invert_yaxis()
			plt.axis('off')
			plt.savefig(path_wordcloud, dpi=600, bbox_inches='tight')
			plt.clf()
			plt.close()

			"""
			Word co-occurences matrix

			"""
			all_words_use = []
			for w in un_words:
				try:
					if all(((ord(char)>=65 and ord(char)<=90) or (ord(char)>=97 and ord(char)<=122)) for char in w) and (not(w.lower() in STOPWORDS)):
						all_words_use.append(w.lower())
				except Exception as e:
					pass

			un_words_use = list(set(all_words_use))

			un_words_use_count = [0 for i in range(len(un_words_use))]
			for w in all_words_use:
				if w in un_words_use:
					un_words_use_count[un_words_use.index(w)] += 1

			most_words_use = []
			for i in range(30):
				if len(un_words_use_count) == 0:
					break
				most_words_use.append(un_words_use[un_words_use_count.index(max(un_words_use_count))])
				un_words_use_count.remove(un_words_use_count[un_words_use_count.index(max(un_words_use_count))])

			most_words_use = list(set(most_words_use))

			count = [[0 for i in range(len(most_words_use))] for j in range(len(most_words_use))]

			for i in range(len(matrix)):
				for j in range(len(matrix[i])):
					if matrix[i][j].lower() in most_words_use:
						for k in range(j+1, len(matrix[i])):
							if matrix[i][k].lower() in most_words_use:
								count[most_words_use.index(matrix[i][j].lower())][most_words_use.index(matrix[i][k].lower())] += 1
								count[most_words_use.index(matrix[i][k].lower())][most_words_use.index(matrix[i][j].lower())] += 1
			
			nodeFile = open(STAT_PATH + 'nodeFile.csv', 'w', newline='')
			nodeW = csv.writer(nodeFile, quotechar='|', quoting=csv.QUOTE_MINIMAL)
			nodeW.writerow(['id'])
			for w in most_words_use:
				nodeW.writerow([w])

			edgeFile = open(STAT_PATH + 'edgeFile.csv', 'w', newline='')
			edgeW = csv.writer(edgeFile, quotechar='|', quoting=csv.QUOTE_MINIMAL)
			edgeW.writerow(['source']+['target']+['weight'])
			for i in range(len(count)):
				for j in range(i, len(count)):
					edgeW.writerow([most_words_use[i]]+[most_words_use[j]]+[count[i][j]])
					edgeW.writerow([most_words_use[j]]+[most_words_use[i]]+[count[i][j]])
			
			nodeFile.close()
			edgeFile.close()

			print("3 tasks completed", datetime.now())

			"""
			Network graph
			"""
			obj = {}
			obj['nodes'] = []
			obj['links'] = []
			obj['nodes'].append({'name':scn, 'group':1})

			follower_ids = api.followers_ids(screen_name=scn, count=100)
			follower_users = []
			if len(follower_ids) > 0:
				follower_users = api.lookup_users(user_ids=follower_ids)

			i = 1  # node number; the user is node 0
			for u in follower_users:
				obj['nodes'].append({'name':u.screen_name, 'group':2}) # group2 for followers
				obj['links'].append({'source':i, 'target':0, 'weight':1})
				i += 1

			following_ids = api.friends_ids(screen_name=scn, count=100)
			following_users = []
			if len(following_ids) > 0:
				following_users = api.lookup_users(user_ids=following_ids)

			for u in following_users:
				obj['nodes'].append({'name':u.screen_name, 'group':3}) # group3 for friends/following
				obj['links'].append({'source':0, 'target':i, 'weight':1})
				i += 1

			# Exceeds rate-limit, max 15 requests per 15-min interval
			# and 100 api requests per hour
			# Will have to decrease follower and friend count
			# all_users = follower_users + following_users
			# for i in range(len(all_users)):
			# 	frs = api.followers_ids(screen_name=all_users[i].screen_name)
			# 	fwn = api.friends_ids(screen_name=all_users[i].screen_name)
			# 	for j in range(i+1, len(all_users)):
			# 		if all_users[j].id in frs:
			# 			obj['links'].append({'source':j+1, 'target':i+1, 'weight':1})
			# 		if all_users[j].id in fwn:
			# 			obj['links'].append({'source':i+1, 'target':j+1, 'weight':1})



			with open(STAT_PATH + 'network.json', 'w') as jsonFile:
				json.dump(obj, jsonFile, indent=4)

			return render(request, 'tweets/info.html', {'user':user, 'day_list':y})
	else:
		return render(request, 'tweets/login.html', {'message':'Enter a valid Twitter handle'})