Example #1
def create_word_frequency(data):
    # `preprocess` and STOPWORDS are assumed to come from the surrounding module
    tweet_freq = {}
    STOPWORDS.update([''])  # treat the empty string as a stopword once, up front
    for tweet in data:
        replies = data[tweet]['0'] + data[tweet]['50'] + data[tweet]['100']
        for x in range(len(replies)):
            replies[x] = preprocess(replies[x])

        whole_doc = ' '.join(replies)
        all_words = whole_doc.split(' ')
        # Keep only alphabetic characters and lowercase each unique word
        all_words_set = [
            ''.join(char for char in word if char.isalpha()).lower()
            for word in set(all_words)
        ]
        freq = {}
        for word in all_words_set:
            if word not in STOPWORDS:
                freq[word] = all_words.count(word)
        # Sort (word, count) pairs by ascending frequency
        sorted_x = sorted(freq.items(), key=lambda kv: kv[1])
        tweet_freq[tweet] = sorted_x
    return tweet_freq
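The `all_words.count(word)` lookup above rescans the full token list for every unique word, which is quadratic in the number of tokens. A minimal alternative sketch using `collections.Counter` from the standard library; `preprocess`, STOPWORDS and the reply buckets are taken from the example above, and note it counts the cleaned tokens directly, which differs slightly from the original's lookup of cleaned words in the raw list:

from collections import Counter

def create_word_frequency_fast(data):
    tweet_freq = {}
    for tweet in data:
        replies = data[tweet]['0'] + data[tweet]['50'] + data[tweet]['100']
        words = ' '.join(preprocess(r) for r in replies).split()
        # Strip non-alphabetic characters and lowercase each token
        cleaned = (''.join(c for c in w if c.isalpha()).lower() for w in words)
        # Counter builds the frequency table in a single pass
        counts = Counter(w for w in cleaned if w and w not in STOPWORDS)
        # Same output shape: (word, count) pairs sorted by ascending frequency
        tweet_freq[tweet] = sorted(counts.items(), key=lambda kv: kv[1])
    return tweet_freq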
Example #2
    def generate(self):
        # Merge stopwords from the user-supplied file with the built-in Japanese list
        self.ignored_words = set()
        stopwords_from_file = self.stopwords_file.read()
        for word in stopwords_from_file.split():
            self.ignored_words.add(word)
        self.stopwords_file.close()
        self.ignored_words = set.union(many_stop_words.get_stop_words("ja"),
                                       self.ignored_words)
        longstring = ""
        if self.mask_img:
            mask = np.array(self.mask_img)
        else:
            mask = None
        amount_scs = 0
        # Tokenize each superchat message with MeCab (wakati = space-separated output)
        for superchat in self.sc_log:
            if superchat["message"]:
                amount_scs += 1
                if '_' not in superchat["message"]:
                    mecabbed = do_mecab(superchat["message"], '-Owakati')
                    longstring += " " + mecabbed
        print("generating wordcloud from %d messages" % amount_scs)
        STOPWORDS.update(self.ignored_words)
        wordcloud = WordCloud(font_path=self.font,
                              collocations=False,
                              background_color="white",
                              width=1280,
                              height=720,
                              mask=mask).generate(longstring)
        if isinstance(self.logpath, Path):
            dest_image = self.target_dir + self.logpath.stem + "-wordcloud.png"
        else:
            dest_image = self.target_dir + self.logpath + "-wordcloud.png"
        wordcloud.to_file(dest_image)
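WordCloud's default tokenizer splits on the regex \w[\w']+ and cannot segment Japanese, which is why generate() above is fed MeCab's wakati (space-separated) output rather than the raw messages. A minimal sketch of that tokenization step; the import path for do_mecab is an assumption based on the call used above:

from mecab import do_mecab  # assumed import for the do_mecab helper used above

message = "今日の配信はとても楽しかったです"
# '-Owakati' makes MeCab emit the tokens separated by spaces,
# which is exactly the whitespace-delimited form WordCloud can count.
tokens = do_mecab(message, '-Owakati')
print(tokens)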
Example #3
def create_word_cloud(string, picName):
    maskArray = npy.array(Image.open("cloud.jpg"))
    STOPWORDS.update({"management", "knowledge", "environment", "system"})
    cloud = WordCloud(background_color="black",
                      max_words=200,
                      mask=maskArray,
                      stopwords=STOPWORDS)
    cloud.generate(string)
    cloud.to_file("%s.jpg" % (picName))
Example #4
def word_cloud(newsag, cloud_title):
    """Word cloud generating"""

    STOPWORDS.update({
        'в', 'на', 'за', 'c', 'к', 'до', 'для', 'что', 'кроме',
        'того', 'также', 'по', 'он', 'она', 'него', 'ему', 'им', 'о', 'об',
        'его', 'так', 'все', 'только', 'как', 'сейчас', 'мы', 'был', 'надо',
        'когда', 'это', 'будет', 'было', 'рф', 'россия', 'из', 'от', 'россии',
        'за', 'нет', 'если', 'чтобы', 'сказал', 'меня', 'их', 'уже', 'или',
        'после', 'лет', 'были', 'который', 'где', 'при', 'вы', 'согласно',
        'того', 'этого', 'всегда', 'однако', 'есть', 'очень', 'может', 'меня',
        'мне', 'не', 'ее', 'раз', 'да', 'вот', 'но', 'этом', 'кроме',
        'которая', 'была', 'чем', 'там', 'еще', 'тогда', 'они', 'бы', 'же',
        'год', 'году', 'года', 'годом', 'годе', 'ранее', 'потом', 'даже',
        'которые', 'то', 'пока', 'через', 'со', 'ну', 'ли', 'более', 'можно',
        'всего', 'тем', 'себя', 'из', 'из-за', 'во', 'будут', 'поэтому',
        'перед', 'том', 'теперь', 'этот', 'кто', 'быть', 'у', 'нас',
        'понедельник', 'вторник', 'среда', 'четверг', 'пятница', 'суббота',
        'воскресенье', 'январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
        'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь', 'хотя',
        'этого', 'эти', 'этой', 'этих', 'этим', 'ни', 'весь', 'тот', 'свой',
        'такой', 'какой', 'ещё', 'еще', 'один', 'два', 'человек', 'изз',
        'изза', 'наш', 'наши', 'нашим', 'нашему', 'нашими', 'нами', 'нам',
        'мой', 'моему', 'время', 'страна', 'день', 'слово', 'работа',
        'другой', 'ТАСС', 'сообщил', 'отметил', 'дать', 'быть', 'тысяч',
        'тысяча', 'заявил', 'ситуация', 'должный', 'гражданин', 'россиянин',
        'мая', 'тоже', 'без', 'тыс'
    })
    stopwords = STOPWORDS

    wordcloud = WordCloud(max_font_size=60,
                          background_color='white',
                          max_words=250,
                          width=250,
                          stopwords=stopwords,
                          height=150,
                          scale=10).generate(newsag)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout()
    plt.title(cloud_title,
              fontdict={
                  'fontsize': 18,
                  'fontweight': 'semibold'
              },
              loc='center',
              pad=20)
    return wordcloud
Example #5
    def run(self):

        with self.step("Grouping texts"):

            table = self.parameters["user_prefix"] + self.task_parameters["table"]

            df = pd.DataFrame(
                self.default_db.read_data(f"SELECT * FROM {table}"))
            full_text = " ".join(article for article in df.summary)

            sources = df.groupby("source")
            grouped_texts = sources.summary.sum()

        with self.step("Generating clouds"):

            STOPWORDS.update(self.parameters["stopwords"])  # update() returns None
            stopwords = STOPWORDS
            self.info("Generating bbc_wordcloud.png")
            self.word_cloud("bbc",
                            full_text,
                            stopwords,
                            b_colour="white",
                            c_colour="black")

            # Source specific wordclouds

            for group, text in zip(grouped_texts.keys(), grouped_texts):
                self.info(f"Generating {group}_wordcloud.png")
                self.word_cloud(group, text, stopwords)

        return self.success()
Example #6
def plot_wordcloud(data, sentiment):
    combined_text = " ".join([review for review in data['train'][sentiment]])

    STOPWORDS.update(['br', 'film', 'movie'])  # update() returns None; pass the set itself
    wc = WordCloud(background_color='white',
                   max_words=50,
                   stopwords=STOPWORDS)

    plt.imshow(wc.generate(combined_text))
    plt.axis('off')
    plt.show()
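A recurring pitfall in these examples: set.update() mutates the set in place and returns None, so stopwords=STOPWORDS.update([...]) actually hands None to WordCloud. The constructor then falls back to its default stopword set, which happens to be the same (freshly mutated) STOPWORDS global, so the snippets work only by accident. A small sketch of the explicit idiom that avoids both the None and the mutation of the shared global:

from wordcloud import WordCloud, STOPWORDS

custom_stopwords = set(STOPWORDS)        # copy, so the shared global stays untouched
custom_stopwords.update(['br', 'film'])  # update() returns None; keep the set itself

wc = WordCloud(background_color='white',
               max_words=50,
               stopwords=custom_stopwords).generate("film br some text worth clouding")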
Example #7
def create_word_cloud(data, sentiment):
    #combine all reviews
    combined_text = " ".join([review for review in data['train'][sentiment]])
    #produce word cloud minus common stop words
    STOPWORDS.update(['br', 'film', 'movie'])  # update() returns None; pass the set itself
    wc = WordCloud(background_color='white',
                   max_words=50,
                   stopwords=STOPWORDS)
    generated_wc = wc.generate(combined_text)
    #plt.imshow(generated_wc)
    #plt.axis('off')
    #plt.show()
    generated_wc.to_file("./img/test-{}.png".format(sentiment))
Example #8
def word_c(author_v):
    tweets_hm3 = tweets_hm[tweets_hm['author_name'] == author_v][['message']]
    # Join the messages themselves; str(Series) would include the index and dtype
    text = ' '.join(tweets_hm3['message'].astype(str))
    STOPWORDS.update(['u00e7', 'u00f5es', 'u00e7o'])  # update() returns None
    wordcloud = WordCloud(
        width=1000,
        height=450,
        background_color='white',
        stopwords=STOPWORDS).generate(text)
    wordcloud.to_file('wc_author.png')
    test_png = 'wc_author.png'
    test_base64 = base64.b64encode(open(test_png, 'rb').read()).decode('ascii')
    return 'data:image/png;base64,{}'.format(test_base64)
Example #9
def word_c2(year_v):
    text = tweets[['message', 'created_at']].copy()  # copy to avoid SettingWithCopyWarning
    text['created_at'] = pd.to_datetime(text['created_at'], utc=True)
    text = text[text['created_at'].dt.year == year_v]['message']
    STOPWORDS.update(['u00e7', 'u00f5es', 'u00e7o'])  # update() returns None
    wordcloud = WordCloud(
        width=1000,
        height=450,
        background_color='white',
        stopwords=STOPWORDS).generate(' '.join(text.astype(str)))
    wordcloud.to_file('wc_overall.png')
    test_png = 'wc_overall.png'
    test_base64 = base64.b64encode(open(test_png, 'rb').read()).decode('ascii')
    return 'data:image/png;base64,{}'.format(test_base64)
Example #10
    def generate(self):
        conn = psycopg2.connect(dbname=self.pgsql_creds["database"],
                                user=self.pgsql_creds["username"],
                                host=self.pgsql_creds["host"],
                                password=self.pgsql_creds["password"])
        cur = conn.cursor()
        cur.execute("SELECT message_txt FROM messages WHERE video_id = %s;",
                    (self.video_id, ))
        results = cur.fetchall()
        conn.close()
        # Merge stopwords from the user-supplied file with the built-in Japanese list
        self.ignored_words = set()
        stopwords_from_file = self.stopwords_file.read()
        for word in stopwords_from_file.split():
            self.ignored_words.add(word)
        self.stopwords_file.close()
        self.ignored_words = set.union(many_stop_words.get_stop_words("ja"),
                                       self.ignored_words)
        longstring = ""
        if self.mask_img:
            mask = np.array(self.mask_img)
        else:
            mask = None
        amount_scs = 0
        # Tokenize each superchat message with MeCab before counting
        for superchat in results:
            if superchat[0]:
                amount_scs += 1
                if '_' not in superchat[0]:
                    mecabbed = do_mecab(superchat[0], '-Owakati')
                    longstring += " " + mecabbed
        print("generating wordcloud from %d messages" % amount_scs)
        STOPWORDS.update(self.ignored_words)
        wordcloud = WordCloud(font_path=self.font,
                              collocations=False,
                              background_color="white",
                              width=1280,
                              height=720,
                              mask=mask).generate(longstring)
        dest_image = self.target_dir + self.video_id + "-wordcloud.png"
        wordcloud.to_file(dest_image)
Example #11
def custom_wordcloud_draw(text, color='white'):
    """
    Plots a wordcloud of the string `text` after removing stopwords
    """
    cleaned_word = " ".join(text.split())  # collapse runs of whitespace
    STOPWORDS.update(['using', 'based', 'analysis', 'study', 'research', 'viruses'])
    wordcloud = WordCloud(stopwords=STOPWORDS,  # update() returns None, so update first
                          background_color=color,
                          width=1000,
                          height=1000).generate(cleaned_word)
    plt.figure(1, figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
Example #12
def generate_wordCloud(textPath, wordCloudPath, ice_mask):

    STOPWORDS.update([
        '中奖率', '录取', '通知书', '吧', '我', '热词', '系列', '热词系列',
        ' ', '增加', '拉低', '怎么', '这个', '回复', '哈哈', '的',
        '在', '抽个', '是', '啊', '这', '吗', '啦', '她', '小',
        '你', '了', '来', '抽', '知识', '知识增加', '好家伙', '拉低中奖率',
        '打', '抽我', '妙啊', '滑稽', '呆'
    ])  # update() returns None, so pass STOPWORDS itself below
    ice_cloud = WordCloud(background_color='white',
                          width=1366,
                          height=768,
                          mask=ice_mask,
                          stopwords=STOPWORDS,
                          font_path=r'C:\Windows\Fonts\simfang.ttf',  # raw string for backslashes
                          max_words=250)

    with open(textPath, 'r', encoding='utf-8') as f:
        words = f.read()

    ice_cloud.generate(words)
    ice_cloud.to_file(wordCloudPath)
Example #13
import numpy as np
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

rng = np.random.RandomState(34)


def color_fn(*args, **kwargs):
    r = rng.randint(16, 240)
    g = rng.randint(r // 4 + 1)
    b = 0
    return (r, g, b)


STOPWORDS.update(["acm", "e", "g"])

wc = WordCloud(
    font_path="./assets/fonts/Roboto-Regular.ttf",
    width=500,
    height=240,
    margin=2,
    background_color="white",
    color_func=color_fn,
    max_words=100,
    stopwords=STOPWORDS,
    min_font_size=4,
    max_font_size=40,
    random_state=34,
).generate(text.strip().lower())  # `text` is assumed to be defined earlier in the source script
Example #14
File: ucd.py  Project: fdft/wordCloudArt
    #return "hsl(%d, %d%%, %d%%)" % (H, S, B)
    if rv == 0:
	return "hsl(179, 96%, 55%)"
    elif rv == 1:
        return "hsl(187, 99%, 56%)"
    elif rv == 2 :
        return "hsl(174, 100%, 66%)"
    elif rv == 3:
        return "hsl(176, 99%, 76%)"
    else :
        return "hsl(63, 22%, 95%)"
    #return "hsl(%d, 80%%, 50%%)" % random_state.randint(0, 255)
    

wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS.update(wordToRemove), color_func=maria_color_fun,
               min_font_size=8)

print("going to generate the cloud")
# generate word cloud
wc.generate(text)
print(wc.max_words)
# store to file
wc.to_file(path.join(d, OUTPUTFILE))

# show
plt.imshow(wc)
#plt.axis("off")
#plt.figure()
#plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
Example #15
STOPWORDS.update([  # update() returns None, so `stopwords` is assigned after the list
    'audience',
    'fail',
    'site',
    "about",
    "above",
    "across",
    "after",
    "again",
    "against",
    "aimed",
    "all",
    "almost",
    "alone",
    "along",
    "already",
    "also",
    "although",
    "always",
    "among",
    "an",
    "and",
    "another",
    "any",
    "anybody",
    "anyone",
    "anything",
    "anywhere",
    "are",
    "area",
    "areas",
    "around",
    "as",
    "ask",
    "asked",
    "asking",
    "asks",
    "at",
    "away",
    "back",
    "backed",
    "backing",
    "backs",
    "be",
    "became",
    "because",
    "become",
    "becomes",
    "been",
    "before",
    "began",
    "behind",
    "being",
    "beings",
    "best",
    "better",
    "between",
    "big",
    "both",
    "br",
    "but",
    "by",
    "came",
    "can",
    "cannot",
    "case",
    "cases",
    "certain",
    "certainly",
    "clear",
    "clearly",
    "click",
    "column",
    "come",
    "could",
    "data-frame",
    "data-frames",
    "data",
    "database",
    "databases",
    "dataframe",
    "dataframes",
    "dataset",
    "datasets",
    "df",
    "did",
    "differ",
    "different",
    "differently",
    "div",
    "do",
    "docs",
    "does",
    "done",
    "down",
    "down",
    "downed",
    "downing",
    "downs",
    "during",
    "each",
    "early",
    "either",
    "end",
    "ended",
    "ending",
    "ends",
    "enjoy",
    "enough",
    "env",
    "even",
    "evenly",
    "ever",
    "every",
    "everybody",
    "everyone",
    "everything",
    "everywhere",
    "face",
    "faces",
    "fact",
    "facts",
    "far",
    "felt",
    "few",
    "find",
    "finds",
    "first",
    "for",
    "four",
    "from",
    "full",
    "fully",
    "further",
    "furthered",
    "furthering",
    "furthers",
    "gave",
    "general",
    "generally",
    "get",
    "gets",
    "githubusercontent",
    "give",
    "given",
    "gives",
    "go",
    "going",
    "good",
    "goods",
    "got",
    "great",
    "greater",
    "greatest",
    "group",
    "grouped",
    "grouping",
    "groups",
    "had",
    "has",
    "have",
    "having",
    "he",
    "help",
    "her",
    "here",
    "herself",
    "high",
    "high",
    "high",
    "higher",
    "highest",
    "him",
    "himself",
    "his",
    "hope",
    "how",
    "however",
    "html",
    "http",
    "http",
    "https",
    "https",
    "if",
    "image",
    "images",
    "import",
    "important",
    "in",
    "interest",
    "interested",
    "interesting",
    "interests",
    "into",
    "is",
    "it",
    "item",
    "its",
    "itself",
    "jpg",
    "just",
    "keep",
    "keeps",
    "kind",
    "knew",
    "know",
    "known",
    "knows",
    "large",
    "largely",
    "last",
    "later",
    "latest",
    "least",
    "leave",
    "less",
    "let",
    "lets",
    "like",
    "likely",
    "link",
    "log",
    "long",
    "longer",
    "longest",
    "made",
    "make",
    "making",
    "man",
    "many",
    "may",
    "me",
    "member",
    "members",
    "men",
    "might",
    "more",
    "most",
    "mostly",
    "mr",
    "mrs",
    "much",
    "must",
    "my",
    "myself",
    "n",
    "necessary",
    "need",
    "needed",
    "needing",
    "needs",
    "never",
    "new",
    "new",
    "newer",
    "newest",
    "next",
    "no",
    "nobody",
    "non",
    "noone",
    "not",
    "nothing",
    "now",
    "nowhere",
    "null",
    "number",
    "numbers",
    "of",
    "off",
    "often",
    "old",
    "older",
    "oldest",
    "on",
    "once",
    "one",
    "only",
    "open",
    "opened",
    "opening",
    "opens",
    "or",
    "order",
    "ordered",
    "ordering",
    "orders",
    "org",
    "other",
    "others",
    "our",
    "out",
    "over",
    "part",
    "parted",
    "parting",
    "parts",
    "pd",
    "per",
    "perhaps",
    "place",
    "places",
    "please",
    "png",
    "point",
    "pointed",
    "pointing",
    "points",
    "possible",
    "post",
    "present",
    "presented",
    "presenting",
    "presents",
    "print",
    "problem",
    "problems",
    "PS",
    "put",
    "puts",
    "quite",
    "rather",
    "really",
    "refdef",
    "reference",
    "references",
    "review",
    "right",
    "right",
    "room",
    "rooms",
    "said",
    "same",
    "saw",
    "say",
    "says",
    "second",
    "seconds",
    "see",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "sees",
    "self",
    "several",
    "shall",
    "she",
    "should",
    "show",
    "showed",
    "showing",
    "shows",
    "side",
    "sides",
    "sign",
    "signed",
    "since",
    "small",
    "smaller",
    "smallest",
    "so",
    "some",
    "somebody",
    "someone",
    "something",
    "somewhere",
    "state",
    "states",
    "still",
    "still",
    "story",
    "style",
    "such",
    "sure",
    "table",
    "take",
    "taken",
    "td",
    "than",
    "that",
    "the",
    "their",
    "them",
    "then",
    "there",
    "Thereafter",
    "therefore",
    "these",
    "they",
    "thing",
    "things",
    "think",
    "thinks",
    "this",
    "those",
    "though",
    "thought",
    "thoughts",
    "three",
    "through",
    "thus",
    "time",
    "to",
    "today",
    "together",
    "too",
    "took",
    "toward",
    "tr",
    "turn",
    "turned",
    "turning",
    "turns",
    "two",
    "under",
    "until",
    "up",
    "upon",
    "us",
    "use",
    "used",
    "user-images",
    "user",
    "users",
    "uses",
    "using",
    "very",
    "video",
    "w",
    "walk"
    "want",
    "wanted",
    "wanting",
    "wants",
    "was",
    "way",
    "ways",
    "we",
    "well",
    "wells",
    "went",
    "were",
    "what",
    "when",
    "where",
    "whether",
    "which",
    "while",
    "who",
    "whole",
    "whose",
    "why",
    "will",
    "wine",
    "with",
    "within",
    "without",
    "work",
    "worked",
    "working",
    "works",
    "would",
    "yaml",
    "year",
    "years",
    "yet",
    "you",
    "young",
    "younger",
    "youngest",
    "your",
    "yours",
])
stopwords = STOPWORDS
Example #16
import colorsys
import os
import random
import urllib.request

import matplotlib.pyplot as plt
from matplotlib import cm
from wordcloud import WordCloud, STOPWORDS


def cloudyscience(idnames):

    # colormap version
    def cmap_color_func(word, font_size, position, orientation,
                        random_state=None, alph=50, **kwargs):
        # WordCloud passes its arguments by keyword, so the original positional
        # `alph` parameter is kept as a keyword with an assumed default lightness
        colr = cm.jet(random.randint(0, 255))
        colr2 = colorsys.rgb_to_hsv(colr[0], colr[1], colr[2])
        sat = "70%"
        var = "%s" % alph + "%"
        return "hsl(%d, %s, %s)" % (int(float(colr2[0]) * 255), sat, var)

    stp = ['1','2','3','4','5','6','7','8','9','0','10','et','al',
           'a','b','c','d','e','f','g','h','i','j',
           'k','l','m','n','o','p','q','r','s','t',
           'u','v','w','x','y','z']

    # Shuffle randomly
    random.shuffle(idnames)
    papernum = len(idnames)

    print('PROCESSING PAPERS...')

    for idname in idnames:

        print(idname)

        # Make temp dir
        os.system('rm -rf temp')
        os.system('mkdir temp')

        ######## Wordle post ########

        pdfpath = 'http://arxiv.org/pdf/' + idname
        print("Downloading paper...")
        with open('temp/test.pdf', 'wb') as out:  # PDFs are binary
            out.write(urllib.request.urlopen(pdfpath).read())

        # Convert the PDF to text
        print("Processing...")
        os.system('pdftotext temp/test.pdf > temp/test.txt')

        # Turn into a Wordle
        text = open('temp/test.txt').read()
        outname = '%s_wordle.png' % idname

        # Initialise wordcloud; update() returns None, so update first
        STOPWORDS.update(stp)
        wc = WordCloud(stopwords=STOPWORDS, width=500, height=500,
                       prefer_horizontal=0.95, scale=1,
                       color_func=cmap_color_func, font_path='DensiaSans.otf')

        # Check text
        if len(text) > 0:
            wc.generate(text)
        else:
            print('PDF text corrupted... skipping this paper')
            continue

        plt.imshow(wc)
        plt.axis("off")
        wc.to_file(outname)

    return
Example #17
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import os
import numpy as np

d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()

with open("wordcloud-beispieltext.txt") as f:
    text = f.read()


#text = 'Python Kurs: mit Python programmieren lernen für Anfänger und Fortgeschrittene Dieses Python Tutorial entsteht im Rahmen von Uni-Kursen und kann hier kostenlos genutzt werden. Python ist eine für Anfänger und Einsteiger sehr gut geeignete Programmiersprache, die später auch den Fortgeschrittenen und Profis alles bietet, was man sich beim Programmieren wünscht. Der Kurs ist eine Einführung und bietet einen guten Einstieg. Es wird aktuelles Wissen vermittelt - daher schreiben wir unseren Python-Code mit der aktuellen Python-Version 3. einfach Python lernen über das Programmieren von Spielen Damit Python programmieren lernen noch mehr Spaß macht, werden wir im Kurs anhand verschiedener Spiele die Anwendung von Python kennen lernen und unser Wissen als Programmierer aufbauen. Die Grundlagen werden direkt umgesetzt in bekannte Spiele wie:'

# Generate a word cloud image
uninteressant = "und von Der das denn wir ist die auf im"
liste_der_unerwuenschten_woerter = uninteressant.split()
STOPWORDS.update(liste_der_unerwuenschten_woerter)

x, y = np.ogrid[:1000, :1000]
mask = (x - 500) **2 + (y - 500) **2 > 400 ** 2
mask = 255 * mask.astype(int)
wordcloud = WordCloud(width=1920, height=1080, mask=mask, max_words=2000).generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
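For reference: WordCloud treats pure-white (255) mask cells as masked out and draws only on the rest, so the np.ogrid expression above leaves a drawable disc of radius 400 in a 1000x1000 grid. A minimal sketch of the same convention with a rectangular window (shape and filename are illustrative):

import numpy as np
from wordcloud import WordCloud

# 255 (white) cells are masked out; 0 cells remain drawable.
frame = np.full((400, 800), 255, dtype=int)
frame[50:-50, 50:-50] = 0  # open a drawable window in the middle

wc = WordCloud(mask=frame, background_color="white")
wc.generate("words land only inside the inner rectangle " * 20)
wc.to_file("frame-cloud.png")  # illustrative output path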
Example #18
import pandas as pd
from tqdm import tqdm
import numpy as np
import collections, glob,itertools
from wordcloud import WordCloud, STOPWORDS

import seaborn as sns
import pylab as plt

new_words=["amp","nt","re","will","I"]
STOPWORDS.update(new_words)

new_words="U de e di en che la att il u de la die en que le les un via et der m und des pas je du y na ve "
#STOPWORDS.update(new_words.split())

F_CSV = glob.glob("results/qtext_*")
DF = {}
for f in sorted(F_CSV):
    print(f)
    df = pd.read_csv(f).set_index('word')
    # errors='ignore' keeps drop() from raising on stopwords absent from the index
    df = df.drop(STOPWORDS, errors='ignore')
    df.n /= df.n.sum() * 1.0


    #words = (df.delta*100000).astype(int)
    #words = words[words>2]

    DF[f] = df
    #C = collections.Counter(dict(zip(df.index,df.n)))
    #DF[f] = C
Example #19
# Import of relevant packages
import csv
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd

# Load the *data*.csv for the WordCloud
data = pd.read_csv(r"C:\Users\Paul\Desktop\Code\WordCloud_Generation\kicker_bl_li_tweets.csv")

# Load a List of Stopwords for filtering
Ignore = open(r"C:\Users\Paul\Desktop\Code\WordCloud_Generation\Stopwords.txt",'r').read().split()

#Set and update Stopwords
STOPWORDS.update(Ignore)
stopwords = set(STOPWORDS)

# Generate a WordCloud image with a set of customizations
wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=5000,
    width=1920,
    height=1080,
    max_font_size=1000
).generate(str(data))  # note: str(data) stringifies the DataFrame repr, index included
#Plotter

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
Example #20
def create_wordcloud(yt_url):
    print("NOT HERE")
    import os
    from os import path
    import json
    import googleapiclient.discovery
    from wordcloud import WordCloud, STOPWORDS
    import re

    import numpy as np
    # Python code for youtube.commentThreads.list
    # See instructions for running these code samples locally:
    # https://developers.google.com/explorer-help/guides/code_samples#python

    VIDEO_ID = yt_url

    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    # os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    DEVELOPER_KEY = os.environ.get('DEVELOPER_KEY')
    # VIDEO_ID = "nMwbe45OiIg"

    youtube = googleapiclient.discovery.build(api_service_name,
                                              api_version,
                                              cache_discovery=False,
                                              developerKey=DEVELOPER_KEY)

    # Set the current directory
    d = os.path.dirname(__file__)

    # Variable to hold the total number of comments
    total_comments = 0

    with open(path.join(d, 'wordcloud_text.txt'), "w") as write_file:

        request = youtube.commentThreads().list(
            part="snippet,replies",
            maxResults=100,
            videoId=VIDEO_ID,
        )
        response = request.execute()

        # Write the first set of 100 comments to the textFile
        items = response["items"]

        for item in items:
            this_item = item["snippet"]
            write_file.write(
                this_item["topLevelComment"]["snippet"]["textDisplay"])
            total_comments += 1

        # Retrieve all the rest of the pages
        if "nextPageToken" in response:
            nextPageToken = response["nextPageToken"]
            another_page = True
        else:
            another_page = False

        while another_page:  #nextPageToken:
            request = youtube.commentThreads().list(part="snippet,replies",
                                                    maxResults=100,
                                                    videoId=VIDEO_ID,
                                                    pageToken=nextPageToken)
            response = request.execute()

            items = response["items"]

            # Write the rest of 100 comments to the textFile
            for item in items:
                this_item = item["snippet"]
                write_file.write(
                    this_item["topLevelComment"]["snippet"]["textDisplay"])
                total_comments += 1

            # if total_comments >= 4000:
            #     another_page = False

            if "nextPageToken" in response:
                nextPageToken = response["nextPageToken"]
            else:
                another_page = False

    # I don't know why, but sometimes the VIDEO_ID gets added, so add VIDEO_ID to STOPWORDS
    if '-' in VIDEO_ID:
        #parse string so that you get both halves of vid id so can add each one to exclude list
        both_halves = VIDEO_ID.split('-')
        STOPWORDS.update(both_halves)

    # Only process comments if there ARE comments on this video
    if len(items) > 0:
        # This will be the built in list of words that will not be included in the wordcloud
        STOPWORDS.update([
            'quot', 'amp', 'video', 'search_query', 'https', 'br', 'href',
            'watch', 'youtube', VIDEO_ID
        ])
        stopwords = STOPWORDS
        text = open(path.join(d, 'wordcloud_text.txt')).read()
        wordcloud = WordCloud(background_color='white',
                              width=800,
                              height=600,
                              stopwords=stopwords).generate(text)

        # Display the generated image:
        # the matplotlib way:
        import matplotlib.pyplot as plt
        import io
        import urllib, base64
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        # plt.savefig(path.join(d, 'wordcloud_pic.png'))
        # hello = plt.show()

        # New
        imgdata = io.BytesIO()
        plt.savefig(imgdata, format='png')
        imgdata.seek(0)  # rewind the data
        string = base64.b64encode(imgdata.read())

        uri = 'data:image/png;base64,' + urllib.parse.quote(string)
        return uri
    else:
        return "noComments"  # If the video has 0 comments than it will return this. So that we can display an error to the user
Example #21
#!/usr/bin/python3.7
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import tweepy
from keytwitter import *

text = ''
auth = tweepy.OAuthHandler(key, secret)
api = tweepy.API(auth)

tweeds = api.user_timeline(screen_name='realDonaldTrump', count=100, include_rts = False, tweet_mode = 'extended')
for tweed in tweeds:
    text = text +' '+ tweed.full_text

wordcloud = WordCloud(width=1920, height=1200)
# Pass a list: update('https', 'co') would add the individual characters instead
STOPWORDS.update(['https', 'co'])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
Example #22
# walk the news links, download each article's text, and collect the words in `text`
text = []
for ind in df.index:
    article = Article(df['link'][ind], config=config)
    try:
        article.download()
        article.parse()
        article.nlp()
        sentence = article.text.lower()
        match_pattern = re.findall(r'\b[a-z]{3,15}\b',
                                   sentence)  # take only plain words from the text
        text.extend(match_pattern)
    except Exception:  # some links return 403; skip them
        continue

# join the resulting list of words into a single string
text = ' '.join(text)

# Create and generate a word cloud image:
other = {'russia', 'russian', 'one', 'two'}
STOPWORDS.update(other)
wordcloud = WordCloud(max_font_size=50,
                      max_words=20,
                      stopwords=STOPWORDS,
                      background_color="white").generate(text)
wordcloud.to_file("first_review.png")
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Example #23
STOPWORDS.update([
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "her",
    "hers",
    "herself",
    "it",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "should",
    "now",
])
Example #24
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 19 10:30:19 2016

@author: Tom
"""
from wordcloud import WordCloud, STOPWORDS
STOPWORDS.update([
    'deleted', 'thing', 'still', 'things', 'lot', 'gt', 'reddit', 'really',
    'something', 'https', 'also', 'many', 'even', 'much', 'will'
])
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')
ORANGERED = '#ff4500'
PERIWINKLE = '#5f99cf'


def word_cloud(df):
    text = ' '.join(df.body.tolist())
    wordcloud = WordCloud(max_font_size=40,
                          stopwords=STOPWORDS,
                          relative_scaling=.5).generate(text)
    plt.figure(figsize=[12, 8])
    plt.imshow(wordcloud)
    plt.axis("off")
    #print(wordcloud.words_)


def subreddit_bar(df):
Example #25
import tweepy
import wordcloud
from Credentialstwitter import *
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)
text = " "

tweets = api.user_timeline(screen_name="maiconkusterkkk",
                           count=1000,
                           include_rts=False,
                           tweet_mode="extended")
for tweet in tweets:
    #print(tweet.full_text)
    text = text + " " + tweet.full_text

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.update(["hppt", "https", "co", "da","de","em","na","se","às","como","que", "para", "os", "dos", "das", "assim", "quais","feira","um", "uma", "mais", "ao", "por","pelo","pela",\
    "como", "nosso", "nossa", "zu", "das", "zu","die","der","dem","und","auf","ein","nicht","von","wie","wird", "daß", "dass","mit","für", "Sie","sie","er","noch","vor","ist", "bei",\
    "wenn", "sich", "den", "hat", "des", "diese", "diesen", "dieses", "dieser", "über", "eine", "einer", "einen", "eines", "auch", "es", "werden", "auch", "im", "als", "uns", "sehr",\
    "aber", "einem", "zur", "nun", "mehr", "zum", "durch", "sind", "kann", "man", "aus", "nur", "haben", "will", "é" ])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Example #26
from typing import List, Set, Iterable, Dict, Pattern
from arabic_reshaper import arabic_reshaper
from bidi.algorithm import get_display
from wordcloud import WordCloud, STOPWORDS
from os.path import dirname, join
from os import environ
from hazm import Normalizer, word_tokenize
import re
from sys import version

from wordcloud.tokenization import unigrams_and_bigrams, process_tokens

FILE = dirname(__file__)
STOPWORDS.update(
    map(str.strip,
        open(join(FILE, 'stopwords'), encoding="utf8").readlines()))
FONT_PATH = environ.get('FONT_PATH', join(FILE, 'Fonts', 'font.ttf'))


class WordCloudFa(WordCloud):
    """
    This is a wrapper around WordCloud module for working with Farsi and Arabic words plus English words.
    For reading about parameters you can read `WordCloud` documents at:
    https://github.com/amueller/word_cloud/blob/d36f526e3d8346e6d7a2656631f05f68e402517d/wordcloud/wordcloud.py#L150
    There are two additional parameters in this class those are not in the WordCloud published module:
    :param include_numbers if be True, all English, Persian and Arabic numbers will exclude from conting and showing
    :param persian_normalize if be True, all words will normalize using `hazm` normalizer. for more info see:
    https://github.com/sobhe/hazm
    If you don't pass stopwords, default stopwords in the `stopwords` file will consider.
    """
Example #27
        words = song.lower().split()
        corpus.append(words)
    return corpus


def wordCloud(corpus):
    flatten = sum(corpus, [])
    wordsString = " ".join(flatten)
    wordcloud = WordCloud(max_words=2000).generate(wordsString)
    # g=wordcloud.to_image()
    # g.show()
    return wordcloud


# Ed-sheeran wordcloud
STOPWORDS.update(('oh', 'na na', 'la la', 'la', 'oh oh', 'well'))

df1 = createDataFrame("ed-sheeran.json")
corpus1 = createCorpus(df1)
wc1 = wordCloud(corpus1)

# Taylor Swift wordcloud
df2 = createDataFrame("taylor-swift.json")
corpus2 = createCorpus(df2)
wc2 = wordCloud(corpus2)

df3 = df1.append(df2)  # note: DataFrame.append was removed in pandas 2.0; pd.concat([df1, df2]) is the replacement
text_corpus = createCorpus(df3)

num_features = 150
Example #28
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

with open("Teil_05_Alice_in_wonderland.txt") as f:
    text = f.read()

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.update(['said', 'illustration'])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
Example #29
#!/usr/bin/env python3

import argparse
import pathlib
import random

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

STOPWORDS.update(["self", 'item', 'refdef', 'http', 'https', 'null'])
stopwords = STOPWORDS  # update() returns None, so assign separately

# def random_color_func(
#     word, font_size, position, orientation, random_state=None, **kwargs
# ):
#     h = int(360.0 * 143.0 / 255.0)
#     s = int(77.0 * 255.0 / 255.0)
#     l = int(100.0 * float(random.randint(44, 100)) / 255.0)
#     return "hsl({}, {}%, {}%)".format(h, s, l)


def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--filename",
                        "-f",
                        dest="filename",
                        required=True,
                        help="Markdown/text file")
    parser.add_argument(
        "--save_dir",
        "-s",
Example #30
from os import path
from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, '5-comp.windows.x.txt')).read()

# Generate a word cloud image
new_stopwords = {'one', 'well', 'don', 'will', 'also'}
STOPWORDS.update(new_stopwords)
wordcloud = WordCloud(max_font_size=40).generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size
# wordcloud = WordCloud(max_font_size=40).generate(text)
# plt.figure()
# plt.imshow(wordcloud, interpolation="bilinear")
# plt.axis("off")
plt.show()
url ="https://www.anchour.com/portfolio/logofolio/"
headers = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.3'}

req = urllib.request.Request(url, headers=headers)
data = urllib.request.urlopen(req).read()

soup = BeautifulSoup(data, "html.parser")

main_content = soup.find("main", attrs= {"class" : "main-content"})
lists = main_content.find_all("section")

str = ""
for list in lists:
    info= list.text
    str+=info

STOPWORDS.update(["see","common"])

mask = np.array(Image.open("./cloud-icon.png"))

color = ImageColorGenerator(mask)

wordcloud = WordCloud(width=2200, height=2000, max_words=100, mask=mask,
                      stopwords=STOPWORDS, background_color="white",
                      random_state=42).generate(page_text)

plt.imshow(wordcloud.recolor(color_func=color),interpolation="bilinear")
plt.axis("off")
plt.show()

wordcloud.to_file("./wordcloud.png")
Example #32
import tweepy
from Twitter_Credentials import *
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)
text = ''

tweeds = api.user_timeline(screen_name='realDonaldTrump',
                           count=100, include_rts=False, tweet_mode='extended')
for tweed in tweeds:
    text = text + ' ' + tweed.full_text

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.update(['https', 'co'])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
Example #33
coff_rev = [i for i in test
            if any(w in i[u'reviewText'].lower() for w in wanted)]
coff_rev_ex = [i for i in coff_rev
               if not any(w in i[u'reviewText'].lower() for w in unwanted)]
coff_rev_espre = [i for i in coff_rev_ex
                  if any(w in i[u'reviewText'].lower() for w in esprewanted)]


## review to text
def revew2revtext(rev_data):
    output_text = ''
    for rev in rev_data:  # concatenate every review's text
        output_text += str(rev['reviewText'])
    return output_text

coff_rev_espre_text= revew2revtext(coff_rev_espre)


## word cloud plot
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

text = coff_rev_espre_text

STOPWORDS.update(["buy", "use", "reviewerID", "helpful"])  # update() returns None
wordcloud = WordCloud(font_path='/Library/Fonts/AppleSDGothicNeo-ExtraBold.otf',
                      width=800, height=800,
                      stopwords=STOPWORDS).generate(text)


plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('test2.png')
plt.show()