Example #1
File: ch01.py Project: gree2/hobby
def fun10():
    """frequency distribution"""
    fdist1 = FreqDist(text1)
    # print fdist1
    vocabulary1 = fdist1.keys()
    # print vocabulary1[:50]
    fdist1.plot(50, cumulative=True)
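This snippet assumes the NLTK book objects (`text1`, `FreqDist`) are already imported, and in NLTK 3 `FreqDist.keys()` is no longer ordered by frequency. A minimal self-contained sketch of the same idea:

from nltk.book import text1          # requires nltk.download('book')
from nltk.probability import FreqDist

fdist1 = FreqDist(text1)
# keys() is not frequency-ordered in NLTK 3; most_common() gives the top samples
print(fdist1.most_common(50))
fdist1.plot(50, cumulative=True)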
Example #2
def main():
    index = get_index("index.data")

    results = bfs('Obama', 'GAB', index)
    
    print_results(results)
    fdistAB = FreqDist([rel.A() for rel in results] + [rel.B() for rel in results])
    fdistAB.plot(10)
Example #3
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str, help='file to produce frequency distribution for')
    args = argparser.parse_args()

    #toker = WhitespaceTokenizer()

    with open(args.file) as f:
        text = f.read()
    print(text)
    # NOTE: FreqDist over a raw string counts individual characters, not words
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
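As noted above, passing a raw string to FreqDist counts characters rather than words. If word frequencies are the goal, a tokenized sketch (the file name is illustrative):

from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

with open("sample.txt", encoding="utf-8") as f:   # illustrative path
    words = word_tokenize(f.read())

fdist = FreqDist(w.lower() for w in words)
print(fdist.most_common(10))
fdist.plot(30)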
Example #4
def testFunc():
    fw = open("./MZI/data.doc", "r", encoding="utf8")
    text = fw.read()
    tockens = getWordList(text)
    print(len(set(tockens)))
    from nltk.probability import FreqDist
    from nltk.util import bigrams
    fdist = FreqDist(w for w in tockens if len(w) > 1)
    fdist.tabulate(50)
    big = list(bigrams(w for w in tockens if len(w) > 1))
    print(big[:100])
    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
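Since FreqDist counts any hashable samples, the bigram tuples above can be counted directly, without converting them to strings first; a small sketch:

from nltk.probability import FreqDist
from nltk.util import bigrams

tokens = ["this", "is", "a", "test", "this", "is"]
bigram_fdist = FreqDist(bigrams(tokens))   # counts (w1, w2) tuples directly
bigram_fdist.tabulate(5)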
Example #5
    def create_enhanced_dale_chall_list(self):
        #list of sites used to create list of most frequent words 
        alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']
    
        #bring all privacy texts into one list
        corpus = []
        data = get_all_policies()
        for site in data:
            if site in alexa_list:
                corpus.append(data[site]["text"])
        
        #get the words of this list into a list of words
        t = textanalyzer("eng")
        words = t.getWords("".join(corpus))
        
        #open the dale chall wordlist        
        dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')
        
        #create a text that consists of the words of the 20 privacy policies and delete all words that are on the dale-chall list of easy words
        new_corpus = []
        
        for word in words:
            if word.lower() not in dale_chall_list and word not in alexa_list:
                new_corpus.append(word.lower())
        
        #create a frequency distribution of the words of this list of words
        fdist = FreqDist(new_corpus)
        #plot this
        fdist.plot(80, cumulative=True)
        
        #make a list of the words that make up 33% of the words that are not in the dale chall list (cumulative)
        most_frequ = []
        cum_percentage = 0.0
        # iterate in frequency order; plain iteration over a FreqDist is not sorted by count
        for sample, _ in fdist.most_common():
            cum_percentage += fdist.freq(sample)
            most_frequ.append(sample)
            if cum_percentage > 0.33:
                break

        #write those into a file
        with open("privacy_wordlist.txt", "w") as privacy_file:
            privacy_file.write(";".join(most_frequ))
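The cumulative-coverage loop above generalizes into a small helper; a sketch that iterates in frequency order via most_common() and stops once the requested share of tokens is covered:

from nltk.probability import FreqDist

def top_coverage(fdist, coverage=0.33):
    """Return the most frequent samples that together cover `coverage` of all tokens."""
    cum, picked = 0.0, []
    for sample, count in fdist.most_common():
        cum += count / fdist.N()
        picked.append(sample)
        if cum > coverage:
            break
    return picked

print(top_coverage(FreqDist("abracadabra")))   # ['a'] -- 5 of the 11 characters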
Example #6
def genre_properties(sorted_genres, data):
    # create empty genre_tokens dict to hold genre name:token lists
    genre_tokens = {}
    # boolean check to see if we have already gone through and tokenized everything
    files_exist = os.path.isfile("data/top_genres.txt")

    # initialize the genre_tokens keys outside of the if statement so that it can be used in both cases
    for i in range(0, 5):
        # keys are simply the top 5 genres
        genre_tokens[sorted_genres[i]] = []

    # if for some reason the data files don't exist, lets go through the process of creating them (takes ~3 minutes)
    if not files_exist:
        print(
            "\nThe data files don't exist, beginning tokenization process, grab some coffee..."
        )

        # grab the nltk corpus stopwords
        stopWords = set(stopwords.words('english'))
        # add in some extra noise words we don't care about (I probably missed a couple)
        noiseWords = [
            "{{Expand section}}", ",", ".", "(", "[", "{", ")", "]", "}", ":",
            ";", "&", "'", '"', "'s", "``", "''", "n't", "`", '’'
        ]

        # store the start time so we can keep track of how long this process takes
        t1 = time.time()

        # iterate through the dataset; this is largely the same structure as in top_genres, so I won't repeat comments
        for row in data.itertuples(index=True):
            # strip the genre string of quotes and brackets
            genre_str = str(getattr(row, 'genres'))
            genre_str = genre_str[1:-1]
            genre_str = genre_str.replace('"', '')

            # don't need to do any trimming on summary strings like we did for genre strings
            summary_str = str(getattr(row, 'summary'))

            # tokenize the summary string
            tokens_raw = word_tokenize(summary_str)

            # create an empty token list we will fill with filtered tokens
            tokens_processed = []
            # filter the raw token list
            for word in tokens_raw:
                # we only care about words not in the stopWords or noiseWords list
                if word not in stopWords and word not in noiseWords:
                    tokens_processed.append(word)

            # for each of the film's genres...
            for genre in genre_str.split(', '):
                # if the genre is in the top 5 genres genre_tokens dict...
                if genre in genre_tokens:
                    # extend the film's filtered tokens to the end of genre_token's token list for the current genre
                    genre_tokens.get(genre).extend(tokens_processed)

        # grab the stop time and alert the user of progress
        t2 = time.time()
        print("Tokenization completed in " + str(t2 - t1) + " seconds.\n")

        # to make sure we never have to do that again, lets store all of our data in some .txt files
        # first lets store the top 5 genres in the file "top_genres.txt", one genre per line
        top_genres_file = open("data/top_genres.txt", "w")
        for i in range(0, 5):
            # grab the genre name
            genre = sorted_genres[i]
            # write it to a line with a newline break
            top_genres_file.write("%s\n" % genre)

            # using that genre name create a file "genre.txt" where we will store all of that genre's tokens list
            genre_file = open("data/%s.txt" % genre, "w")
            # for all of the tokens in that genres value pair from our genre_tokens dict...
            for token in genre_tokens.get(genre):
                # write each token on a newline
                genre_file.write("%s\n" % token)

            # close our genre file inside the for loop since we will use the same variable for all 5 genre files
            genre_file.close()

        # finally close the top_genres file
        top_genres_file.close()

    # in this case the data files already exist and we don't need to do any tokenization, this should be the normal case
    else:
        print("\nThe data files exist, beginning token loading:")
        # first open the file with the top 5 genres listed
        top_genres_file = open("data/top_genres.txt", "r")

        # iterate over each line in the file
        for index, line in enumerate(top_genres_file):
            # initialize the genre_tokens dict with the top 5 genres as keys, and empty lists for tokens as values
            genre_tokens[sorted_genres[index]] = []

        # close the top genres file for memory
        top_genres_file.close()

        # iterate over each of the genre keys in our genre_tokens dict that we just loaded
        for genre in genre_tokens:
            print("Loading the " + str(genre) + ".txt file...")
            # open the associated file for each genre
            genre_file = open("data/%s.txt" % genre, "r")

            # iterate over each line of the file
            for index, line in enumerate(genre_file):
                # trim the new line characters from the line
                trimmed_line = line.replace("\n", "")
                # append the line (token) to the corresponding token list for the current genre in our genre_tokens dict
                genre_tokens.get(genre).append(trimmed_line)

            # close our files
            genre_file.close()

        # we are now done loading in our data files and can proceed with addressing the genre characterization
        print("Done loading!\n")

    # at this point, we now have a genre_tokens dict with complete summary tokens lists for each genre
    # lets create a dict, genre_fdicts, that will store the genre:freq dist pairs for each genre
    genre_fdists = {}
    print("Creating frequency distributions for each genre:")
    # for each of our top genres
    for genre in genre_tokens.keys():
        # print out the genre and number of tokens it has
        print("Total " + str(genre) + " tokens to consider: " +
              str(len(genre_tokens.get(genre))) + "...")

        # calculate the Frequency Distribution of all the genres tokens
        fdist = FreqDist(genre_tokens.get(genre))
        # add the genre:freqdist pair to our genre_fdists dict
        genre_fdists[genre] = fdist

        # next lets do some plotting of the top 50 most frequent tokens
        fig_path = str("plots/%s_fdist.png" % genre)
        # we only want to handle plotting if for some reason the plots don't exist
        if not os.path.isfile(fig_path):
            # alert the user of what's happening since matplotlib allows the user to specify bounds through a GUI
            print(
                str(genre) +
                " FreqDist plot does not exist, creating and displaying it now..."
            )
            # alert the user of how to save the plot so this process is no longer run
            print(
                "To skip this process in the future, save the figure as `plots/Genre Name_fdist`"
            )

            # plot the top 50 freq dist samples
            fdist.plot(50, cumulative=True)

    print("Done calculating frequency distributions!\n")

    # now that we have the freq dists for each genre, lets do some more analytics
    # to begin lets find & store the common set of tokens that is shared between all genre's top 50 freq dist samples
    common_set = []
    print(
        "Finding the common set of words across all genre frequency distributions:"
    )
    # loop through each of the top genres
    for i in range(0, 5):
        # grab the current genre name from our sorted_genres list that was passed into this function
        genre = sorted_genres[i]
        # find the top 50 most common samples in our current genres freq dist
        top_current = genre_fdists[genre].most_common(50)

        # initialize an empty temporary list that will overwrite our common_set list
        new_commons = []

        print("Now computing common set additions from " + str(genre) +
              " genre...")
        # inner loop to compare each genre against every other genre, O(n^2); a set-based alternative is sketched after this example
        for j in range(0, 5):
            # if the genre we are comparing against is the current genre then skip it
            if j == i:
                continue

            # grab the genre to compare against's name
            compare_genre = sorted_genres[j]
            # grab the compare genre's frequency distribution
            top_compare_raw = genre_fdists[compare_genre].most_common(50)

            # since the most_common function returns a tuple (sample, count) lets strip out just the sample
            # note: there may be some nltk methods to do this, but I couldn't find any in the documentation
            top_compare_filtered = []
            # for all the sample tuples...
            for sample in top_compare_raw:
                # grab just the sample name
                top_compare_filtered.append(sample[0])

            # now lets actually compare the current genre's samples against the compare genre's samples
            for sample in top_current:
                # if the current sample name is in the top 50 sample names from the compare genre...
                if sample[0] in top_compare_filtered:
                    # then add it to the new_commons list
                    new_commons.append(sample[0])

            # now we need to update the common_set list with samples that aren't already in it
            for sample in new_commons:
                # if the sample from new_commons doesn't exist in the common_set list...
                if sample not in common_set:
                    # then add the sample to common_set
                    common_set.append(sample)

    # we have now computed a common set of words shared in some combination across the top genres
    print("A common set has been found! Across all genres " +
          str(len(common_set)) + " words are shared, they are:")
    print(common_set)

    # now lets find the unique set of words for each genre, this should give us an insight into genre characteristics
    print("\nComputing the unique sets for each genre...")
    # for each genre
    for genre in genre_tokens.keys():
        # initialize an empty unique set list
        unique_set = []
        # grab the top 50 most common words for the genre from its frequency distribution
        top_current = genre_fdists[genre].most_common(50)

        # go through each of the top 50 words
        for sample in top_current:
            # and if any of the top 50 words aren't in the common set...
            if sample[0] not in common_set:
                # add it to the unique set
                unique_set.append(sample[0])

        # finally for each genre print out that genre's name and unique set
        print(str(genre) + "'s unique set: " + str(unique_set))
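The nested comparison above is quadratic in the number of genres and samples; a set-based sketch of the same common/unique computation (assuming the same genre_fdists dict of genre -> FreqDist):

def common_and_unique(genre_fdists, n=50):
    # top-n sample names per genre
    top = {g: {w for w, _ in fd.most_common(n)} for g, fd in genre_fdists.items()}
    # a word is "common" if it appears in the top-n of at least two genres
    common = {w for g, words in top.items() for w in words
              if any(w in top[h] for h in top if h != g)}
    # a genre's unique set is its top-n minus the common words
    unique = {g: words - common for g, words in top.items()}
    return common, unique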
Example #7
stop_words = stopwords.words('english')
#the two most common contractions that aren't in stopwords.words,
#as well as words used to denote the sections of a song
stop_words.extend(['im','ill','verse','hook','chorus','bridge']) 
stop_words = set(stop_words) #element removal is faster using set than list

word_tokens = nltk.word_tokenize(str(eastlyrics_punct))
word_tokens = [w.lower() for w in word_tokens]

allwords = [w for w in word_tokens if w not in stop_words]

fdeast = FreqDist(allwords)

fdeast.plot(20, cumulative = False)





word_tokens = nltk.word_tokenize(str(southlyrics_punct))
word_tokens = [w.lower() for w in word_tokens]

allwords = [w for w in word_tokens if w not in stop_words]

fdsouth = FreqDist(allwords)

fdsouth.plot(20, cumulative = False)

Example #8
from nltk.probability import FreqDist

with open('christ-and-satan.txt') as f:
    cs_text = f.read()

word_list = cs_text.split()
first_letter = [word[0] for word in word_list if word[0].isalpha()]
letter_dist = FreqDist(first_letter)
letter_dist.plot(4,cumulative=True)
Example #9
# nltk.download('averaged_perceptron_tagger')

# Sample
# print(nltk.pos_tag(flat_word_token[0:10]))
# print(nltk.pos_tag(flat_sent_token[0:10]))


##### Find Frequency Distribution ######

# Find frequency of words
fdist_word = FreqDist(words)
fdist_word.most_common(50)

# Plot Frequency Graph
fdist_word.plot(50)

# Find frequency of sentence
fdist_sent = FreqDist(sents)
fdist_sent.most_common(10)   # TELLING

# Plot Frequency Graph (sentence)
fdist_sent.plot(10)

# Frequency of (Word) STEMS
fdist_stem_word = FreqDist(stems)
fdist_stem_word.most_common(50)

# Frequency of (Word) LEMMAS
fdist_lemmas_word = FreqDist(lemmas)
fdist_lemmas_word.most_common(50)
Example #10
 def fdistribution(self, tokenized_words):
     fdist = FreqDist(tokenized_words)
     fdist.plot(30, cumulative=False)
     plt.show()
Example #11
def FreqDistPlot(data, show=10):
    fdist1 = FreqDist(data)
    fdist1.plot(show, cumulative=True)
Example #12
# -*- coding: utf-8

from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer

FIRST = 0
END = 150

corpus_root = './data'
fileids = 'data_title_sample'

wordlists = PlaintextCorpusReader(corpus_root,
    fileids,
    sent_tokenizer=LineTokenizer(),
    encoding='utf-8')

tokens = []
for word in wordlists.words() :
  try :
    tokens += [ word.lower() ]
  except :
    pass

fdist = FreqDist(tokens)

# older NLTK versions accepted a (start, end) slice here; NLTK 3 takes the number of samples to plot
fdist.plot(END)

for k, v in fdist.items():
  print("{} {}".format(k, v))
Example #13
 def plot_html_results(self, lemmatized_list_by_verb_noun_adj_adv, number_of_cat):
     fdist = FreqDist(w for w in lemmatized_list_by_verb_noun_adj_adv)
     fdist.plot(number_of_cat)
Example #14
# Creating main text object based on the Wall Street Journal corpora
# Setting all words to lowercase and removing non-alphabetical entries
myText = [ word.lower() for word in text7 if word.isalpha() ]

# Creating text object based on myText, without repetitions
myTextSet = set( myText )

# Creating a frequency distribution with myText
fdMyText = FreqDist(myText)

# Creating histogram, and copying to file, in order of appearance
histogram = [ "%s - %s" % ( word, fdMyText[word] ) for word in myTextSet ]

fileObj = open("histogram.txt","w")
for wordInfo in histogram:
	fileObj.write("%s\n" % (wordInfo) )
fileObj.close()

# Creating a list of words sorted from most to least frequent and copying to file
# (fdMyText.keys() is not frequency-ordered in NLTK 3, so use most_common())
sortedList = [word for word, _ in fdMyText.most_common()]

fileObj = open("sortedHistogram.txt","w")
for word in sortedList:
	fileObj.write("%s - %d\n" % (word, fdMyText[word]) )
fileObj.close()

# Only showing 50 most frequent words in plot because of limited monitor space
fdMyText.plot(50)
Example #15
Tokens = word_tokenize(dataset)
print(Tokens)

#No. of tokens in the dataset
len(Tokens)

#Freq of occurrence of distinct elements
from nltk.probability import FreqDist

fdist = FreqDist()

for word in Tokens:
    fdist[word.lower()] += 1
fdist
fdist.plot(20)

#-------------------------Stemming----------------------------------------
from nltk.stem import PorterStemmer

pst = PorterStemmer()
pst.stem("having")

#-------------Remove the Stop Words---------------------
import nltk.corpus

#Enlisting the stopwords present in English lang
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

#Getting rid of stopwords
Example #16
def plot_freq_dist(words, num_words=20):
    '''Frequency distribution'''
    fdist = FreqDist(words)
    fdist.plot(num_words, cumulative=False)
Example #17
 def get10TopKeyWords(self, tabWord):
     allWordDist = FreqDist(tabWord)
     allWordDist.plot(20)
Example #18
    return 100 * count / total


print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))
# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
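The notebook cells above call lexical_diversity and percentage; their definitions, as given in the first chapter of the NLTK book, are sketched here for reference:

def lexical_diversity(text):
    return len(set(text)) / len(text)


def percentage(count, total):
    return 100 * count / total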
Example #19
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: 创建一个所给数据的频率分布
B(): 不同单词的个数
N(): 所有单词的个数
tabulate(20): 把前20组数据以表格的形式显示出来
fd2.plot(20,cumulative=True): 参数cumulative 对数据进行累计 
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("不同单词的个数:%d" % fd2.B())
print("所有单词的个数:%d" % fd2.N())
fd2.tabulate(20)  #把前20组数据 以表格的形式显示出来
fd2.plot(20)
fd2.plot(20, cumulative=True)
"""
freq('the')  #单词the出现的频率
ConditionalFreqDist( ): 条件频率统计的函数,研究类别之间的系统性的差异
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  #单词the出现的频率
cfd = ConditionalFreqDist((fileid, len(w)) for fileid in inaugural.fileids()
                          for w in inaugural.word(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
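As the note above says, ConditionalFreqDist is meant for studying systematic differences between categories; the NLTK book's related inaugural-corpus example, sketched here for comparison:

from nltk.probability import ConditionalFreqDist
from nltk.corpus import inaugural

cfd2 = ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd2.plot()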
Example #20
                   raw_data_lyrics,
                   left_on='Track_Name',
                   right_on='Track.Name')
del dataset['Track.Name']
dataset['Lyrics'] = dataset['Lyrics'].astype(str)
dataset
dataset = dataset.drop(dataset.index[[30, 22]])
dataset['Lyrics'] = dataset['Lyrics'].str.lower().str.replace('\n', ' ')  # use .str.replace so the substitution applies within each string
dataset['Lyrics']
tokens = dataset['Lyrics'].fillna("").map(nltk.word_tokenize)
allWords = []
for wordList in tokens:
    allWords += wordList

fdist = FreqDist(allWords)
fdist.plot(30, cumulative=False)
plt.show()
stop_words_en = set(stopwords.words("english"))
stop_words_es = set(stopwords.words("spanish"))
punctuations = list(string.punctuation)
allWords = [i for i in allWords if i not in punctuations]
forbidden = [
    'oh', "'s", 'yo', "'ll", 'el', "'re", "'m", "oh-oh", "'d", "n't", "``",
    "ooh", "uah", "'em", "'ve", "eh", "pa", "brr", "yeah"
]
filtered_sent = []
for w in allWords:
    if (w not in stop_words_en) and (w not in stop_words_es):
        filtered_sent.append(w)
filter_ = []
for w in filtered_sent:
Example #21
alice_mask = np.array(Image.open('c:\\Temp\\alice.jpg'))
wc = WordCloud(font_path='c:\\windows\\fonts\\NanumGothic.ttf',
               relative_scaling=0.2,
               mask=alice_mask,
               background_color='white',
               min_font_size=1,
               max_words=2000).generate_from_frequencies(tmp_data)
plt.figure(figsize=(8, 8))
plt.imshow(wc)
plt.axis('off')
plt.show()

# Draw as a graph
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib
# import matplotlib.rc as rc

# font_location = 'C:\\Windows\\Fonts\\gulim.ttc'
# font_name = fm.FontProperties(fname=font_location).get_name()
# matplotlib.rc('font', family='font_name')

import nltk
plt.figure(figsize=(20, 4))

from nltk.probability import FreqDist
g_data4 = FreqDist(data3)

g_data4.plot(50)

Example #22
#*******************************************************************************
# Question: Are there differences between word-length frequencies of converted
#      vs. unconverted requests?
# Answer: No
# Correlation Coefficient: 
#*******************************************************************************
print('Begin calculating word length frequencies...')
      
cnvtText = ' '.join([item['request_text'] for item in data
                     if len(item['request_text'])>0
                     and item['requester_received_pizza']==1])
wl1 = [len(word) for word in nltk.word_tokenize(cnvtText) if word.isalpha()]
wl1fd = FreqDist(wl1)
if graphs == 'yes': wl1fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 18
print('...Word length frequencies for successful requests have been plotted.')

uncnvtText = ' '.join([item['request_text'] for item in data
                     if len(item['request_text'])>0
                     and item['requester_received_pizza']==0])
wl2 = [len(word) for word in nltk.word_tokenize(uncnvtText) if word.isalpha()]
wl2fd = FreqDist(wl2)
if graphs == 'yes': wl2fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 17, 35, 20
print('...Word length frequencies for unsuccessful requests have been plotted.')



#*******************************************************************************
Example #23
sentence_tokenize = sent_tokenize(text)
w_tokenize = word_tokenize(text)
print(sentence_tokenize)
print("\n")
print(w_tokenize)
#finding freq of each word
from nltk.probability import FreqDist

fdis = FreqDist(w_tokenize)
print(fdis)
a = fdis.most_common(2)
print(a)
#plotting each word and its frequency
import matplotlib.pyplot as plt

fdis.plot(30, cumulative=True)
plt.show()
#listing all stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
print(stop_words)
#removing stopwords
refined_sent = []
for w in w_tokenize:
    if w not in stop_words:
        refined_sent.append(w)
print(refined_sent)
#Converting into stem words
from nltk.stem import PorterStemmer
Example #24
    len(tokenized_word_without_Stopwords)))

# # Frequency Distribution

# In[80]:

from nltk.probability import FreqDist
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[81]:

# Frequency Distribution Plot
fdist = FreqDist(tokenized_word_without_Stopwords)
print(fdist)
fdist.plot(50, cumulative=False)  #plot 50 high frequency words
plt.figure(figsize=(50, 50))
plt.show()

# # POS tagging (Parts of Speech)

# In[38]:

import nltk
#nltk.download('averaged_perceptron_tagger')

# In[116]:

tagged = nltk.pos_tag(tokenized_word_without_Stopwords)  #need to use split()
tagged[0:20]
Example #25
    tagged = nltk.pos_tag(tokens)
    adj = [w for w, t in tagged if 'JJ' in t]
    return adj


# Extracting only 'noun' words
train['Noun'] = train['Cleaned_str'].apply(find_noun)
text_noun = train['Noun'].apply(' '.join)
text_noun = ' '.join(text_noun)
text_noun = text_noun.split()  # to list form
len(text_noun)  # 710916

# Frequency of commonly used noun words
Freq_words = FreqDist(text_noun)
Freq_words.most_common(60)
Freq_words.plot(30)

# Extracting only 'adjectives' words
train['Adjective'] = train['Cleaned_str'].apply(find_adj)
text_adj = train['Adjective'].apply(' '.join)
text_adj = ' '.join(text_adj)
text_adj = text_adj.split()  # to list form
len(text_adj)  # 349119

# Frequency of commonly used adjectives words
Freq_words = FreqDist(text_adj)
Freq_words.most_common(60)
Freq_words.plot(30)

# Most frequently used words
Freq_words = FreqDist(text_clean)
Example #26
tokens = [word.replace(',', '') for word in tokens]
tokens = [word for word in tokens if ('*' not in word) and \
("''" != word) and ("``" != word) and \
(word!='description') and (word !='dtype') \
and (word != 'object') and (word!="'s")]
print("\nDocument contains a total of", len(tokens), " terms.")
token_num = FreqDist(tokens)
for pos, frequency in token_num.most_common(20):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

#POS Tagging
tagged_tokens = nltk.pos_tag(tokens)
pos_list = [word[1] for word in tagged_tokens if word[1] != ":" and \
word[1] != "."]
pos_dist = FreqDist(pos_list)
pos_dist.plot(title="Parts of Speech")
for pos, frequency in pos_dist.most_common(pos_dist.N()):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# Removing stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = [word for word in tagged_tokens if word[0] not in stop]
# Removing single character words and simple punctuation
stop_tokens = [word for word in stop_tokens if len(word) > 1]
# Removing numbers and possessive "'s"
stop_tokens = [word for word in stop_tokens \
if (not word[0].replace('.','',1).isnumeric()) and \
word[0]!="'s" ]
token_dist = FreqDist(stop_tokens)
print("\nCorpus contains", len(token_dist.items()), \
" unique terms after removing stop words.\n")
Example #27
from nltk import word_tokenize, Text
from nltk.probability import FreqDist

tokens = ""

with open(u'monte_cristo.txt', 'r', encoding="utf8") as con:
    contents = con.read()
    tokens = word_tokenize(contents)
	
processed = Text(tokens)
fdist = FreqDist(processed)
processed.collocations()
print(fdist.most_common(50))
fdist.plot(50)
Example #28
# Compute the percentage of hapax legomena occurrences and the longest among them
hapax_legomenas = fdist.hapaxes() # Get the list of words that appeared just once in corpus
hapax_legomena_counts = len(hapax_legomenas) # Get the count of them
percentage_of_hapax_legomena = (hapax_legomena_counts/no_of_tokens)*100 # Compute percentage
print("Percentage of Hapax Legomena Occurrences", percentage_of_hapax_legomena)
max_len_happax_legomena = max([len(word) for word in hapax_legomenas])
print("Longest happax Legomena's are", [word for word in hapax_legomenas if len(word) == max_len_happax_legomena])

# Compute the Percentage of dis legomena Occurrences and the longest in them
dis_legomenas = [key for key, value in fdist.items() if value == 2] # Get the words that occurred just twice
dis_legomena_counts = len(dis_legomenas) * 2 # Get their counts
percentage_of_dis_legomena = (dis_legomena_counts/no_of_tokens)*100 # Compute percentage
print("Percentage of Dis Legomena Occurrences", percentage_of_dis_legomena)
max_len_dis_legomena = max([len(word) for word in dis_legomenas])
print("Longest Dis Legomena's are ", [word for word in dis_legomenas if len(word) == max_len_dis_legomena])

# Plot the r vs Nr graph
fdist.plot(50)

# Compute the log scaled version of r vs Nr
log_rvsNr = {log(key):log(value) for key, value in (fdist.r_Nr()).items() if value!=0}

# Plot the graph of log(r) vs log(Nr)
plot.plot(list(log_rvsNr.keys()), list(log_rvsNr.values()), 'r.')
plot.axis([-1, 11, -1, 11])
plot.xlabel('log(r)')
plot.ylabel('log(Nr)')
plot.title('log(r) vs log(Nr) Brown Corpus')
plot.show()

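The snippet above relies on a FreqDist and token count built earlier in its source file; a sketch of the setup it appears to assume, with names inferred from usage (the Brown corpus is implied by the plot title):

from math import log
import matplotlib.pyplot as plot
from nltk.corpus import brown            # requires nltk.download('brown')
from nltk.probability import FreqDist

fdist = FreqDist(brown.words())
no_of_tokens = fdist.N()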
Example #29
        stemmed_word=stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) 
    return stemmed_words

sw=stem_words(rs)

#frequency of use of each word
fdist = FreqDist(rs)

frequency_frame = pd.DataFrame(fdist.most_common(30),
                                        columns=["mots", "frequences"])


# Frequency Distribution Plot
import matplotlib.pyplot as plt
fdist.plot(30)
plt.show()

#step 2: retrieve the most frequently used words:
print(fdist.most_common(30))



dico = {}
for key, value in fdist.most_common(50):
    if key not in dico:
        dico[key] = [value]
    else:
        dico[key].append(value)
print (dico) 
Example #30
def print_freq_dist(in_tokens):
    "plot the frequency distributions for tokens in in_tokens"

    text = nltk.Text(in_tokens)
    fdist = FreqDist(text)
    fdist.plot(100, cumulative=False)
Example #31
rawt1 = re.sub(r'(www.[a-z]*.[a-z]*)', '', rawt1)
# removing digits
rawt1 = re.sub(r'[\d]*', '', rawt1)
# removing chapter names
rawt1 = re.sub(r'(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii)\.[ _a-z:]*', '',
               rawt1)
# removing punctuations
rawt1 = remove_punctuation(rawt1)
t1_tokenized = word_tokenize(rawt1)
counts = Counter(t1_tokenized)
print("Number of distinct words " + str(len(counts)))
print("Number of tokens " + str(len(t1_tokenized)))
print("Number of characters " + str(len(rawt1)))
print(t1_tokenized)
fdist = FreqDist(t1_tokenized)
fdist.plot(30, cumulative=False)
plt.show()

# In[2]:

word_cloud_dict = Counter(t1_tokenized)
wordcloud = WordCloud(
    width=1000, height=1000, background_color='white',
    stopwords=None).generate_from_frequencies(word_cloud_dict)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# In[3]:
Example #32
            for w in filtered_sent:
                root_words.append(ps.stem(w))
        if ln_choice == "lem":
            for w in filtered_sent:
                root_words.append(lem.lemmatize(w))
        # Remove integers only
        no_integers = [
            x for x in root_words
            if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
        ]
        # Frequency distribution of words in text and plotting data
        fdist = FreqDist(no_integers)
        fig = plt.figure(figsize=(10, 5))
        plt.gcf().subplots_adjust(bottom=0.25)  # to avoid x-ticks cut-off
        fdist.plot(30,
                   cumulative=False,
                   title="Top 30 most common words in cluster {}".format(c))
        plt.show()
        fig.savefig(pathc +
                    "/Most common words in louvain cluster {}_{}_lexicon.pdf".
                    format(c, ln_choice))

# tokenize the text and plot freq dist plots
if c_choice == 'k':
    for c in (sorted(plt_data["l_clusters"].unique())):
        c_df = plt_data[plt_data.k_clusters == c]
        small_df = c_df[["paper_id", "title"]]
        small_df.to_csv(pathcdf +
                        "/papers in k-means cluster {}.csv".format(c),
                        index=False)
        # Filter the text
Example #33
def graph_word():
    fdist = FreqDist(word_tokenize(read_file))
    print(fdist)
    fdist.plot(101, cumulative=False)
    plt.show()
Example #34
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import re as r
from nltk.corpus import stopwords

text = "Hello Mr Smith how are you doing today? The weather is great and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard"
tokenized_text = sent_tokenize(text)
text1 = text.lower()

#removing punctuation (must happen before building the FreqDist, so wordtokenize exists)
rptext = r.sub(r'[^\w\s]+', '', text1)
wordtokenize = word_tokenize(rptext)
rpwordtoken = word_tokenize(rptext)

freqword = FreqDist(wordtokenize)

freqword.most_common(6)

freqword.plot(20, cumulative=False)

#stopword
#import nltk
#nltk.download("stopwords")

swords = set(stopwords.words('english'))
swords
Example #35
def plot_words(wordList):
    fDist = FreqDist(wordList)
    #print(fDist.most_common())
    print("单词总数: ", fDist.N())
    print("不同单词数: ", fDist.B())
    fDist.plot(10)
Example #36
    count = 0
    for category in good_word_dict_2:
        if (any(map(lambda word: word in sentence,
                    good_word_dict_2[category]))):
            count += 1
            continue
    if count == 2:
        sentence_list.append(str(sentence))

data = '\n'.join(sentence_list)

words = word_tokenize(data)

# Pre-cleaning
spread = FreqDist(words)
spread.plot(50)
for word, freq in spread.most_common(100):
    print(u'{};{}'.format(word, freq))

# Cleaning
words = [w.lower() for w in words if w.isalpha()]
stop_words = set(stopwords.words('english'))
words_clean = [w for w in words if w not in stop_words]

# Post-Cleaning
spread = FreqDist(words_clean)
spread.plot(50)

f = open(
    "C:\\Users\\satvi\\Documents\\GitHub\\HIselector\\Satvik\\Bag of Words\\relevant_sentences.txt",
    "w",
Example #37
def drawFreqMap(words):
    fdist = FreqDist(words)
    fdist.plot(20)
Example #38
content = open(file_name, 'rb').read()

cutedText = " ".join(jieba.cut(content))
#nltkText = nltk.corpus.gutenberg.raw(cutedText)
fd = FreqDist(cutedText)
items = list(fd.items())
print(items[:30])
#fd.plot()
#print(cutedText)
print(dir(cutedText))
#print(dir(nltkText))
print(cutedText.count(u'ÃÏ¿Ì'))

tags = jieba.analyse.extract_tags(content, topK=30)
fd = FreqDist(tags)
for keyword in tags:
    print("result of", keyword)
    count = cutedText.count(keyword)
    print(count)
    fd[keyword] = count
    #cutedText.split().concordance(keyword)

print(fd)

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.xlabel(u'')
plt.ylabel(u'¥Œ ˝')
plt.title(u'')
fd.plot()
Example #39
def plot_freq_dist(dict, firsts=100, cumulative=False):
    dist = FreqDist(dict.dfs)
    dist.plot(firsts, cumulative=cumulative)
    plt.show()
Example #40
ztokenizer = nltk.RegexpTokenizer(r"\w+")
text_token = ztokenizer.tokenize(df2)

# convert to lower case
tokens = [w.lower() for w in text_token]

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
sw_list = [
    'months', 'year', 'years', 'com', 'linkedin', 'linkedin', 'comwww',
    'india', 'new', 'technology', 'gmail'
]
stopword = stopwords.words('english')
#extend the standard stopword list with the custom noise words above
stopword.extend(sw_list)
words = [w for w in words if w not in stopword]

#this is graphical presentation of most used words

fdist1 = FreqDist(words)
# print (fdist1)
fdist1.plot(20)
Example #41
    tokens[n] = [word for word in tokens[n] if ('*' not in word) and \
           word != "''" and word !="``"]
    # Remove punctuation (rebuild the list; reassigning the loop variable would not modify it)
    tokens[n] = [re.sub(r'[^\w\d\s]+', '', word) for word in tokens[n]]
    print("\nDocument " + str(n) +" contains a total of", len(tokens[n]),\
          " terms.")
    
# POS Tagging
tagged_tokens = {}
for n in range(1,9):
    tagged_tokens.update({n: nltk.pos_tag(tokens[n])})
    pos_list = [word[1] for word in tagged_tokens[n] if word[1] != ":" and \
                word[1] != "."]
    pos_dist = FreqDist(pos_list)
    pos_dist.plot(title="Parts of Speech: Document "+str(n))
    for pos, frequency in pos_dist.most_common(pos_dist.N()):
        print('{:<15s}:{:>4d}'.format(pos, frequency))
        

# Remove stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens={}
for n in range(1,9):
    stop_tokens.update({n:[word for word in tagged_tokens[n] if word[0] not in stop]})
# Remove single character words and simple punctuation
    stop_tokens[n] = [word for word in stop_tokens[n] if len(word) > 1]
# Remove numbers and possessive "'s"
    stop_tokens[n] = [word for word in stop_tokens[n] \
               if (not word[0].replace('.','',1).isnumeric()) and \
               word[0]!="'s" ]
Example #42
# def content_fraction(text):
#	stopwords = nltk.corpus.stopwords.words('spanish')
#	content = [w for w in text if w.lower() not in stopwords]
#	return len(content) / len(text)
# content_fraction(text)

# Step 4: stem words


# SOME INITIAL EXPLORATIONS OF THE TEXT

sorted(set(text))  	             # displays sorted unique words
fdist = FreqDist(text)           # creates a frequency distribution for words
vocabulary = fdist.keys()        # vocabulary of the distribution (not frequency-ordered in NLTK 3)
fdist.most_common(50)            # displays the 50 most frequent words in the text
fdist.plot(50, cumulative=True)  # frequency distribution for 50 most frequent words 
text.collocations()              # common word collocations



# APPROACH 1: POINTWISE MUTUAL INFORMATION (PMI)

bigram_measures   = nltk.collocations.BigramAssocMeasures()
trigram_measures  = nltk.collocations.TrigramAssocMeasures()
#quadgram_measures = nltk.collocations.QuadgramAssocMeasures()


finder_bi   = BigramCollocationFinder.from_words(text)
finder_tri  = TrigramCollocationFinder.from_words(text)
finder_quad = QuadgramCollocationFinder.from_words(text)
Example #43
print(state_union_text.count("war"))
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist

fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result


from nltk.corpus import stopwords
stopwords.words("english")


filtered = [w for w in state_union.words() if w not in stopwords.words("english")]
len(filtered)


fdist_filtered = FreqDist(filtered)
fdist_filtered.most_common(20)


fdist_filtered.freq("good")/fdist_filtered.freq("bad")
fdist_filtered.freq("bad")/fdist_filtered.freq("evil")


fdist_filtered.plot(30)