def print_results(self):
        print()
        print("int-binned log-likelihood distributions:")
        ll_fdist = FreqDist(self.res["dlist_dist"])
        ll_fdist.tabulate()
        print()
        print(self.res["cm"])

        print("{:<30}{:>.3%}"
                .format("Majority Class Prior Prob: ",
                   self.res["prior_probability"]))
        print("{:<30}{:>}"
                .format("Majority Class Label: ", self.majority_label))

        print()
        print("{:<30}{:>.3%}"
                .format("Accuracy: ", self.res["accuracy"]))
        print("{:<30}{:>.3%}"
                .format("Error: ", self.res["error"]))
        print("{:<30}{:>.3%}"
                .format("Error Reduction / Baseline: ",
                    self.res["error_reduction"]))

        print()
        print("{:<7}{:<23}{:>.3%}"
                .format(self.root_star,
                    "Precision: ",
                    self.res["root_star_precision"]))
        print("{:<7}{:<23}{:>.3%}"
                .format(self.root,
                    "Precision: ",
                    self.res["root_precision"]))
        print("{:<7}{:<23}{:>.3%}"
                .format(self.root_star,
                    "Recall: ",
                    self.res["root_star_recall"]))
        print("{:<7}{:<23}{:>.3%}"
                .format(self.root,
                    "Recall: ",
                    self.res["root_recall"]))

        print()
        print("{:<30}{:>.3%}"
                .format("Macro Precision: ", self.res["macro_precision"]))
        print("{:<30}{:>.3%}"
                .format("Macro Recall: ", self.res["macro_recall"]))
        print()
        print("Top Ten Rules:")
        for l in self.decision_list[:10]:
            print("{:<30}{:>.4}".format(l[0], l[1]))
        print()
        print("3 Correct:")
        for l in self.res["correct"][:3]:
            print("Correctly Predicted: {} \n Rule: {}, log-likelihood: {} \n {}"
                    .format(l[0], l[2][0], l[2][1], " ".join(l[3])))
        print()
        print("3 Incorrect:")
        for l in self.res["incorrect"][:3]:
            print("Predicted: {}, was actually: {} \n Rule: {}, log-likelihood: {} \n {}"
                    .format(l[0], l[1], l[2][0], l[2][1], " ".join(l[3])))
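Note: the "int-binned log-likelihood distributions" table above is just a FreqDist over truncated scores. A minimal, self-contained sketch of that binning step (the scores list here is hypothetical; the real values come from the decision list):

from nltk.probability import FreqDist

scores = [0.42, 1.7, 1.9, 3.2, 0.1, 2.8]     # hypothetical log-likelihood values
ll_fdist = FreqDist(int(s) for s in scores)  # bin each score to its integer part
ll_fdist.tabulate()                          # prints each bin with its count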
Example #2
def analyze_comments(projects):
    words = []
    word_counts = []
    word_counts_en = []
    word_counts_en_no_stop = []
    nr_projects_with_comments = 0
    for project in projects:
        project_words = project.get_comment_words()
        if len(project_words) != 0:
            words.extend(project_words)
        (project_word_counts, project_word_counts_en,
         project_word_counts_en_no_stop) = project.get_comment_word_counts()
        word_counts.extend(project_word_counts)
        word_counts_en.extend(project_word_counts_en)
        word_counts_en_no_stop.extend(project_word_counts_en_no_stop)
        if len(project_word_counts) != 0:
            nr_projects_with_comments = nr_projects_with_comments + 1
    print(word_counts_en_no_stop)
    get_comment_statistics(word_counts, word_counts_en,
                           nr_projects_with_comments)
    fdist = FreqDist(words)
    print(fdist.most_common(20))  # the 20 most frequent words and their counts
    fdist.plot(20, cumulative=False)
    plt.savefig("WordFreqDist.png")
    plt.savefig("WordFreqDist.pdf")
    plt.show()
    fdist.tabulate()
    return word_counts_en_no_stop
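Note: depending on the NLTK and matplotlib versions, fdist.plot() may display and clear the figure before plt.savefig() runs, which can leave empty image files. One version-independent sketch (reusing the words list and file names from the example) is to plot the top-20 counts directly:

import matplotlib.pyplot as plt
from nltk.probability import FreqDist

fdist = FreqDist(words)
labels, counts = zip(*fdist.most_common(20))
plt.figure(figsize=(10, 5))
plt.plot(range(len(counts)), counts)
plt.xticks(range(len(labels)), labels, rotation=90)
plt.ylabel("Counts")
plt.tight_layout()
plt.savefig("WordFreqDist.png")
plt.savefig("WordFreqDist.pdf")
plt.show()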
Example #3
def plot_freq(self, corpus, patt, n):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    words = []
    for fileid in fileids:
        words.extend(wordlists.words(fileid))
    fre = FreqDist(word.lower() for word in words if word.isalpha())
    fre.tabulate(n)
    return fre.plot(n)
Example #4
def cut_sentence():
    with open(r'D:\nltk_data\corpora\gutenberg\austen-persuasion.txt', 'r') as f:
        raw_str = f.read()
    #raw_str = ' Professional -Self Healing Cutting Mat is a cut above the rest!  :) Alvin Professional -Self Alvin Professional -Self'
    cutted = WordPunctTokenizer().tokenize(raw_str)
    # Optional stopword filtering:
    #filtered = [w for w in cutted if w not in stopwords.words('english')]
    col = FreqDist(cutted)
    #col.plot()
    col.tabulate()
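Note: the commented-out stopword filter above works once written out; a small sketch that also builds the stopword set once (calling stopwords.words('english') inside a comprehension re-reads the list on every token and is slow):

from nltk.corpus import stopwords
from nltk.probability import FreqDist

english_stopwords = set(stopwords.words('english'))
filtered = [w for w in cutted if w not in english_stopwords]
FreqDist(filtered).tabulate(20)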
Example #5
def testFunc():
    with open("./MZI/data.doc", "r", encoding="utf8") as fw:
        text = fw.read()
    tokens = getWordList(text)
    print(len(set(tokens)))
    from nltk.probability import FreqDist
    from nltk.util import bigrams
    fdist = FreqDist(w for w in tokens if len(w) > 1)
    fdist.tabulate(50)
    big = list(bigrams(w for w in tokens if len(w) > 1))
    print(big[:100])
    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
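Note: the bigram distribution above stringifies each pair before counting; FreqDist can count the (w1, w2) tuples directly, which keeps both words accessible afterwards. A sketch under the same assumptions (tokens as produced by getWordList):

from nltk.probability import FreqDist
from nltk.util import bigrams

filtered = [w for w in tokens if len(w) > 1]
bigram_fdist = FreqDist(bigrams(filtered))  # keys are (w1, w2) tuples
bigram_fdist.tabulate(10)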
Example #6
def Freq_Dist(text):
    tokens = normalize_tokenize(text)

    wubDict = {}
    for word in tokens:
        if word in wubDict:
            wubDict[word] = wubDict[word] + 1
        else:
            wubDict[word] = 1

    # Sanity checks: the dictionary should cover every distinct token,
    # and its counts should sum to the number of tokens.
    assert len(wubDict) == len(set(tokens))
    if sum(wubDict.values()) == len(tokens):
        print("Our Dictionary and our Tokens match!")
    else:
        print("We have an error in the length of the dictionary and the tokens.")

    from nltk.probability import FreqDist
    wubFD = FreqDist(tokens)
    print(wubFD.items())

    print('\nOur top 10 most frequent words are:\n\t')
    wubFD.tabulate(10)  # tabulate prints the table itself and returns None

    import matplotlib
    wubFD.plot(20)
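Note: FreqDist already exposes the two checks that Freq_Dist() performs by hand: N() is the total sample count and B() is the number of distinct bins. A minimal sketch, assuming tokens comes from normalize_tokenize():

from nltk.probability import FreqDist

fd = FreqDist(tokens)
assert fd.N() == len(tokens)       # total number of tokens counted
assert fd.B() == len(set(tokens))  # number of distinct word types
fd.tabulate(10)                    # prints the table and returns None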
def get_frequency(demo_file_path):
    with open(demo_file_path, 'r') as f:
        content_lst = f.readlines()
    content_str = " ".join(content_lst)
    words_lst = content_str.split()
    fdist = FreqDist(words_lst)
    fdist.tabulate(40, cumulative=True)  # prints cumulative counts for the top 40 words
        tokens = tagger.tag(tokens2)

        if len(tokens) == 0:
            print("empty text found after preprocessing...discard")
            continue

        if i[1] == 5 and pos < 500:
            documents.append((tokens, "good location"))
            pos += 1

        if i[1] < 3 and neg < 500:
            documents.append((tokens, "bad location"))
            neg += 1

# Tabulate the class distribution of the labelled documents.
labels = []
for w in documents:
    labels.append(w[1])
print(len(documents))
label_dist = FreqDist(labels)
label_dist.tabulate()
for w in label_dist:
    print(w, label_dist[w])
f1.close()
fout = open("../classification/location/location.dat", "wb")
pickle.dump(documents, fout, protocol=0)
fout.close()
print("Finish\n")
Example #9
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: 创建一个所给数据的频率分布
B(): 不同单词的个数
N(): 所有单词的个数
tabulate(20): 把前20组数据以表格的形式显示出来
fd2.plot(20,cumulative=True): 参数cumulative 对数据进行累计 
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("不同单词的个数:%d" % fd2.B())
print("所有单词的个数:%d" % fd2.N())
fd2.tabulate(20)  #把前20组数据 以表格的形式显示出来
fd2.plot(20)
fd2.plot(20, cumulative=True)
"""
freq('the')  #单词the出现的频率
ConditionalFreqDist( ): 条件频率统计的函数,研究类别之间的系统性的差异
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of the word 'the'
cfd = ConditionalFreqDist((fileid, len(w)) for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
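Note: the same ConditionalFreqDist idea can condition on target words instead of word lengths; a variant from the NLTK book plots how often words beginning with 'america' or 'citizen' appear in each inaugural address:

from nltk.corpus import inaugural
from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist(
    (target, fileid[:4])                 # condition: the target word; sample: the year
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()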
Example #10
def removePeriod(words):
    # Split each token that contains a period into the parts before and after it,
    # replacing the original token in place.
    for comm in list(words):              # iterate over a copy; `words` is mutated below
        if '.' in comm:
            ind = comm.index('.')
            words.append(comm[:ind])      # part before the period
            words.append(comm[ind + 1:])  # part after the period
            words.remove(comm)            # drop the original token


with open(library + post_id + '.txt', 'r') as file:
    comments = file.read()

words = word_tokenize(comments)

# filterwords = [w for w in words if not w in stop_words]
filterwords = []
for w in words:
    if not (w in stop_words):
        filterwords.append(w.lower())

fdist = FreqDist(filterwords)
fdist.tabulate(50)  # tabulate prints the table itself and returns None
print(stop_words)
Example #11
import fileinput
import json
import string
import re

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Each line read from stdin is parsed as JSON; `data` holds the records from the last line.
for line in fileinput.input(files='-'):
	data = json.loads(line)

toolfdist = FreqDist()
nontoolfdist = FreqDist()

stopwords = nltk.corpus.stopwords.words('english')

for i in range(len(data)):
	text = word_tokenize(data[i]['abstract'])
	if data[i]['is_tool']:
		for word in text:
			word = word.lower()
			if word not in stopwords and word not in string.punctuation and re.fullmatch(r'[0-9\!\"\#\$\%\&\'\(\)\*\+\,\-.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~≥]*', word) is None:
				toolfdist[word] += 1
	else:
		for word in text:
			word = word.lower()
			if word not in stopwords and word not in string.punctuation and re.fullmatch(r'[0-9\!\"\#\$\%\&\'\(\)\*\+\,\-.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~≥]*', word) is None:
				nontoolfdist[word] += 1

for word in toolfdist:
	if word in nontoolfdist:
		toolfdist[word] -= 10*nontoolfdist[word]
		#toolfdist[word] = 0

toolfdist.tabulate(200)
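Note: tabulate() only prints; if the adjusted counts are needed programmatically, most_common() returns the same ranking as (word, count) pairs:

top_tool_words = toolfdist.most_common(200)
for word, count in top_tool_words[:20]:
    print(word, count)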
def wordFreqAnalysis(dict_books):

    #%% Retrieving data
    list_sentences = listof_sentences(dict_books)

    #%% NLP
    # Only words - no digits or special characters
    wordonly_tokenizer = RegexpTokenizer(r'\w+')
    words_sentences = [
        wordonly_tokenizer.tokenize(sentence) for sentence in list_sentences
    ]

    # List of list of words
    words_list = [word for element in words_sentences for word in element]

    # All lowercase
    words_lower = [wlo.lower() for wlo in words_list]

    # Removing stopwords
    manual_SW = add_stopwords()
    fullset_SW = stopwords.words('english') + manual_SW
    stopWords = set(fullset_SW)
    words_filtered = [w for w in words_lower if w not in stopWords]

    # Removing single letter words
    words_letter = [wle for wle in words_filtered if len(wle) > 1]

    # Removing a2, c45 etc
    words_alpha = [
        wa for wa in words_letter if not any(c.isdigit() for c in wa)
    ]

    # Lemmatizations
    lemmatizer = WordNetLemmatizer()
    #verb lemmatization
    words_lemmatized_verb = [
        lemmatizer.lemmatize(wv, 'v') for wv in words_alpha
    ]
    #adjective lemmatization
    words_lemmatized_adj = [
        lemmatizer.lemmatize(wadj, 'a') for wadj in words_lemmatized_verb
    ]
    #adverb lemmatization
    words_lemmatized_adv = [
        lemmatizer.lemmatize(wadv, 'r') for wadv in words_lemmatized_adj
    ]
    #noun lemmatization
    words_lemmatized = [
        lemmatizer.lemmatize(wl) for wl in words_lemmatized_adv
    ]

    #%% Dictionary of word count
    wordcount = FreqDist(wc for wc in words_lemmatized)
    #    print("something")
    #    # Suppress console temporarily
    #    sys.stdout = open(os.devnull, "w")
    wordcount.tabulate(10)
    #    sys.stdout = sys.__stdout__
    #    print("nothing")
    #%% Write to table (csv file)
    file_freqtable = "word_freq.csv"
    try:
        with open(file_freqtable, "x") as fp:
            writer = csv.writer(fp)
            writer.writerows(wordcount.items())
        print(file_freqtable, "file is created")
        print("\n -----------------------------------------------\n")
    except IOError:
        print(file_freqtable, "file already exists")
        print("\n -----------------------------------------------\n")

    #%% Word count plot
    print("Plotting Word Count")
    print("\n -----------------------------------------------\n")

    ttl = "Word Frequency for Top 50 Words"
    plt.figure(figsize=(40, 20))
    wordcount.plot(50)
    plt.title(ttl, fontsize=80)
    #plt.xlabel("Words", fontsize = 26, style = "oblique")
    plt.ylabel("Frequency of Words", fontsize=38, style="oblique")
    ax = plt.gca()
    ax.set_xticklabels(ax.get_xticklabels(), fontdict={'fontsize': 30})
    #yticks = [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    ax.set_yticklabels(ax.get_yticks(), fontdict={'fontsize': 30})
    plt.show()

    #%% Word cloud
    WC_height = 800
    WC_width = 1600
    WC_max_words = 500

    wordCloud = WordCloud(max_words=WC_max_words,
                          height=WC_height,
                          width=WC_width)

    # Plotting Word cloud with most frequently occurring words (unigrams)
    print("Plotting Word Cloud")
    print("\n -----------------------------------------------\n")
    wordCloud.generate_from_frequencies(wordcount)
    plt.figure(figsize=(36, 18))
    plt.title('Most frequently occurring words (unigrams)', fontsize=60)
    plt.imshow(wordCloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
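Note: wordcount.items() is written to the CSV in arbitrary order; inside wordFreqAnalysis one could write wordcount.most_common() instead to get a frequency-sorted table. A minimal sketch (the file name here is illustrative):

import csv

def write_sorted_freq_table(wordcount, path="word_freq_sorted.csv"):
    # Write (word, count) rows in descending frequency order.
    with open(path, "w", newline="") as fp:
        csv.writer(fp).writerows(wordcount.most_common())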