def word_fdist(inaug_list):
    fixedspeech = map(str.lower, inaugural.words(inaug_list))  # Applies lower() to every word in the list
    fixedspeech = filter(checkwords, fixedspeech)              # Filters out unwanted words using the helper function checkwords
    fdist = FreqDist(fixedspeech)                              # Builds a frequency distribution
    return fdist                                               # Returns it
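# The helper checkwords is not defined in this snippet. A minimal, hypothetical sketch of a
# predicate consistent with its use above (keep alphabetic tokens that are not English stop words):
from nltk.corpus import stopwords

_STOPS = set(stopwords.words('english'))

def checkwords(word):
    # Hypothetical filter: keep alphabetic, non-stop-word tokens.
    return word.isalpha() and word not in _STOPS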
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)
    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)
    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)
    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
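# exclude_stopwords and get_frequency_distribution are not shown in this snippet. Hypothetical
# sketches consistent with how they are called above (sorted counts make the log-log plot a
# rank/frequency curve); these names and return shapes are assumptions, not the original code:
from nltk import FreqDist
from nltk.corpus import stopwords

def exclude_stopwords(words):
    # Hypothetical helper: drop English stop words, case-insensitively.
    stops = set(stopwords.words('english'))
    return [w for w in words if w.lower() not in stops]

def get_frequency_distribution(words):
    # Hypothetical helper: return word frequencies sorted in descending order for plotting.
    fd = FreqDist(w.lower() for w in words)
    return sorted(fd.values(), reverse=True)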
def main():
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['democracy', 'republic']
        if w.lower().startswith(target))
    cfd.plot()
def print_inaugural():
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    cfd.plot()
def build_inaugural_corpus():
    """
    Get a word token list for each doc in the inaugural address corpus
    :return: word_lists
    """
    word_lists = []
    for fileid in inaugural.fileids():
        words = [w for w in inaugural.words(fileid)]
        word_lists.append(words)
    return word_lists
def inaugural_cfd():  # renamed: a function called inaugural() would shadow the imported corpus module
    inaugural.fileids()
    [fileid[:4] for fileid in inaugural.fileids()]
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    cfd.plot()
def sent_length_fdist(inaug_list):
    fixedspeech = list(filter(elimpunct, inaugural.words(inaug_list)))
    count = 0
    listcount = []
    for x in range(len(fixedspeech)):
        if fixedspeech[x] in ('.', '!', '?'):
            listcount.append(count)  # sentence length = tokens seen since the last terminator
            count = 0
        else:
            count += 1
    fdlist = FreqDist(listcount)
    return fdlist
def tabulate():
    # Note: this first cfd is built but immediately overwritten below.
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist(
        (lang, len(word))
        for lang in languages
        for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10), cumulative=True)
def build_cond_fdist():
    cfdist = ConditionalFreqDist()           # Create the conditional frequency distribution
    for inaug_list in inaug20():             # Go through each group of fileids
        period = int(inaug_list[0][0:4])     # Set the period from the first fileid's year
        for fileid in inaug_list:            # For every file in the group
            words = inaugural.words(fileid)
            for i in range(len(words)):      # Check all of the words
                pronoun = words[i]
                if pronoun in ['I', 'my'] and i + 1 < len(words):
                    # Count the next word (after I, my)
                    cfdist[(pronoun, period)][words[i + 1]] += 1
                elif pronoun == 'me' and i > 0:
                    # Count the previous word (before me)
                    cfdist[(pronoun, period)][words[i - 1]] += 1
    return cfdist
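# inaug20() is not defined in this snippet. Judging from its use above it groups the inaugural
# fileids into multi-year periods; the sketch below is a hypothetical version, not the original:
from nltk.corpus import inaugural

def inaug20():
    # Hypothetical helper: batches of five consecutive addresses, i.e. roughly 20-year periods
    # (one inaugural address every four years).
    fileids = sorted(inaugural.fileids())
    return [fileids[i:i + 5] for i in range(0, len(fileids), 5)]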
def main():
    # store word lengths
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))
    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))
    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))
    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))
    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens),
                            len(web_word_lens), len(brown_word_lens),
                            len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens, web_word_lens,
                           brown_word_lens, gutenberg_word_lens]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
def sent_length_fdist(inaug_list):
    fixedspeech = list(filter(elimpunct, inaugural.words(inaug_list)))  # Eliminates the punctuation
    count = 0       # Counter that runs alongside the for loop
    listcount = []  # List of sentence lengths
    for x in range(len(fixedspeech)):
        if fixedspeech[x] in ('.', '!', '?'):
            listcount.append(count)  # Appends the number of tokens since the last sentence terminator
            count = 0                # Resets the counter for the next sentence
        else:
            count += 1
    fdlist = FreqDist(listcount)  # Makes a distribution of sentence lengths
    return fdlist
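# elimpunct is not shown in this snippet. A plausible, hypothetical version keeps word tokens plus
# sentence-final punctuation so the loop above can still find the terminators:
def elimpunct(token):
    # Hypothetical filter: keep alphabetic tokens and sentence terminators, drop other punctuation.
    return token.isalpha() or token in ('.', '!', '?')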
print fd2.B()
print fd2.N()
fd2.tabulate(20)  # Tabulate the 20 most frequent words in the whole book
import matplotlib.pyplot as plt
#fd2.plot(20)
#fd2.plot(20, cumulative=True)

""" US presidential inaugural address corpus """
from nltk.corpus import inaugural
import nltk
# nltk.download()  # download the inaugural corpus first
fd3 = FreqDist([s for s in inaugural.words()])
print fd3.freq('freedom')  # relative frequency of 'freedom' across the whole corpus

# Word-use habits
cfd = ConditionalFreqDist(  # conditional frequency distribution
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid > '1960'
)
print cfd.items()[:40]
cfd.plot()
def number_of_word_types(fileid):
    words = inaugural.words(fileid)
    unique_words = _vocabulary(words)
    num_word_types = len(unique_words)
    return num_word_types
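# _vocabulary is defined elsewhere in the original program. A minimal, hypothetical sketch that is
# consistent with this use (the set of distinct, case-folded word types):
def _vocabulary(words):
    # Hypothetical helper: distinct word types, case-folded.
    return set(w.lower() for w in words)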
import nltk
from nltk.corpus import inaugural
from nltk.corpus import wordnet
import random
import re
import math

# Exercise 23, part a
# Load the data
inaugural_words = inaugural.words()
# Create a FreqDist to hold the log frequencies
a = nltk.FreqDist()
# Walk the word list, lower-casing every word
fd = nltk.FreqDist([w.lower() for w in inaugural_words])
# Walk the distribution and record the log10 frequency of each word
for key in fd:
    t = math.log10(fd[key])
    a[key] = t
fd2 = dict(fd)
# Sort the dict by frequency (descending) into a list
voc = sorted(fd2.items(), key=lambda item: item[1], reverse=True)
# Compute the ratio between the 50th and 150th most frequent words
result = voc[49][1] / voc[149][1]
print(voc[49][0], ' ', voc[149][0])
print("r_a=" + str(result))
# The first entry and the 150th entry
print(voc[0], " ", voc[149])
# Plot the log frequencies of the top 150 words
a.plot(150)
# Run your file every time something new is added so you can see how it works.
# There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py.

# === Part 1: Importing Corpuses ===
import nltk
from nltk.corpus import inaugural

print inaugural.fileids()
# Run your file. You should see all the text files containing the speeches of the US presidents
# that NLTK has saved inside it.

# Now add the lines:
print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")   # Returns a list of all the sentences in Bush's speech
# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech
# (a sketch follows below).

# === Part 2: Analysing tokens (words) of a text ===
# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program:
from nltk.book import *  # This may take a while to load. NLTK has many texts stored in it!
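# One possible way to complete the exercise above (printing the first 25 words of Obama's 2009
# speech); the slice is the only step assumed here, nothing beyond the corpus call already shown:
print inaugural.words("2009-Obama.txt")[:25]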
# (fragment: the next three statements run inside a loop over the reuters fileids, not shown)
nCatgs[n] = len(reuters.categories(name))
catgs[n] = ','.join(reuters.categories(name))
texts[n] = ' '.join(reuters.words(name))

# trimming articles without categories
toTrim = np.invert(np.equal(catgs, None))
catgs = catgs[toTrim]
texts = texts[toTrim]
nCatgs = nCatgs[toTrim]
outNames = outNames[toTrim]

for n in range(len(outNames)):
    with open('./reuters/' + outNames[n] + '.txt', 'w') as f:
        f.writelines('\n'.join(textwrap.wrap(texts[n], 80)))

out = np.vstack((outNames, catgs)).T
out = out[np.argsort(out[:, 0])]
np.savetxt('reuters_catgs.csv', out, fmt='%s', delimiter=',')

## Save inaugural addresses
#nltk.download('inaugural')
from nltk.corpus import inaugural as inaug

adds = inaug.fileids()
texts = np.empty(len(adds), dtype=object)  # pre-allocate
for n, name in enumerate(adds):
    texts[n] = ' '.join(inaug.words(name))
    with open('./inaugural/' + name, 'w') as f:
        tmp = textwrap.wrap(texts[n], 80)
        f.writelines('\n'.join(tmp).encode('ascii', 'ignore'))
from nltk.book import *
import nltk

print(text1.vocab())
print(type(text1))
print(len(text1))

from nltk.corpus import gutenberg
print(gutenberg.fileids())
print(nltk.corpus.gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')

from nltk.corpus import inaugural
print(inaugural.fileids())
print(nltk.corpus.inaugural.fileids())

from nltk.text import Text
former_president = Text(inaugural.words(inaugural.fileids()[-1]))
print(' '.join(former_president.tokens[0:1000]))
import nltk
from nltk.corpus import inaugural
from nltk.util import ngrams

obama_words = inaugural.words("2009-Obama.txt")
george_words = inaugural.words("1789-Washington.txt")

fd_george_words = nltk.FreqDist(w.lower() for w in george_words)
fd_obama_words = nltk.FreqDist(w.lower() for w in obama_words)

#fd_obama_words.plot(50)
print(fd_obama_words.most_common(50))
print(fd_george_words.most_common(50))

obama = [x[0] for x in fd_obama_words.most_common(50)]
george = [x[0] for x in fd_george_words.most_common(50)]
print(list(set(obama) & set(george)))
import re
from nltk.corpus import inaugural
from nltk import FreqDist
#nltk.download('stopwords')
from nltk.tokenize import regexp_tokenize

print("-------WARM UP---------")
print("------TASK 1---------")
# Using inaugural fileids to list all the documents
documents = inaugural.fileids()
print("Using the corpus reader class list all the documents in inaugural corpus :")
print(documents)
print("---------------------------------------------------------------------")

print("Find the total number of words in Clinton's 1993 speech :")
# Using the .words() method to count the words in Clinton's speech
clintonwords = inaugural.words('1993-Clinton.txt')
print(len(clintonwords))

# The .raw() method reads the text in raw form
s = inaugural.raw('1789-Washington.txt')
w = set(m.group(0) for m in re.finditer(r"\w+", s))
#print(len(re.findall('\w+', s)))
print("Find the total number of distinct words in the same speech :")
# Now we find the number of distinct words
print(len(w))

# average function to calculate average word length
def average(numbers):
    return sum(numbers) / len(numbers)
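# A usage sketch of the average helper just defined, reusing the clintonwords list from above
# (nothing new is assumed beyond that list):
print(average([len(word) for word in clintonwords]))  # average token length, punctuation included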
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 08:22:33 2018

@author: jacobjohn

//https://www.jasondavies.com/wordcloud/
"""
import nltk
import re
from nltk.corpus import inaugural

Obama = inaugural.words(fileids='2009-Obama.txt')

# declare a dictionary
word_freq = {}
for tok in Obama:
    if tok in word_freq:
        word_freq[tok] += 1
    else:
        word_freq[tok] = 1

max_dict = {}
while len(max_dict) < 5:
    max_val = 0
    for key in word_freq:
        if max_val < word_freq[key] and re.match(r'[A-Za-z]+', key) and key not in max_dict:
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""
import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()
#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

# Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
def conlisttodic(lst):
    dct = dict()
    for i in range(0, len(lst)):
        for j in range(0, 1):
            dct.update({lst[i][j]: lst[i][j + 1]})
    return dct

def makeStopWords():
    sw = stopwords.words('english')
    for i in wordsStop:
        sw.append(i)
    return sw

stopWord = makeStopWords()
for fileID in inaugural.fileids()[-12:]:
    wordList = list()
    for word in inaugural.words(fileID):
        word = word.lower()
        if word.isalpha() and word not in stopWord:
            wordList.append(word)
    speech[fileID] = nltk.FreqDist(wordList)

# print(type(speech))
for i, k in speech.items():
    print(i, k)
    nameyear.append(i)
    worddict[countloop] = k
    countloop = countloop + 1

intersectionn = (worddict[0] & worddict[1] & worddict[2] & worddict[3] & worddict[4]
                 & worddict[5] & worddict[6] & worddict[7] & worddict[8] & worddict[9]
                 & worddict[10] & worddict[11])
intersectionnsort = sorted(intersectionn)
for i in range(len(nameyear)):
    for j in intersectionnsort:
# In[4]:
print(brown.categories())

# In[5]:
brown.words(categories='romance')

# In[6]:
from nltk.corpus import inaugural

# In[7]:
inaugural.fileids()

# In[8]:
inaugural.words(fileids='1989-Bush.txt')

# In[9]:
inaugural.words(fileids='1989-Bush.txt')[:50]

# In[10]:
from nltk.tokenize import TweetTokenizer
text = "Mexico is paying (indirectly) for the Wall through the new USMCA, the replacement for NAFTA! Far more money coming to the U.S. Because of the tremendous dangers at the Border, including large scale criminal and drug inflow, the United States Military will build the Wall!"
twt = TweetTokenizer()
print(twt.tokenize(text))
from nltk.corpus import inaugural, stopwords
# from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities
import re
import string

filenames = inaugural.fileids()
# lmtzr = WordNetLemmatizer()
filtered_speeches = []

def removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)

for filename in filenames:
    print filename
    print "Reading in raw words..."
    raw_words = inaugural.words(filename)
    print "Removing stop words..."
    filtered_words = [word for word in raw_words if not word in stopwords.words('english')]
    print "Removing punctuation..."
    filtered_words = [word.strip(string.punctuation) for word in filtered_words]
    filtered_words = [word.lower() for word in filtered_words if word != ""]
    tokens_once = set(word for word in set(filtered_words) if filtered_words.count(word) == 1)
    filtered_words = [removeNonAscii(word) for word in filtered_words if word not in tokens_once]
    print "Appending filtered words..."
    filtered_speeches.append(filtered_words)

print "making numbered corpus..."
dictionary = corpora.Dictionary(filtered_speeches)
corpus = [dictionary.doc2bow(text) for text in filtered_speeches]
tfidf = models.TfidfModel(corpus)
def main():
    # store, for each corpus, the per-file frequency of the 100 most common English words
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []
    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
              "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
              "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
              "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
              "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
              "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
              "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
              "even", "new", "want", "because", "any", "these", "give", "day", "most", "us"]
    common.sort()
    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)
    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)
    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)
    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)
    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)
    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                            len(web_common_freq), len(brown_common_freq),
                            len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq, web_common_freq,
                           brown_common_freq, gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")
def main():
    # store a FreqDist of letter counts for each corpus
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    brown_letters = FreqDist()
    web_letters = FreqDist()
    inaugural_letters = FreqDist()
    gutenberg_letters = FreqDist()
    genesis_letters = FreqDist()
    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            for character in word:
                if character in string.letters:
                    gutenberg_letters[character.upper()] += 1
    for file in brown.fileids():
        for word in brown.words(file):
            for character in word:
                if character in string.letters:
                    brown_letters[character.upper()] += 1
    for file in webtext.fileids():
        for word in webtext.words(file):
            for character in word:
                if character in string.letters:
                    web_letters[character.upper()] += 1
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            for character in word:
                if character in string.letters:
                    inaugural_letters[character.upper()] += 1
    for file in genesis.fileids():
        for word in genesis.words(file):
            for character in word:
                if character in string.letters:
                    genesis_letters[character.upper()] += 1
    with open("genesis-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS\n")
        for let in samples:
            print(str(genesis_letters[let]))
    with open("gutenberg-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GUTENBERG\n")
        for let in samples:
            print(str(gutenberg_letters[let]))
    with open("webtext-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("WEBTEXT\n")
        for let in samples:
            print(str(web_letters[let]))
    with open("inaugural-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("INAUGURAL\n")
        for let in samples:
            print(str(inaugural_letters[let]))
    with open("brown-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("BROWN\n")
        for let in samples:
            print(str(brown_letters[let]))
    with open("letter-freq.txt", 'w') as f:
        corpora = [gutenberg_letters, web_letters, inaugural_letters, brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for let in samples:
            for corpus in corpora:
                f.write(str(corpus[let]) + ",")
            f.write("\n")
print(md[:8])
print("Length of book {}".format(len(md)))
print("Boat: {}".format(md.count('boat')))

md_set = set(md)
print("Unique by set: {}".format(len(md_set)))
print("Average by words: {}".format(len(md) / len(md_set)))

md_sents = nltk.corpus.gutenberg.sents("melville-moby_dick.txt")
print("Average by words per sentence: {}".format(len(md) / len(md_sents)))

print("Inaugural Ids:\n{}".format(inaugural.fileids()))
for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    print("Speech: {0} has total words: {1}".format(speech, words_total))

speech_len = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print("Biggest Speech: {}".format(max(speech_len)))
print("Shortest Speech: {}".format(min(speech_len)))

for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    sentence_total = len(inaugural.sents(speech))
    print("Sentence average: {}".format(words_total / sentence_total))

data = pd.DataFrame([
    int(speech[:4]),
    len(inaugural.words(speech)) / len(inaugural.sents(speech))
# In[7]:
# INAUGURAL CORPUS

# In[8]:
from nltk.corpus import inaugural

# In[9]:
inaugural.fileids()

# In[10]:
inaugural.words(fileids='2009-Obama.txt')

# In[11]:
inaugural.words(fileids='2009-Obama.txt')[:23]
def word_fdist(inaug_list):
    fixedspeech = map(str.lower, inaugural.words(inaug_list))
    fixedspeech = filter(checkwords, fixedspeech)
    fdist = FreqDist(fixedspeech)
    return fdist
from nltk.corpus import inaugural
import matplotlib.pyplot as plt
import operator

x = inaugural.words('2009-Obama.txt')
l = {}
new = []
k = {}
z = set(x)
for word in z:
    l[word] = x.count(word)
#print(l)

from nltk.stem import PorterStemmer
ps = PorterStemmer()
for words in x:
    new.append(ps.stem(words))

p = set(new)
for w in p:
    k[w] = new.count(w)

plt.plot(list(k.values()))
#plt.xlabel(k.keys())

k_sorted = sorted(k.items(), key=operator.itemgetter(1), reverse=True)
for word, count in k.items():
    if count == max(k.values()):
        print(word)
print(k_sorted[0])
import nltk

def chi_square(word_one, word_two, corpus):
    word_list = []
    # Import the requested corpus
    if corpus == "brown":
        from nltk.corpus import brown
        word_list = brown.words()
    elif corpus == "reuters":
        from nltk.corpus import reuters
        word_list = reuters.words()
    elif corpus == "gutenberg":
        from nltk.corpus import gutenberg
        word_list = gutenberg.words()
    elif corpus == "webtext":
        from nltk.corpus import webtext
        word_list = webtext.words()
    elif corpus == "inaugural":
        from nltk.corpus import inaugural
        word_list = inaugural.words()

    # Get the frequency of each word
    w1 = word_list.count(word_one)
    w2 = word_list.count(word_two)

    # Get the frequencies of the word pair as a bigram
    bigrams = nltk.bigrams(word_list)
    freq_dist = nltk.FreqDist(bigrams)
    w1w2 = 0
    w1andnotw2 = 0
    notw1andw2 = 0
    notw1andnotw2 = 0
    total_words = len(word_list)
    # Fill the 2x2 contingency table from the bigram counts
    for (first, second), count in freq_dist.items():
        if first == word_one and second == word_two:
            w1w2 += count
        elif first == word_one:
            w1andnotw2 += count
        elif second == word_two:
            notw1andw2 += count
        else:
            notw1andnotw2 += count
    totalw1andw2 = w1w2 + w1andnotw2 + notw1andw2 + notw1andnotw2
    first_row = w1w2 + w1andnotw2
    second_row = notw1andw2 + notw1andnotw2
    first_col = w1w2 + notw1andw2
    second_col = w1andnotw2 + notw1andnotw2

    # Calculate the chi-square value
    # Null hypothesis: there is no collocation between the two words (no association)
    # Expected value for each cell
    value_one = (first_row * first_col) / totalw1andw2
    value_two = (second_row * first_col) / totalw1andw2
    value_three = (first_row * second_col) / totalw1andw2
    value_four = (second_row * second_col) / totalw1andw2
    x2 = ((w1w2 - value_one) ** 2) / value_one
    x2 = x2 + ((w1andnotw2 - value_three) ** 2) / value_three
    x2 = x2 + ((notw1andnotw2 - value_four) ** 2) / value_four
    x2 = x2 + ((notw1andw2 - value_two) ** 2) / value_two

    # Print out the results
    print("C(w1): ", w1)
    print("C(w2): ", w2)
    print("C(w1w2): ", w1w2)
    print("C(w1 && !w2)", w1andnotw2)
    print("C(!w1 && w2)", notw1andw2)
    print("C(!w1 && !w2)", notw1andnotw2)
    print("Total Words: ", total_words)
    print("")
    print("Critical value at the 0.05 level (1 degree of freedom): 3.841")
    print("X^2:", x2)
    degrees_of_freedom = 1
    # Reject the null hypothesis (i.e. call it a collocation) when X^2 exceeds the critical value
    collocation = x2 > 3.841
    print("Do we have a collocation?", collocation)
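# Usage sketch for the function above; the word pair and corpus name are only illustrative, and
# counting bigrams over a full corpus this way is slow:
chi_square("United", "States", "inaugural")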
# Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))

print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS

# Each corpus reader provides a variety of methods to read data from the corpus,
# depending on the format of the corpus.
from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))    # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras('1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))
print(len(inaugural.words()))
print(inaugural.readme())
# In[13]:
from nltk.corpus import inaugural

# In[15]:
inaugural.fileids()

# In[16]:
len(inaugural.fileids())

# In[19]:
inaugural.words(fileids='1861-Lincoln.txt')[:20]

# In[20]:
print(len(inaugural.words(fileids='1861-Lincoln.txt')))

# In[24]:
inaugural.words(fileids='2009-Obama.txt')[:5]

# In[22]:
print(len(inaugural.words(fileids='2009-Obama.txt')))

# In[26]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# import nltk
# nltk.download('inaugural')
import os
from nltk.corpus import inaugural

corpus_from_paragraphs = inaugural.paras(os.path.dirname(__file__) + '/dataset/paragraphs.txt')
corpus_from_sentences = inaugural.sents(os.path.dirname(__file__) + '/dataset/sentences.txt')
corpus_from_words = inaugural.words(os.path.dirname(__file__) + '/dataset/words.txt')

l1 = len(corpus_from_paragraphs)
l2 = len(corpus_from_sentences)
l3 = len(corpus_from_words)
# l2 = 0
# l3 = 0
print('paragraphs: %s, sentences: %s, words: %s' % (l1, l2, l3))
# print(inaugural.readme())
# ## INAUGURAL CORPUS

# In[10]:
from nltk.corpus import inaugural

# In[11]:
inaugural.fileids()

# ### LINCOLN

# In[15]:
inaugural.words(fileids='1861-Lincoln.txt')

# In[16]:
inaugural.words(fileids='1861-Lincoln.txt')[:5]

# ### OBAMA

# In[18]:
inaugural.words(fileids='2009-Obama.txt')

# In[21]:
inaugural.words(fileids='2009-Obama.txt')[:20]
import nltk
from nltk.corpus import reuters

reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=False, title='Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
# You must work in your Dropbox folder so we can see your progress.
# Run your file every time something new is added so you can see how it works.
# There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py.

# === Part 1: Importing Corpuses ===
import nltk
from nltk.corpus import inaugural

print inaugural.fileids()
# Run your file. You should see all the text files containing the speeches of the US presidents
# that NLTK has saved inside it.

# Now add the lines:
print "=============Words in Obama's Speech ======"
print inaugural.words('2009-Obama.txt')  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents('2005-Bush.txt')   # Returns a list of all the sentences in Bush's speech
# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.
# Try adding code to your program to find and print out the first 25 words of Obama's 2009 speech.

# === Part 2: Analysing tokens (words) of a text ===
# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program:
from nltk.book import *
def cfd(text, tgt_list):
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in tgt_list
        if w.lower().startswith(target))
    #cfd.plot()
    return cfd
#############
#Corpus data#
#############

# Inaugural Address Corpus
from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]

# How the words "america" and "war" are used over time.
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'war']
                               if w.lower().startswith(target))
cfd.plot()
#cfd.tabulate()

from nltk.corpus import brown
news_words = brown.words(categories="news")
print(news_words)
freq = nltk.FreqDist(news_words)
freq.plot(30)

from nltk import FreqDist
verbs = ["should", "may", "can"]
genres = ["news", "government", "romance"]
for g in genres:
import nltk

text = '''Donald John Trump (born June 14, 1946) is the 45th and current President of the United States.
Before entering politics, he was a businessman and television personality. '''
grammer = 'Chunk:{<PRP><VB.+><DT>?<NN.?>}'
pos = nltk.pos_tag(nltk.word_tokenize(text))
parser = nltk.RegexpParser(grammer)
chunked = parser.parse(pos)  # input for parse is a list of tuples
for i in chunked.subtrees():
    if i.label() == "Chunk":
        print(i.leaves())

from nltk.corpus import inaugural
text = inaugural.words()
pos = nltk.pos_tag(text)
# JJ + NN, but the NN will be 'people'
# Important: always allow DT in this kind of structure
jjlist = []
parser = nltk.RegexpParser('chunk:{<JJ.?>+<NN.*>}')
chunk = parser.parse(pos)
for i in chunk.subtrees():
    if i.label() == "chunk" and i.leaves()[-1][0] == 'people':
        # i.leaves() = [('american', 'JJ'), ('people', 'NN')]
        # i.leaves()[-1] = ('people', 'NN')
        jj = [x[0] for x in i.leaves()[:-1]]
        jjlist += jj
frequency = nltk.FreqDist(jjlist)
print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words("melville-moby_dick.txt")) print("text1:", text1.name) text2 = Text(gutenberg.words("austen-sense.txt")) print("text2:", text2.name) text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words("chesterton-thursday.txt"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 27 08:19:21 2018

@author: jacobjohn
"""
import nltk
from nltk.corpus import inaugural
import matplotlib.pyplot

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # first four characters of the fileid are the year
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
def answers():
    _rvals = []

    #### Question 1 ####
    print '##### Question 1 #####'
    print '(see code - lines 64-65)'
    print '(NB: the two variables are returned by this function)'
    _bush01 = inaugural.words('2001-Bush.txt')
    bush01_word_lengths = _lengths(_vocabulary(_bush01))
    fd_bush01_words = FreqDist(_nopunct(_bush01))
    _rvals.append(bush01_word_lengths)
    _rvals.append(fd_bush01_words)

    #### Question 2 ####
    print '\n##### Question 2 #####'
    bush01_top10_words = _firsts(fd_bush01_words.items()[:10])
    bush01_average_word_lengths = _avg(bush01_word_lengths)
    _obama09 = inaugural.words('2009-Obama.txt')
    _fd_obama09_words = FreqDist(_nopunct(_obama09))
    _obama09_word_lengths = _lengths(_vocabulary(_obama09))
    obama09_top10_words = _firsts(_fd_obama09_words.items()[:10])
    obama09_average_word_lengths = _avg(_obama09_word_lengths)
    print 'top10 words Bush (2001): ', _str(bush01_top10_words)
    print 'top10 words Obama (2009):', _str(obama09_top10_words)
    print 'average word length Bush (2001): ', bush01_average_word_lengths
    print 'average word length Obama (2009):', obama09_average_word_lengths

    #### Question 3 ####
    print '\n##### Question 3 #####'
    bush01_token_lengths = _avg(_lengths(_nopunct(_bush01)))
    obama09_token_lengths = _avg(_lengths(_nopunct(_obama09)))
    print 'average token length Bush (2001): ', bush01_token_lengths
    print 'average token length Obama (2009):', obama09_token_lengths

    #### Question 4 ####
    print '\n##### Question 4 #####'
    for _fileid in inaugural.fileids():
        _year = int(_fileid.split('-')[0])
        _vocab_size = number_of_word_types(_fileid)
        print 'year %d: %d word types' % (_year, _vocab_size)

    #### Question 5 ####
    print '\n##### Question 5 #####'
    fd_bush01_nostop = FreqDist(_nostops(_nopunct(_bush01)))
    fd_obama09_nostop = FreqDist(_nostops(_nopunct(_obama09)))
    bush01_top10_nostop = _firsts(fd_bush01_nostop.items()[:10])
    obama09_top10_nostop = _firsts(fd_obama09_nostop.items()[:10])
    print 'top10 non-stop-words Bush (2001): ', _str(bush01_top10_nostop)
    print 'top10 non-stop-words Obama (2009):', _str(obama09_top10_nostop)

    #### Question 6 ####
    print '\n##### Question 6 #####'
    _wash89 = inaugural.words('1789-Washington.txt')
    fd_wash89_nostop = FreqDist(_nostops(_nopunct(_wash89)))
    wash89_top10_nostop = _firsts(fd_wash89_nostop.items()[:10])
    print 'top10 non-stop-words Washington (1789):', _str(wash89_top10_nostop)

    #### Question 7 ####
    print '\n##### Question 7 #####'
    wash89_rank_country = rank(fd_wash89_nostop, 'country')
    obama09_rank_country = rank(fd_obama09_nostop, 'country')
    bush01_rank_country = rank(fd_bush01_nostop, 'country')
    print 'rank of "country" in Washington (1789):', wash89_rank_country
    print 'rank of "country" in Obama (2009):', obama09_rank_country
    print 'rank of "country" in Bush (2001):', bush01_rank_country

    #### Question 8 ####
    print '\n##### Question 8 #####'
    print '(see comments in "rank" function on lines 20-45)'

    #### Question 9 ####
    print '\n##### Question 9 #####'
    print '(see plot)'
    ff = inaugural.fileids()
    fdd = {}
    _years = []
    for _fileid in ff:
        fdd[_fileid] = FreqDist(_nostops(inaugural.words(_fileid)))
        _years.append(_fileid[0:4])
    pylab.plot([(lambda d: len(d) / float(d.N()))(fdd[f]) for f in ff])
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('ratio of word types to tokens (without stop-words)')
    pylab.xlabel('time')
    pylab.title('f(time) = #(word types) / #(word tokens)')
    pylab.show()

    #### Question 10 ####
    print '\n##### Question 10 #####'
    print '(see plot)'
    obama09top10_butnot_wash89top10 = [word for word in obama09_top10_nostop
                                       if word in fd_wash89_nostop and word not in wash89_top10_nostop]
    wash89top10_butnot_obama09top10 = [word for word in wash89_top10_nostop
                                       if word in fd_obama09_nostop and word not in obama09_top10_nostop]
    obama09_word = 'world'
    wash89_word = 'government'
    assert(wash89_word in wash89top10_butnot_obama09top10)
    assert(obama09_word in obama09top10_butnot_wash89top10)
    normalisation_justification = (
        "We normalise for different sizes in vocabulary by dividing the rank of "
        "some word by the size of the vocabulary in that speech. "
        "Since rank is in relation with vocabulary size, this is similar to "
        "getting the maximum rank over all speeches and dividing each rank by that "
        "quantity.")
    print normalisation_justification
    _normalised_rank = lambda f, w: min(1, rank(fdd[f], w) /
                                        float(len(_vocabulary(_nostops(fdd[f])))))
    pylab.plot([_normalised_rank(f, obama09_word) for f in ff], label=obama09_word, color='b')
    pylab.plot([_normalised_rank(f, wash89_word) for f in ff], label=wash89_word, color='r')
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('normalised word rank (lower is better)')
    pylab.xlabel('time')
    pylab.title('f(time) = word rank / vocabulary size')
    pylab.legend()
    pylab.show()

    #### Question 11 ####
    print '\n##### Question 11 #####'
    observations_on_plots = (
        "We observe that the rank of 'world' is noisy when observed on the level "
        "of some individual year/inaugural speech. However, when looking at the "
        "larger picture, a trend emerges: 'world''s rank is consistently getting "
        "higher over time - an indicator for an ever-globalising and shrinking "
        "world?"
        "\n"
        "We observe that 'government' is a consistently highly ranked word across "
        "time - except for a few inaugural speeches where it has a very low "
        "rank. Those speeches are around the early 1800s (abolishment of slavery), "
        "the 1860s-70s (US civil war), the early 1900s (World War One), and 1937-1981 "
        "(World War Two + Cold War) - it would seem that presidents don't want to "
        "remind their subjects of the government during hard times. Outliers to "
        "this theory can be explained easily (e.g. the somewhat high rank of "
        "'government' in 1949 = a certain 'evil government' being defeated).")
    print observations_on_plots
    return _rvals
for word1, word2 in product(word_list, word_list):
    count = 0
    n_grams = ngrams(brown.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(treebank.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(inaugural.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(names.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
            count += 1
    n_grams = ngrams(gutenberg.words(), n)
    for grams in n_grams:
        if word1 in grams and word2 in grams:
from nltk.corpus import inaugural as inag
from nltk import ConditionalFreqDist as CondFreqDist

cfd = CondFreqDist([(target, fileid[:4])
                    for fileid in inag.fileids()
                    for word in inag.words(fileid)
                    for target in ["wealth", "peace", "harmony", "prosperous"]
                    if word.lower().startswith(target)])
cfd.plot()
print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words('melville-moby_dick.txt')) print("text1:", text1.name) text2 = Text(gutenberg.words('austen-sense.txt')) print("text2:", text2.name) text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words('singles.txt'), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words('chesterton-thursday.txt'))
__author__ = 'auroua'
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from lda_1 import LDA
import seaborn as sns

stops = set(stopwords.words("english"))
vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1

"""
Sort the vocab
keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50,
                      sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size

# Generate corpus document vectors
data = []
# from nltk.corpus import gutenberg
#
# print(gutenberg.fileids())
# allwords = gutenberg.words('shakespeare-hamlet.txt')
# print(len(allwords))
# print(len(set(allwords)))
# print(allwords.count('Hamlet'))
# A = set(allwords)
# longwords = [w for w in A if len(w) > 12]
# print(sorted(longwords))
#
# from nltk.probability import *
# fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
# print(fd2.B())
# print(fd2.N())
#
# fd2.tabulate(20)
# fd2.plot(20)
# fd2.plot(20, cumulative=True)

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import inaugural

fd3 = FreqDist([s for s in inaugural.words()])
print(fd3.freq('freedom'))

cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid > '1980' and fileid < '2010'
)
print(cfd.items())
cfd.plot()
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from the nps_chat corpus and name it nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from the brown corpus and name it brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from the reuters corpus and name it reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the text from the inaugural corpus and name it inaugral_words
inaugral_words = inaugural.words()
print(inaugral_words)

# Creating a variable for tokenizing words
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the words in the gutenberg corpus and assigning them to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)

# Assigning the stopwords to a variable s
s = set(stopwords.words('english'))

# Removing the stopwords from the gutenberg text
gutenberg_filtered = filter(lambda w: not w in s, tokens)
import nltk
from nltk.corpus import reuters

reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(categories=['corn', 'barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English', 'Finnish_Suomi', 'Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=False, title='Declaration of Human Rights')

from nltk.corpus import PlaintextCorpusReader
corpusRoot = '/home/mv/Dropbox/Computer/UbuntuInstall'
wordlists = PlaintextCorpusReader(corpusRoot, '.*')
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""
import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib

inaugural.fileids()
#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech, word_count_total)

# Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]
print(speech_length)

# Get the max and min speech
print("Max is : ", max(speech_length))
print("Min is : ", min(speech_length))

# Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
    Sents_total = len(inaugural.sents(speech))
# importing library
from nltk.corpus import inaugural

# In[6]:
inaugural.fileids()

# In[7]:
# printing the inaugural words for one text
for i in inaugural.words('1933-Roosevelt.txt'):
    print(i, end=" ")

# In[8]:
'''
College is so hectic, I'm tired
'''

# In[9]:
# importing library
from nltk.corpus
def text4():
    text = Text(inaugural.words(), name="Inaugural Address Corpus")
    print("text4:", text.name)
    return text
from itertools import groupby
from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import inaugural
from nltk.tag import StanfordNERTagger
from nltk.tree import Tree

# Uncomment to check the required StanfordNERTagger environment variables.
# print os.environ.get("CLASSPATH")
# print os.environ.get("STANFORD_MODELS")

# Read the corpus and POS tag it.
POS_tagging = pos_tag(inaugural.words())

# Process the corpus with the NLTK named entity classifier.
ne_nltk = ne_chunk(POS_tagging)

# Filter out in a list only the organization entities. Join by space words that are part of
# the same organization entity (same Tree object).
nltk_organizations = [
    " ".join(w[0] for w in el)
    for el in ne_nltk
    if (type(el) == Tree and el.label() == "ORGANIZATION")
]
# Remove duplicates.
nltk_organizations = set(nltk_organizations)

# Filter out in a list only the person entities. Join by space words that are part of
# the same person entity (same Tree object).
nltk_persons = [
    " ".join(w[0] for w in el)
    for el in ne_nltk
    if (type(el) == Tree and el.label() == "PERSON")
cfd.plot(cumulative=True)
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=True)

# Conditional frequency distributions
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

# Generating random text
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print word,
        word = cfdist[word].max()

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print cfd['living']
generate_model(cfd, 'living')

def unusual_words(text):
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(len(genre_word))  # 170576 (genre, word) pairs
print(genre_word[:4])   # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]  # [_start-genre]
print(genre_word[-4:])  # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]  # [_end-genre]
cfd = ConditionalFreqDist(genre_word)
print(cfd)               # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())  # ['news', 'romance']  # [_conditions-cfd]
print(cfd['news'])       # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])    # <FreqDist with 8452 samples and 70022 outcomes>
print(cfd['romance'].most_common(2))  # [(',', 3899), ('.', 3736)]
print(cfd['romance']['could'])        # 193
print(cfd['romance'].max())           # the most frequent sample in the romance genre
print(cfd['romance'][','])            # 3899

##################################################################
## plot(): how the words America and citizen are used over time in the US presidential inaugural addresses
cfd = ConditionalFreqDist((target, fileid[:4])
                          for fileid in inaugural.fileids()
                          for word in inaugural.words(fileid)
                          for target in ['america', 'citizen']
                          if word.lower().startswith(target))
cfd.plot()  # plot the counts of America and citizen across the speeches

##################################################################
## tabulate(): extracting word pairs
# Next, let's combine regular expressions with conditional frequency distributions.
# Here we will extract all consonant-vowel sequences from the words of Rotokas, such as ka and si.
# Since each of these is a pair, it can be used to initialize a conditional frequency distribution.
# We then tabulate the frequency of each pair:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#    a   e   i   o   u
# k 418 148  94 420 173
# p  83  31 105  34  51
# r 187  63  84  89  79
def process_speech(filename):
    text = inaugural.words(filename)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = clean(text)
    return text
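# remove_punctuation, remove_stopwords, and clean are defined elsewhere in the original program.
# Minimal, hypothetical sketches consistent with the pipeline above (these are assumptions, not
# the original implementations):
import string
from nltk.corpus import stopwords

def remove_punctuation(tokens):
    # Hypothetical helper: drop tokens that are only punctuation.
    return [t for t in tokens if t not in string.punctuation]

def remove_stopwords(tokens):
    # Hypothetical helper: drop English stop words, case-insensitively.
    stops = set(stopwords.words('english'))
    return [t for t in tokens if t.lower() not in stops]

def clean(tokens):
    # Hypothetical helper: lower-case and keep alphabetic tokens only.
    return [t.lower() for t in tokens if t.isalpha()]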
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 20 16:45:50 2019

@author: Ritwik Gupta
"""
# 20/12/19
from nltk.corpus import brown
brown.categories()
print(brown.words(categories='hobbies')[0:5])

from nltk.corpus import inaugural
inaugural.fileids()
inaugural.words(fileids='1933-Roosevelt.txt')[0:10]

from nltk.corpus import webtext
d1 = {}
for i in webtext.fileids():
    d1[i] = webtext.words(fileids=i)[:20]

# Downloaded the MASC data
import nltk
with open('tweets1.txt', 'r') as f:
    text = f.read().strip()
text1 = text.split()
text2 = nltk.Text(text1)
text2.concordance("good", 1)

# Project Gutenberg