# Example #1
fdist1.plot(20, cumulative=False)

#Print list of all of frequently used words
top = []

print "\nMost Frequent Terms"
for (key, value) in sorted(fdist1.items(), key=operator.itemgetter(0)):
    if value > 2:
        top.append(key)
        print key, ":", value

#concordance
print "\nConfiguring concordance of most frequently used words"
for word in top:
    print " "
    print file.concordance(word, 150, lines=all)

#similar

print "\nWords similar to most used words throughout book:"
for word in top:
    print word, ":", file2.similar(word)
    print " "

#Dispersion plots of top10 and collocations#
print "\nProcessing dispersion plot of ten most common words..."
print file.dispersion_plot(top)

sys.exit()
# Example #2
for file in files:
    if file.endswith(".txt"):
        print file

file_name = raw_input("Choose the file:")

print "The file that was chosen is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = "."  # "." means the existing directory I am in
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())  #creates text object

keyword = raw_input("Specify word to search:")

search_text.concordance(keyword, 80, lines=30)

##NEW THING##
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())

#
from nltk.corpus import stopwords
## path is andreaantenan/Desktop/cs195/nltk_data/corpora/stopwords/english.txt
stopwords = nltk.corpus.stopwords.words('bible.txt')

search_text = [word for word in search_text if word.lower() not in stopwords]

#frequency distribution vocabulary list; fd is a dictionary#
# Example #3
# NOTE(review): ``text`` on the next line refers to a corpus reader created
# earlier in the session; it is not defined in this fragment.
words = text.words('e961024.htm')
words = list(words)  # convert to a plain list of words

# Own corpus, way 2: read the raw file ourselves.
import nltk

path = '/Users/27AGO2019/Desktop/AbiiSnn/GitHub/Natural-Language-Processing/corpus/e961024.htm'
# ``with`` guarantees the handle is closed even if read() raises; the
# original opened and closed manually.  Encodings tried: utf-8, latin-1.
with open(path, encoding='utf-8') as f:
    text_string = f.read()

tokens = nltk.word_tokenize(text_string)
text = nltk.Text(tokens)
print(text[:100])  # punctuation symbols come out as separate tokens

text.concordance('actividad')
text.similar('actividad')

print(type(text))
print(len(text))

# HTML: strip the markup with BeautifulSoup before tokenizing.
from bs4 import BeautifulSoup

soup = BeautifulSoup(text_string, 'lxml')
text = soup.get_text()
type(text)

tokens = nltk.word_tokenize(text)
tokens
# Example #4
#file_name = sys.argv[1]
#search_word = sys.argv[2]

file_name = raw_input("\nChoose one of these files: ")

print "\nThe file that will be examined is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())

# KWIC concordance
search_word = raw_input(
    "Specify a search word for a keyword in context concordance list: ")
search_text.concordance(search_word, 80, lines=1000)

# Apply stopwords to search_text
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('bible')
#/Users/barrybandstra/nltk_data/corpora/stopwords
search_text = [word for word in search_text if word.lower() not in stopwords]

# Write search to output.txt file"
output_file = open("output.txt", "w")
for line in search_text:
    output_file.write(line), "\n"
output_file.close()

# Frequency distribution vocabulary list; fd is a dictionary
fd = nltk.FreqDist(search_text)
# Example #5
# NOTE(review): this block is a saved IPython session transcript (see the
# %save magics at the bottom), not a runnable script.  Failed attempts were
# preserved alongside the working ones; the lines flagged below do not parse
# or raised at runtime.
wordlists = PlaintextCorpusReader(curr_dir, '/ASOIAF/*.txt')
# NOTE(review): syntax error — the opening quote before *.txt is missing.
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', *.txt')
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', '*.txt')
# NOTE(review): os.system returns an int exit status, so this clobbers
# curr_dir with a number; it is recomputed with os.getcwd() further down.
curr_dir = os.system('ls '+curr_dir+'/ASOIAF/')
os.system('ls '+curr_dir+'/ASOIAF/')
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', '*.txt')
os.system("ls "+curr_dir)
# NOTE(review): ints have no .str() method — this raised AttributeError.
os.system("ls "+curr_dir.str())
curr_dir
os.path.dirname(os.path.realpath(__file__))
os.getcwd()
curr_dir = os.getcwd()
os.system("ls "+curr_dir)
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', '*.txt')
# NOTE(review): os.system takes a single command string; the extra '*.txt'
# argument raised TypeError.
os.system("ls "+curr_dir+'/ASOIAF/', '*.txt')
os.system("ls "+curr_dir+'/ASOIAF/')
os.system("ls "+curr_dir+'/ASOIAF/')
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', '*.txt')
# PlaintextCorpusReader's second argument is a regex, hence '.*\.txt'
# instead of the shell glob '*.txt' tried above.
wordlists = PlaintextCorpusReader(curr_dir+'/ASOIAF/', '.*\.txt')
# NOTE(review): 'wordlist' (singular) was a typo — NameError.
wordlist
wordlists
wordlists.words()
# NOTE(review): corpus readers have no concordance(); wrap the words in
# nltk.Text first — this presumably raised AttributeError.  TODO confirm.
wordlists.concordance("Arya")
wordlists.fileids()
#
# Can also import bracket parse corpora (penn tree bank)
get_ipython().magic('save -f january_26_2016.py 0 - *')
get_ipython().magic('save -f january_26_2016.py')
get_ipython().magic('save -f january_23_2016.py 0-*')
get_ipython().magic('save january_26_2016.py 0-1000000')
# Example #6
#file_name = sys.argv[1]
#search_word = sys.argv[2]

file_name = raw_input("\nChoose one of these files: ")

print "\nThe file that will be examined is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
search_text = PlaintextCorpusReader(corpus_root,file_name)
search_text = nltk.Text(search_text.words())


# KWIC concordance
search_word = raw_input("Specify a search word for a keyword in context concordance list: ")
search_text.concordance(search_word,80,lines=1000)

# Apply stopwords to search_text
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('bible')
#/Users/barrybandstra/nltk_data/corpora/stopwords
search_text = [word for word in search_text if word.lower() not in stopwords]

# Write search to output.txt file"
output_file = open("output.txt", "w")
for line in search_text:
	output_file.write(line),"\n"
output_file.close()

# Frequency distribution vocabulary list; fd is a dictionary
fd = nltk.FreqDist(search_text)
# Example #7
import nltk
import os

# Retrieve a file list
files = os.listdir('.')
print "All the files in the directory:"
for file in files:
	if file.endswith('.txt'):
		print file

file_name = raw_input("Choose a file: ")

print "The file that was chosen is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = "."
search_text = PlaintextCorpusReader(corpus_root,file_name)
search_text = nltk.Text(search_text.words())

keyword = raw_input("Specify a search term: ")

search_text.concordance(keyword,80,lines=30)