fdist1.plot(20, cumulative=False) #Print list of all of frequently used words top = [] print "\nMost Frequent Terms" for (key, value) in sorted(fdist1.items(), key=operator.itemgetter(0)): if value > 2: top.append(key) print key, ":", value #concordance print "\nConfiguring concordance of most frequently used words" for word in top: print " " print file.concordance(word, 150, lines=all) #similar print "\nWords similar to most used words throughout book:" for word in top: print word, ":", file2.similar(word) print " " #Dispersion plots of top10 and collocations# print "\nProcessing dispersion plot of ten most common words..." print file.dispersion_plot(top) sys.exit()
for file in files: if file.endswith(".txt"): print file file_name = raw_input("Choose the file:") print "The file that was chosen is {0}".format(file_name) from nltk.corpus import PlaintextCorpusReader corpus_root = "." # "." means the existing directory I am in search_text = PlaintextCorpusReader(corpus_root, file_name) search_text = nltk.Text(search_text.words()) #creates text object keyword = raw_input("Specify word to search:") search_text.concordance(keyword, 80, lines=30) ##NEW THING## from nltk.corpus import PlaintextCorpusReader corpus_root = '.' search_text = PlaintextCorpusReader(corpus_root, file_name) search_text = nltk.Text(search_text.words()) # from nltk.corpus import stopwords ## path is andreaantenan/Desktop/cs195/nltk_data/corpora/stopwords/english.txt stopwords = nltk.corpus.stopwords.words('bible.txt') search_text = [word for word in search_text if word.lower() not in stopwords] #frequency distribution vocabulary list; fd is a dictionary#
# Build an NLTK Text from a local HTML corpus file two ways, inspect it,
# then strip the markup with BeautifulSoup and re-tokenize the visible text.

# Way 1: words pulled through a corpus reader already bound to `text`
# (defined earlier in the session -- not visible in this chunk).
words = text.words('e961024.htm')
words = list(words)  # convert to a plain list of words

# Way 2: read the raw file ourselves and tokenize it.
import nltk

path = '/Users/27AGO2019/Desktop/AbiiSnn/GitHub/Natural-Language-Processing/corpus/e961024.htm'
# FIX: a context manager guarantees the handle is closed even on error
# (the original opened, read, and closed by hand).
with open(path, encoding='utf-8') as f:  # encodings seen here: utf-8, latin-1
    text_string = f.read()

tokens = nltk.word_tokenize(text_string)
text = nltk.Text(tokens)
print(text[:100])  # the tokenizer has separated the symbols

text.concordance('actividad')
text.similar('actividad')
print(type(text))
print(len(text))

# HTML: strip the markup and tokenize only the visible text.
from bs4 import BeautifulSoup

soup = BeautifulSoup(text_string, 'lxml')
text = soup.get_text()
print(type(text))    # FIX: the bare expression `type(text)` did nothing in a script
tokens = nltk.word_tokenize(text)
print(tokens[:50])   # FIX: the bare trailing `tokens` did nothing in a script
#file_name = sys.argv[1] #search_word = sys.argv[2] file_name = raw_input("\nChoose one of these files: ") print "\nThe file that will be examined is {0}".format(file_name) from nltk.corpus import PlaintextCorpusReader corpus_root = '.' search_text = PlaintextCorpusReader(corpus_root, file_name) search_text = nltk.Text(search_text.words()) # KWIC concordance search_word = raw_input( "Specify a search word for a keyword in context concordance list: ") search_text.concordance(search_word, 80, lines=1000) # Apply stopwords to search_text from nltk.corpus import stopwords stopwords = nltk.corpus.stopwords.words('bible') #/Users/barrybandstra/nltk_data/corpora/stopwords search_text = [word for word in search_text if word.lower() not in stopwords] # Write search to output.txt file" output_file = open("output.txt", "w") for line in search_text: output_file.write(line), "\n" output_file.close() # Frequency distribution vocabulary list; fd is a dictionary fd = nltk.FreqDist(search_text)
# Cleaned-up IPython session history: build a corpus reader over the
# ASOIAF .txt files and inspect it.  The raw history contained a hard
# syntax error (`*.txt'`), a reference to the undefined name `wordlist`,
# `curr_dir.str()` (str is not a method of str), an os.system() call given
# two arguments (TypeError -- it takes a single command string), and
# `curr_dir` being clobbered with os.system()'s integer exit status.
# Only the steps that actually work are kept.
import os
import nltk
from nltk.corpus import PlaintextCorpusReader

curr_dir = os.getcwd()
os.system("ls " + curr_dir + '/ASOIAF/')

# The fileid pattern is a regular expression, so the dot must be escaped
# ('.*\.txt', not the glob '*.txt').
wordlists = PlaintextCorpusReader(curr_dir + '/ASOIAF/', r'.*\.txt')
print(wordlists.fileids())
print(wordlists.words())

# BUG FIX: concordance() is a method of nltk.Text, not of the corpus
# reader -- calling it on the reader raised AttributeError.
nltk.Text(wordlists.words()).concordance("Arya")

# Can also import bracket parse corpora (penn tree bank)
get_ipython().magic('save january_26_2016.py 0-1000000')
#file_name = sys.argv[1] #search_word = sys.argv[2] file_name = raw_input("\nChoose one of these files: ") print "\nThe file that will be examined is {0}".format(file_name) from nltk.corpus import PlaintextCorpusReader corpus_root = '.' search_text = PlaintextCorpusReader(corpus_root,file_name) search_text = nltk.Text(search_text.words()) # KWIC concordance search_word = raw_input("Specify a search word for a keyword in context concordance list: ") search_text.concordance(search_word,80,lines=1000) # Apply stopwords to search_text from nltk.corpus import stopwords stopwords = nltk.corpus.stopwords.words('bible') #/Users/barrybandstra/nltk_data/corpora/stopwords search_text = [word for word in search_text if word.lower() not in stopwords] # Write search to output.txt file" output_file = open("output.txt", "w") for line in search_text: output_file.write(line),"\n" output_file.close() # Frequency distribution vocabulary list; fd is a dictionary fd = nltk.FreqDist(search_text)
import nltk import os # Retrieve a file list files = os.listdir('.') print "All the files in the directory:" for file in files: if file.endswith('.txt'): print file file_name = raw_input("Choose a file: ") print "The file that was chosen is {0}".format(file_name) from nltk.corpus import PlaintextCorpusReader corpus_root = "." search_text = PlaintextCorpusReader(corpus_root,file_name) search_text = nltk.Text(search_text.words()) keyword = raw_input("Specify a search term: ") search_text.concordance(keyword,80,lines=30)