from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))
d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
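# For reference, wordlist.txt is a plain one-word-per-line file; the entries
# shown here are invented:
#   nltk
#   corpus
#   corpora
#   wordnet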
def find_info_type(self):
    type_list = []     # list of all types ('abstract', 'speaker', ...)
    content_list = []  # list with the corresponding content
    reader = WordListCorpusReader(self.folder_name, [self.file_name])
    all_words = reader.words()
    # is the mail a proper one?
    if all_words == []:
        return ([], [])
    # append the first tag of the mail, e.g. <0.1....>
    type_list.append("")
    content_list.append(all_words[0])
    for w in all_words[1:]:
        # search for a pattern like "Abstract: ..."
        match = re.search(r'^(\w+)(:)', w)
        if match is not None:
            # use the group functionality to split the topic from the content
            type_list.append(match.group(1))
            content = re.search(r'^(\w+:)(.*)', w)
            content_list.append(content.group(2))
        elif len(content_list) > 0:
            # not the best way to re-attach the \n-split content, but it works
            content_list[-1] = content_list[-1] + "\n" + w
    # if type_list[0] is 'abstract', content_list[0] holds the abstract text
    return (type_list, content_list)
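# A hedged, self-contained illustration of the "Topic: content" split used in
# find_info_type above; the sample line is invented for demonstration only.
import re

line = "Abstract: We present a new parser."
m = re.search(r'^(\w+)(:)', line)
if m is not None:
    topic = m.group(1)                               # 'Abstract'
    body = re.search(r'^(\w+:)(.*)', line).group(2)  # ' We present a new parser.'
    print(topic, '->', body)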
def __init__(self, punctuation_marks: str, corpus_dir: str, corpus_files: list):
    reader = WordListCorpusReader(corpus_dir, corpus_files)
    self.vi_dict = set(reader.words())
    # Add the punctuation marks to the dictionary; treat them as correctly spelled
    self.vi_dict.update(list(punctuation_marks))
    # Add a few special words (units and abbreviations)
    self.vi_dict.update(['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
    self.re_d = re.compile(r'\d')
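# A minimal sketch (not from the original project) of how such a dictionary
# set can flag out-of-vocabulary tokens; the sample dictionary and tokens
# below are invented.
import re

vi_dict = {'xin', 'chào', 'bạn', ',', 'km'}  # stand-in for self.vi_dict
re_d = re.compile(r'\d')
for token in ['xin', 'chào', 'zzz', '12', 'km']:
    known = token in vi_dict or re_d.search(token) is not None
    print(token, 'ok' if known else 'not in dictionary')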
import os
import nltk
from nltk.corpus.reader import WordListCorpusReader

def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)
    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')
    reader = WordListCorpusReader('/Users/Dell/nltk_data/corpora/cookbook/',
                                  ['wordlist.txt'])
    print(reader.words())
from nltk.corpus.reader import WordListCorpusReader
from nltk.tokenize import RegexpTokenizer

def tokenize_file(file, corpus_root, english_stops):
    # tokenize the input file, count words and characters, remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
    item_count = 0
    total_chars = 0
    word_count = 0
    wordlist = []
    reader = WordListCorpusReader(corpus_root, file)
    chunks = reader.words()
    for item in chunks:
        total_chars += len(chunks[item_count])
        word_tokens = tokenizer.tokenize(chunks[item_count])
        word_count += len(word_tokens)
        item_count += 1
        for word in word_tokens:
            wordlist.append(word)
    stopsout = [word for word in wordlist if word.lower() not in english_stops]
    return wordlist, stopsout, word_count, total_chars
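# A hypothetical call site for tokenize_file; the corpus root and file name
# are invented, and english_stops is the standard NLTK stopword set.
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))
wordlist, stopsout, word_count, total_chars = tokenize_file(
    ['sample.txt'], '/path/to/corpus', english_stops)
print(word_count, total_chars, len(stopsout))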
import re
from os import listdir
from os.path import isfile, join
from nltk.corpus.reader import WordListCorpusReader

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [f for f in listdir(untagged_data_filepath)
              if isfile(join(untagged_data_filepath, f))]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])
corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)


def tokenise(corpus):
    return re.findall(r"([^\s<>]+)[\s\n<>]", corpus)


def get_name_of_poster(corpus):
    # TODO: the findall pattern was missing in the original excerpt
    return re.findall()


def names_in_file(corpus):
    pass  # body truncated in the excerpt
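# Hedged demonstration of get_tags_by_name on an invented tagged fragment.
sample = "<speaker>Dr. Smith</speaker> will talk at three."
print(get_tags_by_name(sample, "speaker"))  # ['<speaker>Dr. Smith</speaker>']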
# from nltk.stem import WordNetLemmatizer
# wnl = WordNetLemmatizer()
# print(wnl.lemmatize('monsters'))

'''
In each of the above cases we have handled one word. Now print the stemmed and
lemmatized versions of all the words in the document computerscience.txt.
Here is an overview of what you need to do:
1. Load the file into a reader [Hint: reader = WordListCorpusReader( ... )]
2. Use word_tokenize from nltk.tokenize to convert the text into words.
3. Loop through the text [Hint: use the for statement].
4. Lemmatize and stem each word.
5. Look at the difference between the two; notice how the lemmatizer makes
   mistakes in some cases - can you identify why and propose a solution?
'''
import nltk
from nltk.corpus.reader import WordListCorpusReader

tokens = []
reader = WordListCorpusReader('./', ['computerscience.txt'])
for count, ele in enumerate(reader.words()):
    print(count, ":", ele, "\n")
    tokens += nltk.word_tokenize(ele)
print(tokens)

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

for token in tokens:
    print(token)
    print(wnl.lemmatize(token))
    print(stemmer.stem(token))
    # 35. Possessive wh-pronoun
    WP_ = 'WP$'
    # 36. Wh-adverb
    WRB = 'WRB'

    @staticmethod
    def nounish(word, pos):
        # nltk apparently defaults to 'NN' for smileys :) so special-case those
        return pos in (POS.NN, POS.NNS, POS.NNP, POS.NNPS) and \
            any(c.isalpha() for c in word)


mass_noun_corpora = WordListCorpusReader('wordlist/massnoun', r'[a-z]+')
mass_nouns = mass_noun_corpora.words()

QUANTITY_POS_TAGS = frozenset((
    POS.JJ,
    POS.VBN,
    POS.VBP,
    POS.NN,
    POS.NNP,
    POS.RB,
    POS.RBR,
    POS.RBS,
))

bad_words_corpora = WordListCorpusReader('wordlist/shutterstock-bad-words',
                                         r'[a-z]{2,3}')
bad_words_en = bad_words_corpora.words('en')
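# Hedged usage sketch for nounish (POS is the enclosing class in the excerpt):
# a smiley that the default tagger labels 'NN' is rejected because it contains
# no alphabetic character.
#   POS.nounish('dogs', POS.NN)  -> True
#   POS.nounish(':)', POS.NN)    -> False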
for l in locs:
    file.write(l + "\n")
file.close()

file = open(nepath + "\\orgs.txt", "w")
for o in orgs:
    file.write(o + "\n")
file.close()

# extract the new data into corpora
fullnames = WordListCorpusReader(nepath, ['names.txt'])
locs = WordListCorpusReader(nepath, ['locs.txt'])
orgs = WordListCorpusReader(nepath, ['orgs.txt'])

# get the words from the named-entity corpora created earlier
names = names.words()
titles = titles.words()
fullnames = fullnames.words()
locations = locs.words()
organisations = orgs.words()
orgsuffs = orgsuffs.words()
daymonths = daymonths.words()

# extracting named entities (proper nouns):
# if a proper noun is found, add it to the list;
# adjacent proper nouns are joined together
def findPropers(words):
    propers = []
    last = False
    words = [word for word in tokens_nopunct if word.isalpha()]

    # Remove stopwords from the tokens
    words_nostops = [word for word in words
                     if word not in english_stops_nopunct]

    # Stem the words
    porter = PorterStemmer()
    stems = [porter.stem(word) for word in words_nostops]

    # Create a frequency distribution from the samples (words)
    freqdist = FreqDist(stems)

    # Create a dict with the frequency of the insect words only
    insect_freq = {word: freqdist.freq(word) for word in insect_words.words()}

    # Add the year from the file name to the dict
    year = re.findall(r'\d{4}', file)
    insect_freq['year'] = year[0]

    # Add the results from this file to the total results
    freq_data.append(insect_freq)
    count += 1

# Create a Pandas DataFrame
df = pd.DataFrame(freq_data)

print('PLOTTING...')
x = nltk.data.load('big.txt', format='auto')

'''
reader = WordListCorpusReader('', ['wordlist.txt', 'wordlist2.txt'])
print(reader.words())
print(reader.fileids())

stemmer = PorterStemmer()
print(stemmer.stem('running'))

wnl = WordNetLemmatizer()
print(wnl.lemmatize('dogs'))
'''

csReader = WordListCorpusReader('', 'computerscience.txt')
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

# concatenate the list of words from the reader, then tokenize
csWords = nltk.word_tokenize(' '.join(csReader.words()))
print(type(csWords))

for word in csWords:
    print("%s,%s" % (wnl.lemmatize(word), stemmer.stem(word)))

inputList = ['16/12/2016']
for inputString in inputList:
    # the separator class includes '/' so that dates like 16/12/2016 match
    print(re.findall(r'(.*?)[\s\-/\\](.*?)[\s\-/\\](.*)', inputString))
    if p_score > n_score:
        return "positive"
    elif n_score > p_score:
        return "negative"
    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])
pos_feats = [({word: True}, 'positive') for word in reader.words('positive.txt')]
neg_feats = [({word: True}, 'negative') for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))

connection = pymongo.Connection()
db = connection.twitter
mentions = db.mentions

screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]
re_RT = re.compile(r"(RT\s?@YOUR_ACCOUNT|"
                   r"RT\s?@YOUR_OTHER_ACCOUNT)", re.UNICODE | re.IGNORECASE)
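# Hedged example of scoring a new mention with the classifier trained above;
# the sample tokens are invented.
tweet_tokens = "great service loved it".split()
feats = dict((word, True) for word in tweet_tokens)
label = classifier.classify(feats)  # 'positive' or 'negative', per the training data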
    docWords = corpus.words(fileName)
    for word in docWords:
        # print(word)
        w = word.lower()
        if w in wordSet:
            # could also use the fd.inc approach here, which is probably
            # better - just showing another option
            print(w + " is in " + fileName)
            counter += 1
    billCounts.append(counter)
    return billCounts

from nltk.corpus.reader import WordListCorpusReader

path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt'])

# make an nltk word list
crime = reader.words()
crime = [word.lower().strip() for word in crime]
crimeSet = set([w.lower() for w in crime])

crimeCount = make_count(billsCorpora, crimeSet)
fd = count_stems(billsCorpora)
counter = 0

# let's look at 200 of the most popular items and their counts;
# you could use the csv writer methods or this, which is kind of hacky
mywordlist = numpy.asarray([billsCorpora.fileids(), crimeCount])
mywordlist[0][1]  # name
mywordlist[1][1]  # count
import os
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)

import nltk.data
# path in nltk.data.path
print path

''' note that this should be a path in the Git_Workspace on D:\ '''

''' load a sample wordlist '''
# import nltk.data
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')
# -> 'nltk\n'

from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader(path + '/corpora/cookbook/', ['GL_Sequent.txt'])
reader.words()

''' reading a tagged corpus '''
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos')
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

''' a different tokenizer - does it work? '''
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos',
                            word_tokenizer=SpaceTokenizer())
reader.words()
# Initialize constants
NLTK_HOME = '/home/administrator/nltk_data'
l_list = []

# cleaning, tokenizing, normalizing
# Read the corpus
state_reader = WordListCorpusReader(NLTK_HOME, ['state_files.txt'])
city_reader = WordListCorpusReader(NLTK_HOME, ['city_files.txt'])

train_file = '/app/ai/train_file.txt'
test_results_file = '/app/ai/test_city_results_file.txt'

# Store the URLs in a list
urls = ([(url, 'city') for url in city_reader.words()] +
        [(url, 'state') for url in state_reader.words()])

for url in list(urls):
    # Remove HTML tags after reading the URL
    raw = nltk.clean_html(urlopen(url[0]).read())
    print 'Finished cleaning html for ', url[0]

    # Compute the frequency distribution of the words
    tokens = nltk.FreqDist(word_normalizer(word.lower()
                           for word in wordpunct_tokenize(raw)))
    print 'Finished computing FD for ', url[0]

    l_list = l_list + [(geo_features(word), url[1]) for word in tokens.keys()[:10]]
    print 'Finished extracting feature for ', url[0]

with open(train_file, 'w') as f:
    pickle.dump(l_list, f)
import nltk
from nltk.corpus.reader import WordListCorpusReader

reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from nltk.stem import WordNetLemmatizer
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]
print(stemmed)
import os
import re
import sys
import json

import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')

reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()

reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()

reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()

json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
    return counter
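# Hypothetical sanity check for count_positive_emoticons; the token list is
# invented and assumes ':)' is listed in positive_emoticons.txt.
sample_tokens = ['adorei', 'o', 'filme', ':)']
print(count_positive_emoticons(sample_tokens))  # 1 if ':)' is in the wordlist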
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
########## WORDLIST CORPUS READER ###############

# Basic corpus reader
from nltk.corpus.reader import WordListCorpusReader
# List of a few thousand names organized by gender
from nltk.corpus import names
# List of English words
from nltk.corpus import words

nltkDir = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile = "mywords.txt"
#source = nltkDir + nltkFile

### Single-file WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
print reader.words()
print reader.fileids()

### Multi-file WordListCorpusReader
# To get the names of the files in the corpus, use the "fileids" command
names.fileids()
print len(names.words('female.txt'))
print len(names.words('male.txt'))

words.fileids()
print len(words.words('en-basic'))
print len(words.words('en'))

### Chunked Corpus Reader