def conditional_freq_distribution():
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in ['sexy', 'guy']
        if posts.lower().startswith(target))
    cfd.plot()
def find_all():
    """Search tokenized text."""
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>")
    chat.findall(r"<l.*>{3,}")
def searchText():
    from nltk.corpus import gutenberg, nps_chat, brown
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>")
    chat.findall(r"<l.*>{3,}")
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
def searchTokenText():
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    print(moby.findall(r"<a> (<.*>) <man>"))
    chat = nltk.Text(nps_chat.words())
    print(chat.findall(r"<.*> <.*> <bro>"))
    print(chat.findall(r"<l.*>{3,}"))
    from nltk.corpus import brown
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
def generate_greeting_classifier_nps():
    global greeting_classifier
    try:
        with open('greet_classifier.pickle', 'rb') as f:
            greeting_classifier = pickle.load(f)
    except FileNotFoundError:
        v = set([w.lower() for w in nps_chat.words()])
        posts = nps_chat.xml_posts()[:5000]
        h = [(sentence_features(s.text.lower(), v=v),
              s.get('class') if s.get('class') in ['Greet', 'Bye'] else 'Other')
             for s in posts]
        generate_greeting_classifier(h)
        with open('greet_classifier.pickle', 'wb') as f:
            pickle.dump(greeting_classifier, f)
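# generate_greeting_classifier_nps() above and classify_greeting() further down rely
# on sentence_features() and generate_greeting_classifier() helpers that are not
# shown here. A minimal sketch, assuming a simple bag-of-words feature extractor and
# an NLTK naive Bayes model (an assumption, not the original implementation):
def sentence_features(sentence, v):
    # one boolean feature per in-vocabulary token in the sentence
    return {'contains({})'.format(w): True
            for w in nltk.word_tokenize(sentence) if w in v}

def generate_greeting_classifier(labeled_featuresets):
    global greeting_classifier
    greeting_classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets)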
def lookupTagger(r, c):  # r = number of most frequent words, c = corpus name
    if c == "brown":
        fDist = ConditionalFreqDist(brownTW)
        freqDist = FreqDist(brown.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
    if c == "chat":
        fDist = ConditionalFreqDist(chatTW)
        freqDist = FreqDist(chat.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
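# Hypothetical setup and usage for lookupTagger(). brownTW, chatTW and chat are not
# defined in the snippet above, so the assignments below are assumptions consistent
# with how they are used (and with the nltk.corpus imports the snippet implies):
brownTW = brown.tagged_words()
chatTW = nps_chat.tagged_words()
chat = nps_chat
baseline = lookupTagger(100, "brown")
print(baseline.evaluate(brown.tagged_sents()[:500]))  # evaluate() is accuracy() in newer NLTK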
def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)
    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist(
        (tokens, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in [tokens]
        # you need a check if len(samples) < 1
        # you don't need to use a format specifier to get string length
        if posts.lower().startswith(str(target)))
    print("printing flagList " + str(tokens))
    print("cfd values: " + str(cfd.keys()))
    # problem here with "max() arg is an empty sequence" if we try to .tabulate()
    cfd.tabulate(cumulative=True)
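# A hedged sketch of what the TODO above may be aiming for (not the project's final
# code): use each flag token as its own condition, which keeps the keys hashable,
# and skip tabulate() when nothing matched, avoiding the "max() arg is an empty
# sequence" error noted in the comments. Assumes the nps and flagList globals set
# up in the teen-chat snippet further down.
def calculate_flags_sketch():
    tokens = nltk.word_tokenize(flagList)
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for post in nps.words(fileid)
        for target in tokens
        if post.lower().startswith(target.lower()))
    if cfd:
        cfd.tabulate(cumulative=True)
    else:
        print("no flagged words found")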
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import nps_chat
##################################################################
## Quick look at the corpus
print(type(nps_chat))  # <class 'nltk.corpus.reader.nps_chat.NPSChatCorpusReader'>
print(len(nps_chat.fileids()))  # 15
print(nps_chat.fileids())  # ['10-19-20s_706posts.xml', '10-19-30s_705posts.xml', '10-19-40s_686posts.xml', '10-19-adults_706posts.xml', '10-24-40s_706posts.xml', '10-26-teens_706posts.xml', '11-06-adults_706posts.xml', '11-08-20s_705posts.xml', '11-08-40s_706posts.xml', '11-08-adults_705posts.xml', '11-08-teens_706posts.xml', '11-09-20s_706posts.xml', '11-09-40s_706posts.xml', '11-09-adults_706posts.xml', '11-09-teens_706posts.xml']
print(len(nps_chat.words('10-19-20s_706posts.xml')))  # 2829
print(nps_chat.words('10-19-20s_706posts.xml')[:10])  # ['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey']
##################################################################
## posts()
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])  # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']
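##################################################################
## The reader also exposes tagged and XML views of the same posts; the calls below
## are illustrative additions, not part of the original script
print(nps_chat.tagged_posts('10-19-20s_706posts.xml')[123])  # list of (word, tag) pairs
post = nps_chat.xml_posts('10-19-20s_706posts.xml')[123]
print(post.get('class'), post.text)  # dialogue-act class and raw post text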
# Write a function novel10(text) that prints any word that appeared in the
# last 10% of a text and had not been encountered earlier.
from nltk.corpus import nps_chat

def novel10(text):
    # find the point at which to cut the text
    cut = int(0.9 * len(text))
    # cut the text
    first_part, second_part = text[:cut], text[cut:]
    # make a set of each part, leaving only unique words
    unique_words_first_part = set(first_part)
    unique_words_second_part = set(second_part)
    # return a new list of words that only appear in the last 10%
    return [word for word in unique_words_second_part
            if word not in unique_words_first_part]

text = nps_chat.words()
print(novel10(text))
import re
from random import shuffle
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from gensim.models.doc2vec import LabeledSentence, Doc2Vec

gendered_terms = [r'\bhe\b', r'\bhes', r'\bshe\b', r'\bshes\b', r'\bhis\b',
                  r'\bher\b', r'\bbro\b', r'\bman\b', r'\bsir\b', r'\bdude\b',
                  r'\bgirl\b', r'\bgirls\b', r'\blady\b', r'\bgurl\b', r'\bhims\b',
                  r'\bhers\b', r'\bhisself\b', r'\bherself\b', r'\bman\b', r'\bwoman\b']

dictionary_words = {}
for x in nps_chat.words() + webtext.words():
    dictionary_words[x] = True
print(len(dictionary_words))

class LabeledLineSentence(object):
    def __init__(self, messages_dic, is_sample=True):
        self.documents = []
        self.messages_dic = messages_dic
        self.is_sample = is_sample

    def __iter__(self):
        for user in self.messages_dic:
            if self.is_sample:
                for i in range(200):
import os, sys, re
from nltk.corpus import brown
from nltk.corpus import cess_cat
from nltk.corpus import nps_chat
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.tokenize import word_tokenize, wordpunct_tokenize  # Tokenizer
from nltk.tokenize import RegexpTokenizer

if __name__ == "__main__":
    urlRegex = '(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
    specRegex = "([#@]+[\w']+)"
    symbolsRegex = '[\^=<>.,!?:;\(\)_\"]+'
    simpleWordRegex = "[\w'-]+"
    tTwit = list(nps_chat.words())
    # estimator for smoothing the N-gram model
    estimator1 = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    tokens1 = list(brown.words())
    # N-gram language model with 3-grams
    model = NgramModel(3, tokens1, estimator=estimator1)
    twitsFile = sys.argv[1]
    varsFile = sys.argv[2]
    outFile = sys.argv[3]
    outTwitFile = sys.argv[4]
    mode = sys.argv[5]
import nltk
import time

nltk.download('nps_chat')
from nltk.corpus import nps_chat

for i in nps_chat.words():
    print("Raw word: " + i)
    token = nltk.word_tokenize(i)[0]
    print("Token: " + token)
    print("---")
    time.sleep(.5)
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['su'])
print(cv_index['po'])

import re
import nltk

def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

from nltk.corpus import gutenberg, nps_chat
import nltk

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r"<a> (<.*>) <man>"))
chat = nltk.Text(nps_chat.words())
print(chat.findall(r"<.*><.*><bro>"))
print(chat.findall(r"<l.*>{3,}"))
def classify_greeting(s):
    v = set([w.lower() for w in nps_chat.words()])
    return greeting_classifier.classify(sentence_features(s.lower(), v=v))
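# Hypothetical usage, assuming generate_greeting_classifier_nps() above has already
# been run so that greeting_classifier is populated:
print(classify_greeting("hey everyone"))   # prints one of 'Greet', 'Bye' or 'Other'
print(classify_greeting("gotta go, bye"))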
import os
import pickle

from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import words
from normalise.data.contraction_list import contractions
from normalise.data.tech_words import tech_words

mod_path = os.path.dirname(__file__)
with open('{}/data/wordlist.pickle'.format(mod_path), mode='rb') as file:
    wordlist = pickle.load(file)
with open('{}/data/fake_data.pickle'.format(mod_path), mode='rb') as file:
    fake_data = pickle.load(file)

if __name__ == '__main__':
    word_tokenized = brown.words() + nps_chat.words() + fake_data
    brown_lower = {w.lower() for w in brown.words() if len(w) > 4 and w.isalpha()}
    names_lower = {w.lower() for w in names.words()}
    words_lower = {w.lower() for w in words.words('en') if len(w) > 1}
    wordlist = (brown_lower | names_lower | words_lower
                | set(tech_words) | {'I', 'i', 'a', 'A'})
    word_tokenized_lowered = [w.lower() if w.lower() in wordlist else w
                              for w in word_tokenized]
    word_tokenized = list(word_tokenized)

# Conditions for identification of NSWs.
def cond1(w):
    """Return word if its lower-cased form is not in the wordlist."""
    return w.lower() not in wordlist or w == 'US'
R0 = random.getstate()[1][(long(test_seed % 10000)) % 625]
random.shuffle(news_words, lambda: 1 / R0)  # deterministic shuffling using seeds
test_seed /= 10000
filtered_news_words = list(set(news_words) - stop)
[related_words_NEWS.append(i) for i in filtered_news_words if len(i) > 3]
related_words_NEWS = related_words_NEWS[0:2000]
print "finished extracting news data...\n"
# ------------------------------------------------------------
# --------------------------- CHAT ---------------------------
print "starting to extract chat data...\n"
if CHAT_FLAG != 'n':
    chat_words = list(chat.words())
    R1 = random.getstate()[1][(long(test_seed % 10000)) % 625]
    random.shuffle(chat_words, lambda: 1 / R1)  # deterministic shuffling using seeds
    test_seed /= 10000
    filtered_chat_words = list(set(chat_words) - stop)
    [related_words_CHAT.append(i) for i in filtered_chat_words]
    related_words_CHAT = related_words_CHAT[0:2000]
print "finished extracting chat data...\n"
# ------------------------------------------------------------
# if BOOK_FLAG != 'n':
#     book_words = gut.words(gut.fileids()[int(shakespeare_books.get(str(FAV_BOOK)))])
CORPUS_LOADED_EVENT = "<<CL_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 100 _DEFAULT = "English: Brown Corpus (Humor)" _CORPORA = { "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self):
# using regex
wordlist_suffixes = [suffix for w in wordlist
                     for suffix in re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', w)]
print(nltk.FreqDist(wordlist_suffixes).most_common(20))

raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis
for a system of government. Supreme executive power derives from a mandate from the
masses, not from some farcical aquatic ceremony."""
raw_tokens = word_tokenize(raw)
raw_stems = [stem(t) for t in raw_tokens]
print(raw_stems)

# searching tokenized text
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r'<a><man>'))  # print only "a man"
print(moby.findall(r'<a>(<.*>)<man>'))  # prints the words between "a" and "man"
chat_words = nltk.Text(nps_chat.words())
print(chat_words.findall(r'<.*><.*><bro>'))
print(chat_words.findall(r'<l.*>{3,}'))

# discover hypernyms in text, i.e. "a and other ys"
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print(hobbies_learned.findall(r'<\w*><and><other><\w*s>'))
print(hobbies_learned.findall(r'<\w*><as><\w*>'))

# text normalization
# stemmers remove affixes from words; NLTK ships two off-the-shelf stemmers:
# 1. PorterStemmer  2. LancasterStemmer
print(raw_tokens)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(w) for w in raw_tokens])
print([lancaster.stem(w) for w in raw_tokens])
    categorized_sentences = pickle.load(f)
    f.close()
except FileNotFoundError:
    categorized_sentences = []

# load up categorized sentences if found
try:
    f = open('sentence_clusters.pickle', 'rb')
    sentence_clusters = pickle.load(f)
    f.close()
except FileNotFoundError:
    sentence_clusters = []

# preprocessing nps chat corpus for sentence classification
all_words = nltk.FreqDist(w.lower() for w in nps_chat.words())
word_features = [a[0] for a in all_words.most_common()[:2000]]
sentences = [(nltk.word_tokenize(a.text.lower()), a.attrib['class'])
             for a in nps_chat.xml_posts()]

# logical response types for each input sentence type
response_types = {
    'Accept': ['Statement', 'Emotion', 'Emphasis'],
    'Bye': ['Bye'],
    'Clarify': ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emotion': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Continuer': ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emphasis': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Greet': ['Greet'],
    'Other': ['Statement'],
    'Reject': ['Statement', 'Emotion', 'Emphasis'],
    'Statement': ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
_DEFAULT = "English: Brown Corpus (Humor)" _CORPORA = { "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words( categories=["news", "editorial", "reviews"] ), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words( categories="science_fiction" ), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self):
from nltk.corpus import nps_chat
from nltk import FreqDist

first_n = 100
a_fq = FreqDist(nps_chat.words('11-09-adults_706posts.xml'))
t_fq = FreqDist(nps_chat.words('11-09-teens_706posts.xml'))
a_words = set(w for w, _ in a_fq.most_common(first_n))
t_words = set(w for w, _ in t_fq.most_common(first_n))

print("commonly used words:")
print(','.join(a_words.intersection(t_words)))
print()
print("words adults use that teens do not:")
print(','.join(a_words - t_words))
print()
print("words teens use that adults do not:")
print(','.join(t_words - a_words))
from nltk.corpus import gutenberg, nps_chat
import nltk

moby = nltk.Text(gutenberg.words("melville-moby_dick.txt"))
# findall - Text class method - regular expression over tokens
print(moby.findall(r'<a><.*><man>'))
chat_obj = nltk.Text(nps_chat.words())
print(chat_obj.findall(r'<.*><.*><bro>'))
print(chat_obj.findall(r'<a><.*><man>'))

text = "Hello, I am a computer programmer who is currently learning and studying NLP"
our_own_text_obj = nltk.Text(nltk.word_tokenize(text))
print(our_own_text_obj.findall(r'<.*ing>+'))
import re
import nltk
from nltk import word_tokenize

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis
for a system of government. Supreme executive power derives from a mandate from the
masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
print([stem(t) for t in tokens])
print("-" * 40)

print("Searching Tokenized Text")
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a>(<.*>)<man>')
chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*> <.*> <bro>')
chat.findall(r'<l.*>{3,}')
print("-" * 40)

nltk.re_show('kaa', ' '.join(rotokas_words))
nltk.app.nemo()
print("-" * 40)

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')
print("-" * 40)
hobbies_learned.findall(r'<as> <\w*> <as> <\w*>')
print("-" * 40)
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import nps_chat as nps
import os

# twitterSamples = nltk.corpus.twitter_samples
# negTweets = twitter_samples.strings('negative_tweets.json')

teenChat = nps.xml_posts("11-08-teens_706posts.xml")
chatWords = nps.words("11-08-teens_706posts.xml")
chatBigrams = nltk.bigrams(chatWords)
cfd = nltk.ConditionalFreqDist(chatBigrams)
maxConfidence = 100

flagFile = open('flagList.txt')
flagList = flagFile.read()

def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)
    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist(
        (tokens, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in [tokens]
        # you need a check if len(samples) < 1
print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words('melville-moby_dick.txt')) print("text1:", text1.name) text2 = Text(gutenberg.words('austen-sense.txt')) print("text2:", text2.name) text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words('singles.txt'), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words('chesterton-thursday.txt')) print("text9:", text9.name) def texts():
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['living'])
generate_model(cfd, 'living')

# lexical resources - wordlists enriched with associated information such as
# part of speech and sense definitions

# unusual words
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

unusual_words(gutenberg.words('austen-sense.txt'))
unusual_words(nps_chat.words())

# stopwords such as 'if', 'the', 'for', etc.
print(stopwords.words('english'))

# function to compute what % of words are not in the stopwords list
def content_fraction(text):
    stopwords_list = stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords_list]
    return len(content) / len(text) * 100

content_fraction(reuters.words())

# solving a word puzzle
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
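# generate_model() is called at the top of the snippet above but not defined in it;
# the NLTK book (chapter 2), which this code appears to follow, defines it roughly as:
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()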
# These libraries are for cleaning text data.
# re is the Python library for regular expressions.
import os
import re
# nltk is the Python library for Natural Language Processing
# (used here for cleaning non-English text from the data).
from nltk.corpus import brown
from nltk.corpus import words
from nltk.corpus import cess_esp as spanish
from nltk.corpus import reuters
from nltk.corpus import nps_chat

# These dictionaries reduce the time required to search for English words by
# implementing a hash lookup in "isEnglishWord".
englishBrownDict = dict.fromkeys(brown.words(), True)
englishWordsDict = dict.fromkeys(words.words(), True)
englishReutersDict = dict.fromkeys(reuters.words(), True)
englishChatDict = dict.fromkeys(nps_chat.words(), True)
spanishWordsDict = dict.fromkeys(spanish.words(), True)

malayText = open(os.path.join(os.getcwd(), "malayUpdated.txt"))
malayWordsDict = []
for line in malayText:
    malayWordsDict.append(line)
# print "Count of malay words: ", len(malayWords), "\n"
# malayWordsDict = dict.fromkeys(malayWords, True)

commonTweetWords = [
    "ur", "u", "youre", "gonna", "wanna", "wannabe", "shoulda", "should've",
print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words("melville-moby_dick.txt")) print("text1:", text1.name) text2 = Text(gutenberg.words("austen-sense.txt")) print("text2:", text2.name) text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words("chesterton-thursday.txt")) print("text9:", text9.name)
# global list of gold corpora
# C:\Users\admin\AppData\Roaming\nltk_data\corpora\
corp_names = [
    "brown", "nps_chat", "conll2000", "treebank", "twitter",
    "nhtsa_0", "nhtsa_1", "nhtsa_2", "nhtsa_3", "nhtsa_4", "nhtsa_5", "nhtsa_6",
]

corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset),
]

corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words(),
]

corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset),
]

corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents(),
]
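# Hypothetical usage of the lists above (not part of the original module): train a
# unigram tagger on one gold corpus and score it on another. Assumes CONST_tagset
# and the nltk.corpus imports are defined as the snippet implies; evaluate() is
# named accuracy() in newer NLTK releases.
from nltk import UnigramTagger

brown_tagger = UnigramTagger(corp_sents_tagged[0])          # brown tagged sentences
print(brown_tagger.evaluate(corp_sents_tagged[1][:500]))    # nps_chat tagged posts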