Example #1
0
def conditional_freq_distribution():
    cfd = nltk.ConditionalFreqDist((target, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in ['sexy', 'guy']
        if posts.lower().startswith(target))
    cfd.plot()
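The nps alias used above is not defined in this snippet; judging by Example #27 further down, which imports the NPS Chat corpus reader under the same name, the assumed prelude is simply:

# Assumed imports for Example #1 (the same alias appears in Example #27).
import nltk
from nltk.corpus import nps_chat as nps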
Example #2
0
def find_all():
    """Search tokenized text."""
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>")
    chat.findall(r"<l.*>{3,}")
Example #3
0
def searchText():

    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>") 
    chat.findall(r"<l.*>{3,}") 

    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
Example #5
0
def searchTokenText():
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    print(moby.findall(r"<a> (<.*>) <man>"))

    chat = nltk.Text(nps_chat.words())
    print(chat.findall(r"<.*> <.*> <bro>"))

    print(chat.findall(r"<l.*>{3,}"))

    from nltk.corpus import brown
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
Example #6
0
def generate_greeting_classifier_nps():
    global greeting_classifier
    try:
        with open('greet_classifier.pickle', 'rb') as f:
            greeting_classifier = pickle.load(f)
    except FileNotFoundError:
        v = set([w.lower() for w in nps_chat.words()])
        posts = nps_chat.xml_posts()[:5000]
        h = [
            (sentence_features(s.text.lower(), v=v),
             s.get('class') if s.get('class') in ['Greet', 'Bye'] else 'Other')
            for s in posts
        ]
        generate_greeting_classifier(h)
        with open('greet_classifier.pickle', 'wb') as f:
            pickle.dump(greeting_classifier, f)
Example #7
0
def lookupTagger(r, c):  # r = range, c = corpus
    if (c == "brown"):
        fDist = ConditionalFreqDist(brownTW)
        freqDist = FreqDist(brown.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
    if (c == "chat"):
        fDist = ConditionalFreqDist(chatTW)
        freqDist = FreqDist(chat.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
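brownTW, chatTW, and chat are referenced in Example #7 but not defined in the excerpt. A plausible prelude, assuming they are the (word, tag) pair lists of the two corpora that the lookup tagger is built from:

# Assumed prelude for Example #7 (not shown in the excerpt): imports plus the
# (word, tag) pairs that the ConditionalFreqDist and FreqDist are built from.
import nltk
from nltk import ConditionalFreqDist, FreqDist, UnigramTagger
from nltk.corpus import brown
from nltk.corpus import nps_chat as chat

brownTW = brown.tagged_words()  # (word, tag) pairs from the Brown corpus
chatTW = chat.tagged_words()    # (word, tag) pairs from the NPS Chat corpus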
Example #8
0
def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)

    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist((target, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in tokens
        if posts.lower().startswith(target))
    print("printing flagList " + str(tokens))
    print("cfd values: " + str(cfd.keys()))

    # guard against "max() arg is an empty sequence" when nothing matched
    if cfd:
        cfd.tabulate(cumulative=True)
Example #9
0
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import nps_chat
##################################################################
## Quick look
print(type(
    nps_chat))  # <class 'nltk.corpus.reader.nps_chat.NPSChatCorpusReader'>
print(len(nps_chat.fileids()))  # 15
print(
    nps_chat.fileids()
)  # ['10-19-20s_706posts.xml', '10-19-30s_705posts.xml', '10-19-40s_686posts.xml', '10-19-adults_706posts.xml', '10-24-40s_706posts.xml', '10-26-teens_706posts.xml', '11-06-adults_706posts.xml', '11-08-20s_705posts.xml', '11-08-40s_706posts.xml', '11-08-adults_705posts.xml', '11-08-teens_706posts.xml', '11-09-20s_706posts.xml', '11-09-40s_706posts.xml', '11-09-adults_706posts.xml', '11-09-teens_706posts.xml']
print(len(nps_chat.words('10-19-20s_706posts.xml')))  # 2829
print(
    nps_chat.words('10-19-20s_706posts.xml')[:10]
)  # ['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey']
##################################################################
## posts()
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(
    chatroom[123]
)  # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']
Example #10
0
File: 14.py Project: shubh29/nltk
# Write a function novel10(text) that prints any word that appeared in the last 10% of a text that had not been encountered earlier.

from nltk.corpus import nps_chat


def novel10(text):
    # finds the point at which to cut the text
    cut = int(0.9 * len(text))
    #cuts the text
    first_part, second_part = text[:cut], text[cut:]
    # makes a set of each part, leaving only unique words
    unique_words_first_part = set(first_part)
    unique_words_second_part = set(second_part)

    # makes a new list of words that only appear in the last 10%
    return [
        word for word in unique_words_second_part
        if word not in unique_words_first_part
    ]


text = nps_chat.words()

print(novel10(text))
Example #11
0
import re
from random import shuffle
from nltk.corpus import webtext
from nltk.corpus import nps_chat

from gensim.models.doc2vec import LabeledSentence, Doc2Vec

gendered_terms = [
    r'\bhe\b', r'\bhes', r'\bshe\b', r'\bshes\b', r'\bhis\b', r'\bher\b',
    r'\bbro\b', r'\bman\b', r'\bsir\b', r'\bdude\b', r'\bgirl\b', r'\bgirls\b',
    r'\blady\b', r'\bgurl\b', r'\bhims\b', r'\bhers\b', r'\bhisself\b',
    r'\bherself\b', r'\bman\b', r'\bwoman\b'
]

dictionary_words = {}
for x in nps_chat.words() + webtext.words():
    dictionary_words[x] = True

print(len(dictionary_words))


class LabeledLineSentence(object):
    def __init__(self, messages_dic, is_sample=True):
        self.documents = []
        self.messages_dic = messages_dic
        self.is_sample = is_sample

    def __iter__(self):
        for user in self.messages_dic:
            if self.is_sample:
                for i in range(200):
Example #12
0
import os, sys, re
from nltk.corpus import brown
from nltk.corpus import cess_cat
from nltk.corpus import nps_chat
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.tokenize import word_tokenize, wordpunct_tokenize # Tokenizer
from nltk.tokenize import RegexpTokenizer

if __name__ == "__main__":
    urlRegex = '(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
    specRegex = "([#@]+[\w']+)"
    symbolsRegex = '[\^=<>.,!?:;\(\)_\"]+'
    simpleWordRegex = "[\w'-]+"

    tTwit = list(nps_chat.words())

    # estimator for smoothing the N-gram model
    estimator1 = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

    tokens1 = list(brown.words())

    # N-gram language model with 3-grams
    model = NgramModel(3, tokens1, estimator=estimator1)

    twitsFile = sys.argv[1]
    varsFile = sys.argv[2]
    outFile = sys.argv[3]
    outTwitFile = sys.argv[4]
    mode = sys.argv[5]
Example #13
0
import nltk
import time

nltk.download('nps_chat')
from nltk.corpus import nps_chat

for i in nps_chat.words():
    print("Raw word: " + i)
    token = nltk.word_tokenize(i)[0]
    print("Token: " + token)
    print("---")
    time.sleep(.5)
Example #14
0
 'English: Brown Corpus':
     lambda: brown.words(),
 'English: Brown Corpus (Press)':
     lambda: brown.words(categories=['news', 'editorial', 'reviews']),
 'English: Brown Corpus (Religion)':
     lambda: brown.words(categories='religion'),
 'English: Brown Corpus (Learned)':
     lambda: brown.words(categories='learned'),
 'English: Brown Corpus (Science Fiction)':
     lambda: brown.words(categories='science_fiction'),
 'English: Brown Corpus (Romance)':
     lambda: brown.words(categories='romance'),
 'English: Brown Corpus (Humor)':
     lambda: brown.words(categories='humor'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.words(),
 'English: Wall Street Journal Corpus':
     lambda: treebank.words(),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.words(),
 'Dutch: Alpino Corpus':
     lambda: alpino.words(),
 'Hindi: Indian Languages Corpus':
     lambda: indian.words(files='hindi.pos'),
 'Portuguese: Floresta Corpus (Portugal)':
     lambda: floresta.words(),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
     lambda: mac_morpho.words(),
 'Portuguese: Machado Corpus (Brazil)':
     lambda: machado.words(),
 'Spanish: CESS-ESP Corpus':
Example #15
0
cv_word_pairs = [(cv,w) for w in rotokas_words
                        for cv in re.findall(r'[ptksvr][aeiou]',w)]
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['su'])
print(cv_index['po'])

import re
import nltk
def stem(word):
    for suffix in ['ing','ly','ed','ious','ies','ive','es','s','ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing'))

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem,suffix = re.findall(regexp,word)[0]
    return stem


from nltk.corpus import gutenberg,nps_chat
import nltk
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r"<a> (<.*>) <man>"))
chat = nltk.Text(nps_chat.words())
print(chat.findall(r"<.*><.*><bro>"))
print(chat.findall(r"<1.*>{3,}"))

Example #16
0
def classify_greeting(s):
    v = set([w.lower() for w in nps_chat.words()])
    return greeting_classifier.classify(sentence_features(s.lower(), v=v))
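Examples #6 and #16 both call sentence_features(), which is not included in this listing. A minimal bag-of-words sketch of what such a feature extractor might look like (an assumption; the original project's version may differ):

# Hypothetical sentence_features() helper: marks which vocabulary words (v)
# occur in the sentence, following NLTK's usual document-features idiom.
import nltk

def sentence_features(sentence, v):
    tokens = set(nltk.word_tokenize(sentence))
    return {'contains({})'.format(word): (word in tokens) for word in v}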
Example #17
0
import os
import pickle

from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import words
from normalise.data.contraction_list import contractions
from normalise.data.tech_words import tech_words

mod_path = os.path.dirname(__file__)

with open('{}/data/wordlist.pickle'.format(mod_path), mode='rb') as file:
    wordlist = pickle.load(file)

with open('{}/data/fake_data.pickle'.format(mod_path), mode='rb') as file:
    fake_data = pickle.load(file)

if __name__ == '__main__':
    word_tokenized = brown.words() + nps_chat.words() + fake_data
    brown_lower = {w.lower() for w in brown.words()
                   if len(w) > 4 and w.isalpha()}
    names_lower = {w.lower() for w in names.words()}
    words_lower = {w.lower() for w in words.words('en') if len(w) > 1}
    wordlist = brown_lower | names_lower | words_lower | set(tech_words) | {'I', 'i', 'a', 'A'}
    word_tokenized_lowered = [w.lower() if w.lower() in wordlist
                              else w for w in word_tokenized]
    word_tokenized = list(word_tokenized)


# Conditions for identification of NSWs.
def cond1(w):
    """ Return word if its lower-cased form is not in the wordlist."""
    return w.lower() not in wordlist or w == 'US'
Example #18
0
    R0 = random.getstate()[1][(long(test_seed % 10000)) % 625]
    random.shuffle(news_words, lambda: 1/R0)        # deterministic shuffling using seeds
    test_seed /= 10000
    filtered_news_words = list(set(news_words) - stop)
    [related_words_NEWS.append(i) for i in filtered_news_words if len(i) > 3]
    related_words_NEWS = related_words_NEWS[0:2000]
print "finished extracting news data...\n"

#------------------------------------------------------------


#--------------------------- CHAT ---------------------------
print "starting to extract chat data...\n"

if CHAT_FLAG != 'n':
    chat_words = list(chat.words())

    R1 = random.getstate()[1][(long(test_seed % 10000)) % 625]
    random.shuffle(chat_words, lambda: 1/R1)       # deterministic shuffling using seeds
    test_seed /= 10000

    filtered_chat_words = list(set(chat_words) - stop)
    [related_words_CHAT.append(i) for i in filtered_chat_words]
    related_words_CHAT = related_words_CHAT[0:2000]
print "finished extracting chat data...\n"
    

#------------------------------------------------------------

#if BOOK_FLAG != 'n':
#    book_words = gut.words(gut.fileids()[int(shakespeare_books.get(str(FAV_BOOK)))])
Example #19
0
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
Example #20
0
#using regex
wordlist_suffixes = [
    suffix for w in wordlist
    for suffix in re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', w)
]
print(nltk.FreqDist(wordlist_suffixes).most_common(20))
raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government.  Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
raw_tokens = word_tokenize(raw)
raw_stems = [stem(t) for t in raw_tokens]
print(raw_stems)

#searching tokenized text
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
print(moby.findall(r'<a><man>'))  #print only a man
print(moby.findall(r'<a>(<.*>)<man>'))  #prints words between a and man
chat_words = nltk.Text(nps_chat.words())
print(chat_words.findall(r'<.*><.*><bro>'))
print(chat_words.findall(r'<l.*>{3,}'))
#discover hypernyms in text i.e a and other ys
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print(hobbies_learned.findall(r'<\w*><and><other><\w*s>'))
print(hobbies_learned.findall(r'<\w*><as><\w*>'))

#text normalization
#stemmers - to remove affixes from words, 2 off-the-shelf in nltk 1.PorterStemmer 2.LancasterStemmer
print(raw_tokens)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
print([porter.stem(w) for w in raw_tokens])
print([lancaster.stem(w) for w in raw_tokens])
Example #21
0
    categorized_sentences = pickle.load(f)
    f.close()
except FileNotFoundError:
    categorized_sentences = []

# load up categorized sentences if found
try:
    f = open('sentence_clusters.pickle', 'rb')
    sentence_clusters= pickle.load(f)
    f.close()
except FileNotFoundError:
    sentence_clusters = []


# preprocessing nps chat corpus for sentence classification
all_words = nltk.FreqDist(w.lower() for w in nps_chat.words())
word_features = [a[0] for a in all_words.most_common()[:2000]]
sentences = [(nltk.word_tokenize(a.text.lower()), a.attrib['class']) for a in nps_chat.xml_posts()]

# logical response types for each input sentence type
response_types = {
    'Accept':       ['Statement', 'Emotion', 'Emphasis'],
    'Bye':          ['Bye'],
    'Clarify':      ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emotion':      ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Continuer':    ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emphasis':     ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Greet':        ['Greet'],
    'Other':        ['Statement'],
    'Reject':       ['Statement', 'Emotion', 'Emphasis'],
    'Statement':    ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
Example #22
0
# Write a function novel10(text) that prints any word that appeared in the last 10% of a text that had not been encountered earlier.

from nltk.corpus import nps_chat



def novel10(text):
	# finds the point at which to cut the text
	cut = int(0.9 * len(text))
	#cuts the text
	first_part, second_part = text[:cut], text[cut:]
	# makes a set of each part, leaving only unique words
	unique_words_first_part = set(first_part)
	unique_words_second_part = set(second_part)

	# makes a new list of words that only appear in the last 10%
	return [word for word in unique_words_second_part if word not in unique_words_first_part]

text = nps_chat.words()

print(novel10(text))
Example #23
0
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
Example #24
0
from nltk.corpus import nps_chat
from nltk import FreqDist

first_n = 100

a_fq = FreqDist(nps_chat.words('11-09-adults_706posts.xml'))
t_fq = FreqDist(nps_chat.words('11-09-teens_706posts.xml'))

# take the first_n most frequent words from each distribution
a_words = set(w for w, _ in a_fq.most_common(first_n))
t_words = set(w for w, _ in t_fq.most_common(first_n))

print("commonly used words:")
print(','.join(a_words.intersection(t_words)))
print()
print("words adults use but teens do not:")
print(','.join(a_words - t_words))
print()
print("words teens use but adults do not:")
print(','.join(t_words - a_words))
Example #25
0
from nltk.corpus import gutenberg, nps_chat
import nltk

moby = nltk.Text(gutenberg.words("melville-moby_dick.txt"))

#findall - text class - regular expression

print(moby.findall(r'<a><.*><man>'))
chat_obj = nltk.Text(nps_chat.words())
print(chat_obj.findall(r'<.*><.*><bro>'))
print(chat_obj.findall(r'<a><.*><man>'))

text = "Hello, I am a computer programmer who is currently learning and studying NLP"

our_own_text_obj = nltk.Text(nltk.word_tokenize(text))
print(our_own_text_obj.findall(r'<.*ing>+'))
Example #26
0
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """DENNIS: Listen, strange woman lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
print([stem(t) for t in tokens])
print("-" * 40)

print("Searching Tokenized Text")
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a>(<.*>)<man>')
chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*> <.*> <bro>')
chat.findall(r'<l.*>{3,}')
print("-" * 40)

nltk.re_show('kaa', ' '.join(rotokas_words))
nltk.app.nemo()
print("-" * 40)

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')
print("-" * 40)

hobbies_learned.findall(r'<as> <\w*> <as> <\w*>')
print("-" * 40)
Example #27
0
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import nps_chat as nps
import os


# twitterSamples = nltk.corpus.twitter_samples
# negTweets = twitter_samples.strings('negative_tweets.json')

teenChat = nps.xml_posts("11-08-teens_706posts.xml")
chatWords = nps.words("11-08-teens_706posts.xml")
chatBigrams = nltk.bigrams(chatWords)
cfd = nltk.ConditionalFreqDist(chatBigrams)
maxConfidence = 100
flagFile = open('flagList.txt')
flagList = flagFile.read()




def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)

    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist((tokens, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in [tokens]
        #you need a check if len(samples) < 1
Example #28
0
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)

def texts():
Example #29
0
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd['living'])
genrate_model(cfd, 'living')


#lexical resources - wordlist with info such as lexical resources, sense definition etc.
#unusual words
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)


unusual_words(gutenberg.words('austen-sense.txt'))
unusual_words(nps_chat.words())
#stop words such as if the for etc.
print(stopwords.words('english'))


#function to compute what % of words are not is stopwords list
def content_fraction(text):
    stopwords_list = stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords_list]
    return len(content) / len(text) * 100


content_fraction(reuters.words())
#solving word puzzle
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
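genrate_model() called near the top of Example #29 is not defined in the excerpt; it corresponds to the generate_model() helper from the NLTK book (chapter 2), which repeatedly emits the most likely successor of the current word in the bigram ConditionalFreqDist:

# generate_model() as defined in the NLTK book; Example #29 calls it
# (misspelled as "genrate_model") with the bigram cfd and the seed word 'living'.
def generate_model(cfdist, word, num=15):
    for _ in range(num):
        print(word, end=' ')
        word = cfdist[word].max()  # follow the most frequent next word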
Example #30
0
 'English: Brown Corpus':
 lambda: brown.words(),
 'English: Brown Corpus (Press)':
 lambda: brown.words(categories=['news', 'editorial', 'reviews']),
 'English: Brown Corpus (Religion)':
 lambda: brown.words(categories='religion'),
 'English: Brown Corpus (Learned)':
 lambda: brown.words(categories='learned'),
 'English: Brown Corpus (Science Fiction)':
 lambda: brown.words(categories='science_fiction'),
 'English: Brown Corpus (Romance)':
 lambda: brown.words(categories='romance'),
 'English: Brown Corpus (Humor)':
 lambda: brown.words(categories='humor'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.words(),
 'English: Wall Street Journal Corpus':
 lambda: treebank.words(),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.words(),
 'Dutch: Alpino Corpus':
 lambda: alpino.words(),
 'Hindi: Indian Languages Corpus':
 lambda: indian.words(files='hindi.pos'),
 'Portuguese: Floresta Corpus (Portugal)':
 lambda: floresta.words(),
 'Portuguese: MAC-MORPHO Corpus (Brazil)':
 lambda: mac_morpho.words(),
 'Portuguese: Machado Corpus (Brazil)':
 lambda: machado.words(),
 'Spanish: CESS-ESP Corpus':
Example #31
0
#These libraries are for cleaning text data
#re is the Python library for Regular Expressions
import re
import os

#nltk is the python library for Natural Language Processing (used here for cleaning non-English text from the data)
from nltk.corpus import brown
from nltk.corpus import words
from nltk.corpus import cess_esp as spanish
from nltk.corpus import reuters
from nltk.corpus import nps_chat

#These dictionaries are used to reduce time required to search for English words by implementing a hash search in "isEnglishWord"
englishBrownDict = dict.fromkeys(brown.words(), True)
englishWordsDict = dict.fromkeys(words.words(), True)
englishReutersDict = dict.fromkeys(reuters.words(), True)
englishChatDict = dict.fromkeys(nps_chat.words(), True)

spanishWordsDict = dict.fromkeys(spanish.words(), True)

malayText = open(os.path.join(os.getcwd(), "malayUpdated.txt"))
malayWordsDict = []

for line in malayText:

    malayWordsDict.append(line)

#print "Count of malay words: ", len (malayWords), "\n"
#malayWordsDict = dict.fromkeys (malayWords, True)

commonTweetWords = [
    "ur", "u", "youre", "gonna", "wanna", "wannabe", "shoulda", "should've",
Example #32
0
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)

# global list of gold corpora
# C:\Users\admin\AppData\Roaming\nltk_data\corpora\
corp_names = [
    "brown", "nps_chat", "conll2000", "treebank", "twitter", "nhtsa_0",
    "nhtsa_1", "nhtsa_2", "nhtsa_3", "nhtsa_4", "nhtsa_5", "nhtsa_6"
]
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]