Example #1
# `Model` and `tokenize` are assumed to be defined elsewhere in this project.
from nltk.corpus import inaugural


def run():

    # Dataset generation
    cls = ['Obama', 'Trump']
    obama_sentences = inaugural.sents('2009-Obama.txt')
    trump_sentences = inaugural.sents('2017-Trump.txt')
    labelled_obama = [(s, cls[0]) for s in obama_sentences]
    labelled_trump = [(s, cls[1]) for s in trump_sentences]
    labelled_data = labelled_obama + labelled_trump

    trump_test = [
        'We', ',', 'the', 'citizens', 'of', 'America', ',', 'are', 'now',
        'joined', 'in', 'a', 'great', 'national', 'effort', 'to', 'rebuild',
        'our', 'country', 'and', 'restore', 'its', 'promise', 'for', 'all',
        'of', 'our', 'people', '.'
    ]
    obama_test = [
        'I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task',
        'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you', 'have',
        'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices', 'borne', 'by',
        'our', 'ancestors', '.'
    ]

    model = Model(labelled_data, cls)
    model.train()

    while True:
        inp = input("Input a string to test: ")
        doc = tokenize(inp)
        print(model.test_doc(doc))
Example #2
import matplotlib.pyplot as plt
from nltk.corpus import inaugural


def sent_length():

    text_file = str(input("Enter the name of a text file : \n"))
    txt_fl = inaugural.sents(text_file)
    print(len(txt_fl))
    file_name = inaugural.fileids()
    print(len(inaugural.sents(file_name)))
def avgSent():
    x2 = []
    y2 = []

    for fileid in inaugural.fileids():
        sents = inaugural.sents(fileids=[fileid])
        average = sum(len(sent) for sent in sents) / len(sents)
        print(fileid[:4], "-", average)
        y2.append(fileid[:4])
        x2.append(average)

    plt.title('Average sentence length:')
    plt.xticks(rotation=90)
    plt.plot(y2, x2)
    plt.show()
Example #4
import pickle
import re
from typing import List

import nltk
from nltk.corpus import inaugural

# `calculate_frequency_map` and `generate_plot` (used in main) are assumed to be
# defined elsewhere in this project.


def get_sentences():
    '''Collect every sentence in the inaugural corpus and return them as one list.
    Call this function to build the sentence list.'''
    articles = inaugural.fileids()
    sentences = []
    for i in articles:
        article = inaugural.sents(i)
        sentences = sentences + list(article)
    return sentences
def get_inaugural_docs(download=False) -> List[List[List[str]]]:
    """
    Get the inaugural documents as a list (documents) of list (sentences) of list (sentence) of strings (words)
    :param download: If True, the corpus will be downloaded. Default=False
    :return:
    """
    if download:
        nltk.download('inaugural')
    return [[[w.lower() for w in sent] for sent in inaugural.sents(fileid)]
            for fileid in inaugural.fileids()]
def main():
    # @BEGIN normalize_list
    # @IN inaugural @URI file:data/inaugural/{year}-{president}.txt
    # @OUT normalized_addresses 
    file_ids = inaugural.fileids()
    print(file_ids)
    normalized_addresses = []
    for address in file_ids:
        normalized_words = [address.split("-")[0]]
        for sent in inaugural.sents(address):
            prev_word = ""
            for word in sent:
                if(prev_word == "'"):
                    continue
                
                normalized = re.sub("[^a-z0-9]", "", word.lower())
                if(normalized != ""):
                    normalized_words.append(normalized)
                prev_word = word
        normalized_addresses.append(normalized_words)
    # @END normalized_list

    # @BEGIN pickleize
    # @IN normalized_addresses
    # @OUT pkl @URI file:data/norm_addresses.pkl
    fout = open("norm_addresses.pkl", "wb")
    pickle.dump(normalized_addresses, fout)
    fout.close()
    # @END pickleize

    # deserialize pkl file
    # @BEGIN depickleize
    # @IN pkl @URI file:data/norm_addresses.pkl
    # @OUT address_word_list
    fin = open("norm_addresses.pkl", "rb")
    address_word_list = pickle.load(fin)
    fin.close()
    # @END depickleize

    # @BEGIN frequency
    # @IN address_word_list
    # @IN search_word
    # @OUT frequency_maps
    search_word = input("Input word to find frequency: ")


    frequency_maps = {}
    for word_list in address_word_list:
        
        frequency_maps[word_list[0]] = calculate_frequency_map(word_list[1:])
    # @END frequency
    

    generate_plot(search_word, frequency_maps)
Example #7
from nltk.corpus import inaugural


def read_address(address):
    '''
    Reads the given NLTK inaugural address into a single string.
    '''
    full_address = ""

    # join all the words in each sentence
    for sent in inaugural.sents(address):
        sent = ' '.join(sent)
        full_address = full_address + sent + '\n'

    return full_address
Example #8
def write_files():
    import nltk
    import re
    import pickle
    files = nltk.corpus.inaugural.fileids()
    from nltk.corpus import inaugural

    masterList = list()
    for i in files:
        sentences = inaugural.sents(i)
        sentLst = [' '.join(sent) + '\n' for sent in sentences]
        theString = str(sentLst)
        fixedString = tokenize(theString)
        masterList.append(fixedString)

    fout = open('proj3.pkl', 'wb')
    pickle.dump(masterList, fout)
    fout.close()
Example #9
import nltk
from nltk.corpus import brown, gutenberg, inaugural, reuters, webtext


# `detok_sentences` is assumed to be defined elsewhere in this project; it joins
# each tokenized sentence back into a single string.
def get_default_sentences() -> list:
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return brown_sentences + gutenberg_sentences + reuters_sentences + webtext_sentences + inaugural_sentences
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  #purple color
        plot(index, b, 'go')  #green color
        plot(index, c, 'ro')  #red color

        xlabel(
            'index, from Washington to Obama (purple - character/word), (red - word/vocab)'
        )
        ylabel('Average numbers (green - word/sentence)')
    show()
Example #11
def main():
    synset_group = []
    word_group = []
    president_Synset_usage = []
    f = open("keyword_group.txt", 'r')
    while True:
        synset = f.readline()
        if not synset: break
        synset = synset.strip()
        if synset == 'group':
            synset_group.append(set())
        else:
            synset_group[-1].add(synset)
    f.close()

    group_size = len(synset_group)
    for g in synset_group:
        word_group.append(syn_to_lem(g))

    for fileid in inaugural.fileids():
        corpus = clean_corpus(inaugural.sents(fileid))
        total_len = sum(map(len, corpus))

        Group_usage = np.zeros((1, group_size))
        for sent in corpus:
            for i in range(group_size):
                keywords = word_group[i]
                check = set(filter(lambda x: x in sent, keywords))
                Group_usage[0][i] += len(check)

        Group_usage /= total_len

        president_Synset_usage.append((fileid[:-4], Group_usage))

    #for f in president_Synset_usage:   print(f)

    learnable = []
    for f in president_Synset_usage[-15:]:
        learnable.append(f[1])

    data = concat_all(learnable)
    label = np.array([[
        4.65, 5.05, 2.86, 2.58, 3.24, 3.14, 3.82, 2.25, 3.31, 4.45, 2.35, 2.03,
        1.46, 2.19, 2.48
    ]])
    label = label.T
    #best_lambda(3,data,label)
    train_acc = []
    l_list = []

    idx = np.arange(group_size)
    np.random.seed(6)
    np.random.shuffle(idx)
    for i in range(group_size):
        dt = data.T
        #dt = dt[idx[:i+1]]
        dt = dt[:i + 1]
        #train, test = k_fold(3,dt.T,label)
        train, test, l = best_lambda(3, dt.T, label)
        train_acc.append(train)
        l_list.append(l)

    print('\nl: ', l_list)
    print('\ntrain: ', train_acc)

    x_range = np.arange(group_size)
    plt.plot(x_range, train_acc, c='k')
    plt.xlabel('number of seed words')
    plt.ylabel('R2 loss')
    plt.show()
Example #12
# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural
print(inaugural.fileids())

#Run your file. You should see all the text files containing all the speeches of the US presidents that
#NLTK has saved inside it.
#Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words(
    '2009-Obama.txt')  #Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents(
    '2005-Bush.txt')  #Returns a list of all the sentences in Bush's speech

#As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.

#Try adding code to your program to find and print the first 25 words of Obama's 2009 speech (one possible answer is sketched below).
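
#A minimal sketch of that exercise (not part of the original tutorial):
print(inaugural.words('2009-Obama.txt')[:25])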

# ===  Part 2: Analysing tokens (words) of a text ===

#The term 'token' means a word or a punctuation mark.
#After you've done that, add the following lines to your program

from nltk.book import *

#This may take a while to load. NLTK has many texts stored in it!
#Once it's loaded, type:
Example #13
"""
Train a Word2Vec model, and print the most similar words to "war"
"""
import warnings
warnings.filterwarnings("ignore")
from typing import List, Dict, Tuple
import hashlib
import nltk
from nltk.corpus import inaugural
from gensim.models import Word2Vec

# Pass this hashfxn to gensim's Word2Vec.
def hashfxn(s) -> int:
    return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % 10 ** 8

sentences = inaugural.sents()
# Note: this is the pre-4.0 gensim API; in gensim >= 4.0 these parameters are named
# vector_size and epochs, and most_similar() is reached via model.wv.
model = Word2Vec(sentences=sentences, size=100, workers=1, hashfxn=hashfxn, iter=10)

print("5 most similar words to war:")
print(model.most_similar("war", topn=5))
# Print some more
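# A couple of extra queries in the same pre-4.0 gensim API (a minimal sketch, not
# part of the original script; "peace" is just an illustrative query word):
print("5 most similar words to peace:")
print(model.most_similar("peace", topn=5))
print("similarity(war, peace):", model.similarity("war", "peace"))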
Example #14
#!/usr/bin/env python
# coding: utf-8

# In[4]:

from nltk.corpus import inaugural

# In[5]:

inaugural.words('1789-Washington.txt')

# In[6]:

inaugural.raw('1789-Washington.txt')

# In[7]:

inaugural.sents('1789-Washington.txt')

# In[8]:

inaugural.paras('1789-Washington.txt')

# In[1]:

from nltk.corpus import wordnet as wn
wn.synsets('dog')

# In[ ]:
Example #15
from nltk import FreqDist
from nltk.corpus import inaugural
from nltk.util import bigrams, ngrams

# print(list(ngrams(random_text, 5)))
# print(list(ngrams(random_words, 5)))
# Google n-gram viewer
# random_sentence = inaugural.sents('2009-Obama.txt')[1]
# for trg in ngrams(random_sentence, 3):
#     print(trg)
# print(list(bigrams(random_sentence, pad_right=True)))
# for trg in (ngrams(random_sentence, 4,
#                    pad_right=True, right_pad_symbol='</s>',
#                    pad_left=True, left_pad_symbol='<s>')):
#     print(trg)
target_speeches = ['1789-Washington.txt', '1861-Lincoln.txt', '2001-Bush.txt']
ngr = {}
# tokens
for text in target_speeches:
    data = inaugural.words(text)
    for trg in (ngrams(data, 2)):
        if trg not in ngr:
            ngr[trg] = 1
        else:
            ngr[trg] += 1
ngr = sorted(ngr.items(), key=lambda kv: kv[1], reverse=True)
# sentences: note that here ngrams() pairs up consecutive *sentences*, not words
bigr_of_text = []
for text in target_speeches:
    data = inaugural.sents(text)
    bigr_of_text += (ngrams(data, 2))

print(bigr_of_text)
# a = FreqDist(bigr_of_text)
# print(a.most_common(5))
Example #16
import pprint

from nltk.corpus import inaugural, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# initialize pprint
pp = pprint.PrettyPrinter(indent=4)

# list of all speeches
ids = inaugural.fileids()

data = '1789-Washington.txt'

# get speech of particular file
speech = inaugural.raw(data)
speech = speech.lower()

# get sentences
sentences = inaugural.sents(data)

# sentence tokenize
sent_tokens = sent_tokenize(speech)

# print sentence
#pp.pprint(sent_tokens)

# word tokenize
word_tokens = word_tokenize(speech)

# print words
#pp.pprint(word_tokens)

stop_words = set(stopwords.words('english'))
#pp.pprint(stop_words)
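
# A possible next step (a minimal sketch, not part of the original snippet):
# drop stop words and punctuation from the tokenized speech.
filtered_tokens = [w for w in word_tokens if w.isalpha() and w not in stop_words]
#pp.pprint(filtered_tokens)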
Example #17
book_sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        book_sentences.append(sentence_to_wordlist(raw_sentence))

#print(raw_sentences[5])
#print(book_sentences[5])

conll2000_corp_sents = conll2000.sents()
print("condll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("condll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
Example #18
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated

from nltk.lm import NgramCounter
from nltk.lm import Vocabulary
from nltk.util import ngrams
from nltk.corpus import inaugural

# Exercise 1

president_unigrams = {}

for president in inaugural.fileids():
    text_unigrams = [ngrams(sent, 1) for sent in inaugural.sents(president)]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

inverse_unigrams = [(value, key) for key, value in president_unigrams.items()]
print(max(inverse_unigrams)[1],
      max(inverse_unigrams)[0])  # longest speech: Harrison, 1841
print(min(inverse_unigrams)[1],
      min(inverse_unigrams)[0])  # shortest speech: Washington, 1793

president_vocabulary = {}

for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)
Example #19
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import inaugural
from nltk import FreqDist

speech_wash = list(inaugural.sents('1789-Washington.txt'))
speech_adams = list(inaugural.sents('1797-Adams.txt'))
speech_lincoln = list(inaugural.sents('1861-Lincoln.txt'))

wash = []
for i in speech_wash:
    wash_b = list(
        bigrams(i,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    wash.extend(wash_b)

adams = []
for a in speech_adams:
    adams_b = list(
        bigrams(a,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    adams.extend(adams_b)

lincoln = []
for l in speech_lincoln:
    lincoln_b = list(
        bigrams(l,
                pad_right=True,
                right_pad_symbol='</s>',
                pad_left=True,
                left_pad_symbol='<s>'))
    lincoln.extend(lincoln_b)
Example #20
print("\nPart 3:")
fd = FreqDist(text1)
print("Amount of times 'the' appears in text 1: \n" + str(fd['the']))
print(fd.keys())
print(fd.items())
print("End Part 3\n")

# === Part 4: Your task ===#
print("\nPart 4:")
# 1.) Returns the 10 most frequent words in Obama's 2009 inaugural speech, including their frequencies
fd = FreqDist(inaugural.words('2009-Obama.txt'))
print(fd.most_common(10))
# 2.) Calculates the Lexical Richness of his speech
obama_words = inaugural.words('2009-Obama.txt')
print(len(obama_words) / len(set(obama_words)))
# 3.) Calculate the average length of the sentences in his speech
print(len(inaugural.words('2009-Obama.txt')) / len(inaugural.sents('2009-Obama.txt')))
print("\nOptional Part\n")


# 4.) Write a separate function called sent_length that takes in a string with the name of a text like '2009-Obama.txt', then finds the average
# length of the sentences in that speech. Compute the average sentence length for every speech, from the first one (1789) to Obama's 2009
# speech, to see how it has changed over the course of US history. Remember that inaugural.fileids() gives you a list of all the speech
# names, so you don't need to work out each one by hand. (A sketch of this appears after the stub below.)
def sent_length():
    text_file = str(input("Enter the name of a text file : \n"))
    txt_fl = inaugural.sents(text_file)
    print(len(txt_fl))
    file_name = inaugural.fileids()
    print(len(inaugural.sents(file_name)))
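
# A minimal sketch of the exercise above (not part of the original code): report the
# average sentence length of every inaugural address, in chronological order.
def avg_sent_length():
    for fileid in inaugural.fileids():
        words_per_sent = len(inaugural.words(fileid)) / len(inaugural.sents(fileid))
        print(fileid[:4], round(words_per_sent, 1))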

Example #21
import nltk

#Each corpus is accessed by means of a "corpus reader" object from nltk.corpus
print(str(nltk.corpus.brown).replace('\\\\', '/'))
# The Penn Treebank Corpus:
print(str(nltk.corpus.treebank).replace('\\\\', '/'))
# The Name Genders Corpus:
print(str(nltk.corpus.names).replace('\\\\', '/'))
# The Inaugural Address Corpus:
print(str(nltk.corpus.inaugural).replace('\\\\', '/'))
print(str(nltk.corpus.treebank.fileids()))  # doctest: +ELLIPSIS
#print(str(nltk.corpus.inaugural.fileids()))  # doctest: +ELLIPSIS
# Each corpus reader provides a variety of methods to read data from the corpus, depending on the format of the corpus.

from nltk.corpus import inaugural
print(inaugural.raw('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.words('1789-Washington.txt'))
print(inaugural.sents('1789-Washington.txt'))  # doctest: +ELLIPSIS
print(inaugural.paras(
    '1789-Washington.txt'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

#

l1 = len(inaugural.words('1789-Washington.txt'))
l2 = len(inaugural.words('1793-Washington.txt'))
l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt']))
print('%s+%s == %s' % (l1, l2, l3))

print(len(inaugural.words()))

print(inaugural.readme())
Example #22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# import nltk
# nltk.download('inaugural')
import os


from nltk.corpus import inaugural


corpus_from_paragraphs = inaugural.paras(os.path.dirname(__file__) + '/dataset/paragraphs.txt')
corpus_from_sentences = inaugural.sents(os.path.dirname(__file__) + '/dataset/sentences.txt')
corpus_from_words = inaugural.words(os.path.dirname(__file__) + '/dataset/words.txt')

l1 = len(corpus_from_paragraphs)
l2 = len(corpus_from_sentences)
l3 = len(corpus_from_words)
# l2 = 0
# l3 = 0
print('paragraphs: %s, sentences: %s, words: %s' % (l1, l2, l3))

# print(inaugural.readme())
Example #23
        compared_word = list(vec_dict.keys())[arg] 
        if compared_word != word:
            print(compared_word, ' ' * (14-len(compared_word)) , pairwise_sims[0][arg])
    print()

def random_similarwords(examples=10):
    for _ in range(examples):
        word = choice(list(vec_dict.keys()))
        mostsimilar(word)

if __name__ == "__main__":
    # Get vocabulary & ngrams
    window_size = 5
    stopwords = stopwords.words('english')
    lmtzer = WordNetLemmatizer()
    filtered_sents = [tuple(lmtzer.lemmatize(word.lower()) \
                    for word in sent if word.isalnum() and not word in stopwords) \
                    for sent in inaugural.sents()]
    n_grams = get_ngrams(filtered_sents)
    vocab = set(word for sent in filtered_sents for word in sent)
    vec_dict = get_word_vecs()

    mostsimilar('demoralizes')
    print(vec_dict['demoralizes'])
    for sent in filtered_sents:
        if 'demoralizes' in sent:
            print(sent)
    # random_similarwords(examples=10)


Example #24
    word_count_total = len(inaugural.words(speech))
    print(speech , word_count_total)
    
#Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]

print(speech_length)

#Get the max and min speech
print("Max is : ",max(speech_length))
print("Min is : ",min(speech_length))

#Avg no of words per sentence for each speech
for speech in inaugural.fileids():
    word_total = len(inaugural.words(speech))
    Sents_total = len(inaugural.sents(speech))
    print((word_total/Sents_total),speech)
    

#Creating a Data Frame of the Speech
data = pd.DataFrame([int(speech[:4]), len(inaugural.words(speech))/len(inaugural.sents(speech))] for speech in inaugural.fileids())
print(data.head())

#Adding Column Names
data.columns = ["Year","Avg WPS"]

print(data)

#Use Matplotlib
data.plot("Year", figsize=(15,5))
Example #25
# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural

print(inaugural.fileids())

# Run your file. You should see all the text files containing all the speeches of the US presidents that
# NLTK has saved inside it.
# Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech
print "=============Words in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")  # Returns a list of all the sentences in Bush's speech

# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.

# Try adding code to your program to find and print the first 25 words of Obama's 2009 speech.

# ===  Part 2: Analysing tokens (words) of a text ===

# The term 'token' means a word or a punctuation mark.
# After you've done that, add the following lines to your program

from nltk.book import *

# This may take a while to load. NLTK has many texts stored in it!
# Once it's loaded, type:
Example #26
    total_word = len(inaugural.words(speech))
    print(str(total_word) + " Title: " + speech)

# If you want that output as a list, I generally use a list comprehension

speech_len = [(len(inaugural.words(speech)), speech)
              for speech in inaugural.fileids()]
print(speech_len)
print(max(speech_len))
print(min(speech_len))

# Find out the average no of words per sentence

for speech in inaugural.fileids():
    words_total = len(inaugural.words(speech))
    sents_total = len(inaugural.sents(speech))
    avg_word_per_sents = words_total / sents_total
    print(avg_word_per_sents, speech)

# The best way to show this information is to plot it.
# Build a data frame using pandas:

data = pd.DataFrame([
    int(speech[:4]),
    len(inaugural.words(speech)) / len(inaugural.sents(speech))
] for speech in inaugural.fileids())
data.columns = ["Year", "Average WPS"]
print(data.head(10))

plt.interactive(False)
data.plot("Year", figsize=(15, 5))

import codecs

import nltk
from nltk.corpus import brown, gutenberg, inaugural, reuters, webtext


# Helper used below: rebuild plain-text sentences from each tokenized corpus.
def detok_sentences(tokenized_sentences):
    sentences = []
    for tok_sent in tokenized_sentences:
        sentences.append(' '.join(tok_sent).strip())
    return sentences


print("Loading sentences.")
nltk.download('brown')
brown_tokenized_sentences = brown.sents()
brown_sentences = detok_sentences(brown_tokenized_sentences)
nltk.download('gutenberg')
nltk.download('punkt')
gutenberg_tokenized_sentences = gutenberg.sents()
gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
nltk.download('reuters')
reuters_tokenized_sentences = reuters.sents()
reuters_sentences = detok_sentences(reuters_tokenized_sentences)
nltk.download('webtext')
webtext_tokenized_sentences = webtext.sents()
webtext_sentences = detok_sentences(webtext_tokenized_sentences)
nltk.download('inaugural')
inaugural_tokenized_sentences = inaugural.sents()
inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
all_sentences = brown_sentences + gutenberg_sentences + reuters_sentences + webtext_sentences + inaugural_sentences

outfile = codecs.open('output.txt', 'w')
for sentence in all_sentences:
    cleaned_sentence = sentence.replace(" ' s ", "'s ")
    cleaned_sentence = cleaned_sentence.replace("n ' t ", "n't ")
    cleaned_sentence = cleaned_sentence.replace(" ,", ",")
    cleaned_sentence = cleaned_sentence.replace(" .", ".")
    outfile.write('{}\n'.format(cleaned_sentence))