Example No. 1
from nltk.book import text1, text3

text1.concordance("monstrous")  # Show a concordance view of a word with its context

text1.similar("monstrous")  # Show words that appear in similar contexts

text1.common_contexts(["monstrous", "very"])  # Examine the contexts shared by two or more words

print(len(text1))  # Count the number of tokens (words and punctuation)

print(len(set(text1)))  # Print the vocabulary size of the text

print(text3.count("smote"))  # Print the number of occurrences of a word
Example No. 2
import nltk

from nltk import FreqDist
from nltk.book import text1

print("\nSearch for all occurances of the parameter in the text with context:")
text1.concordance("want")

print(
    "\nSearch for other words that appear in similar contexts to the parameter"
)
text1.similar("monstrous")

text1.common_contexts(["test", "try"])

# Blocking call: the script pauses until the plot window is closed
text1.dispersion_plot(["mean", "know"])

len(text1)
set(text1)
sorted(set(text1))
len(set(text1))

print("\nFind all words in a text meetine a predicate (word length)")
V = set(text1)
long_words = [w for w in V if len(w) > 15]
print(sorted(long_words))

print("\nFind words to categorise a text")
fdist1 = FreqDist(text1)
print(sorted(w for w in set(text1) if len(w) > 9 and fdist1[w] > 8))
Example No. 3
# Tokenize a simple text
import nltk

sentence = "I want to book a flight from New York to Amsterdam"
tokens = nltk.word_tokenize(sentence)
print(tokens)
tagged_tokens = nltk.pos_tag(tokens)

for tokenname, tokentype in tagged_tokens:
    #print(token[0],':',token[1])
    print(tokenname, ':', tokentype)

print()
# Get all verbs of a specific text
textToEvaluate = (
    "After a public divorce, Spider-Man's parents have patched up their differences. "
    'The studios behind "Spider-Man" have reconciled, with Disney and Sony agreeing '
    "to collaborate on a third movie featuring the teenage hero, after a very public "
    "split a little over a month ago that caused an uproar among fans. On Friday, the "
    "parties announced that Marvel would again have a hand in producing the next sequel, "
    "and that Spider-Man would appear in another upcoming Marvel feature. In a statement, "
    'Marvel Studios chief Kevin Feige said he is "thrilled that Spidey\'s journey in the '
    'MCU will continue."'
)
tokens = nltk.word_tokenize(textToEvaluate)
tagged_tokens = nltk.pos_tag(tokens)
print("All verbs of the text:")
for tokenname, tokentype in [
        tok for tok in tagged_tokens if tok[1].startswith("VB")
]:
    print(tokenname, ':', tokentype)

print()
# Use the sample texts of NLTK
DownloadDictionary("nps_chat")
DownloadDictionary("webtext")

from nltk.book import text1, text2

# Search large texts
text1.concordance("kiss")
Example No. 4
from nltk.book import text1
from nltk.book import text4
from nltk.book import text6

text1.concordance("monstrous")  # concordance prints its output and returns None
text1.similar("monstrous")
text1.collocations()
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])

print(text6.count("Very"))
print(text6.count('the') / float(len(text6)) * 100)
print(text4.count("bless"))
print(text4[100])
print(text4.index('the'))
print(text4[524])
print(text4.index('men'))
print(text4[0:len(text4)])
Example No. 5
#!env python

from nltk.book import text1

# Every occurrence with context (concordance prints directly and returns None)
text1.concordance("monstrous")

text1.similar("monstrous")

# S5
Example No. 6
# If you only want one of them, say text1, do the obvious:
from nltk.book import text1

# text1, text2, ..., text9 are nine different text objects. More specifically, each is of class nltk.text.Text
text1[1]    # returns the second word (Python indexes start at 0) of the first book in the collection, Moby Dick.
            # text1[1] is a string object
text1[1][2] # the third letter of the second word in Moby Dick.

# You can of course define your own text (as a Python list object):
myText = ["This", "is", "my","text","and","there","is","nothing","you","can","do","about","it","!"]
myText[4]


## Searching Text ##
text1.concordance("monstrous")  # Searches for occurences of words and displays part of the lines where it appear 
                                # (i.e. it also gives the context in which the word appears) 

# How long is the book (words and other characters (tokens) like punctuation)?
len(text1)

# How long is the second word?
len(text1[1])

# Give me the set of distinct tokens!
set(text1)

# How many distinct tokens among the first 100 words?
len(set(text1[0:100]))

# Python is nice because return arguments can be directly indexed:
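# e.g. index the return value directly (assumed completion; the original snippet breaks off here):
sorted(set(text1))[0]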
Example No. 7
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 24 22:28:26 2019

@author: Vipul
"""

import nltk
"""
# Only once
nltk.download()
"""

from nltk.book import text1

# Concordance views show us every occurrence of a given word, together with some context
text1.concordance("crooked")
Example No. 8
print("hello world")
import nltk
nltk.download('book')
from nltk.book import text1
text1 = nltk.book.text1
text1.concordance('monstrous')  # search for concordances in text1
text1.similar('monstrous')
text2 = nltk.book.text2
text2.similar('monstrous')
text2.common_contexts(["monstrous", "very"])
text1.common_contexts(["monstrous", "whale"])
nltk.book.text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])
nltk.book.text3.generate()

# Counting Vocabulary
len(text1)
len(text2)

# Tokens (individual units of text) and vocabulary (distinct units)
len(nltk.book.text3)  # count tokens
len(set(nltk.book.text3))  # vocabulary size

# lexical richness of a text
len(set(text2)) / len(text2)


# Function
def lexical_diversity(text):
    return len(set(text)) / len(text)
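# Illustrative usage of the helper above (assumed; not in the original):
print(lexical_diversity(text2))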
Example No. 9
#/Users/randou/Esther/Brandeis/2019 Fall/LING131A NLP/Exercises
# -*- coding: utf-8 -*-

import nltk
nltk.download()
from nltk.book import *
from nltk.book import text1
import pandas as pd

# =============================================================================
# 1.3 Searching Text
# =============================================================================

text1.concordance('monstrous')  #appearance of a word
text1.similar('monstrous')  #words used in the similar context
text1.common_contexts(['monstrous', 'mystifying'])
text1.dispersion_plot(['love', 'peace', 'luck', 'fortune'])
text1.generate()

# =============================================================================
# 1.4 Counting Vocabulary
# =============================================================================

len(text1)
len(sorted(set(text1)))
len(set(text1)) / len(text1)  # lexical richness
text1.count('love')


def lexical_diversity(text):
    return len(set(text)) / len(text)
Example No. 10
from nltk.book import text1


def searchFor(substring):
    print('\nSearching for "%s":' % substring)
    text1.concordance(substring)  # concordance prints its matches and returns None
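# Hypothetical usage (not in the original):
searchFor("whale")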
Example No. 11
##############################

import urllib.request
url = "http://www.cs.tufts.edu/comp/116/access.log"
accesslog = urllib.request.urlopen(url).read().decode('utf-8')
print("accesslog: " + accesslog)


# import nltk

# nltk.download()

from nltk.book import text1
from nltk import FreqDist

text1.concordance("monstrous") # find all occurrences

text1.similar("monstrous")

text1.dispersion_plot(["citizens", "democracy"]) # location of words in text.

len(text1) # len in words / tokens.

sorted(set(text1))

len(set(text1)) / len(text1) # lexical richness.

text1.count("sun")

text1[122] # word 122 -> ignorance
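# FreqDist was imported above but never used; a minimal illustrative use (assumed):
fdist = FreqDist(text1)
print(fdist.most_common(10))  # the ten most frequent tokens in Moby Dick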
Example No. 12
from nltk.book import text1


def lexical_diversity(text):
    return len(set(text)) / len(text)


def percentage(count, total):
    return 100 * count / total


if __name__ == "__main__":
    text1.concordance("monstrous")  # shows every occurrence of a given word with context
    # text1.similar("monstrous")  # check for words used in similar contexts
    # text1.concordance("contemptible")  # concordance for another word
    text1.generate()
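    # Illustrative usage of the helpers defined above (assumed; not in the original):
    print(lexical_diversity(text1))
    print(percentage(text1.count("whale"), len(text1)))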
Example No. 13
import nltk
from nltk.corpus import wordnet as wn

food = wn.synset('food.n.01')  # assumed: `food` was defined earlier in the original file
print(food.member_holonyms())
print(food.part_holonyms())
print(food.substance_holonyms())

#6 In the discussion of comparative wordlists, we created an object called translate which you could look up using words in both German and Spanish in order to get corresponding words in English. What problem might arise with this approach? Can you suggest a way to avoid this problem?
from nltk.corpus import swadesh

translate = dict()
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate)
# Problem: a word spelled the same in German and Spanish gets a single entry,
# so the later update() silently overwrites the earlier translation.
# One fix: key the dictionary by (language, word) pairs instead of bare words.
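# A minimal sketch of that fix (assumed; not part of the original exercise code):
translate2 = {}
translate2.update({('de', w): e for w, e in de2en})
translate2.update({('es', w): e for w, e in es2en})
print(translate2[('de', 'Hund')])  # 'dog'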

#7 According to Strunk and White's Elements of Style, the word however, used at the start of a sentence, means "in whatever way" or "to whatever extent", and not "nevertheless". They give this example of correct usage: However you advise him, he will probably do as he thinks best. (http://www.bartleby.com/141/strunk3.html) Use the concordance tool to study actual usage of this word in the various texts we have been considering. See also the LanguageLog posting "Fossilized prejudices about 'however'" at http://itre.cis.upenn.edu/~myl/languagelog/archives/001913.html
# mobydick and sense_and_sensibility are assumed to be nltk.text.Text objects
# defined earlier in the original file (concordance prints directly, so no print() needed)
mobydick.concordance('however')
sense_and_sensibility.concordance('however')

#8 Define a conditional frequency distribution over the Names corpus that allows you to see which initial letters are more frequent for males vs. females (cf. 4.4).
# cfd against the last letter of each name, to check the well-known fact that
# names ending in the letter "a" are almost always female
from nltk.corpus import names

cfd = nltk.ConditionalFreqDist((fileid, name[-1]) for fileid in names.fileids()
                               for name in names.words(fileid))
cfd.plot()

#9 Pick a pair of texts and study the differences between them, in terms of vocabulary, vocabulary richness, genre, etc. Can you find pairs of words which have quite different meanings across the two texts, such as monstrous in Moby Dick and in Sense and Sensibility?
# The original file already had news and religion data from the brown corpus;
# recreated here (assumed) so the snippet runs standalone:
from nltk.corpus import brown
news_data = brown.words(categories='news')
religion_data = brown.words(categories='religion')
# concordance works on Text objects, so wrap the raw word lists in nltk.Text
news_data = nltk.Text(news_data)
religion_data = nltk.Text(religion_data)
#trying to find common words
news_fd = nltk.FreqDist(news_data)
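# A possible continuation (assumed; the original fragment stops here):
religion_fd = nltk.FreqDist(religion_data)
print(len(set(news_data)) / len(news_data))          # lexical richness, news
print(len(set(religion_data)) / len(religion_data))  # lexical richness, religion
print([w for w, _ in news_fd.most_common(20)])       # frequent tokens in news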
Example No. 15
from collections import Counter
import nltk
from nltk.book import text1, text2

file = open('manifesto.txt', encoding='utf-8')  # the 'rU' mode is gone in Python 3
raw_manifesto = file.read()

tokens = nltk.word_tokenize(raw_manifesto)
text = nltk.Text(tokens)

print(Counter(tokens))
distribution = nltk.FreqDist(text)
distribution.most_common(50)
distribution.plot()

text1.concordance("galactic")  # No galactic whales :'(

print "\n\n"
text1.similar("monstrous")

print "\n\n"
text2.similar("monstrous")
Example No. 16
# import init
from nltk.book import (text1, text2, text3, text4)

# searching text
text1.concordance("monstrous")
text2.common_contexts(["monstrous", "very"])
text3.concordance("lived")
text4.similar("monstrous")

arr = ["citizens", "democracy", "freedom", "duties", "America"]
# text4.dispersion_plot(arr)
Example No. 17
from nltk.book import text1

print(text1.concordance("man"))
Example No. 18
# coding: utf-8
import nltk as nk
from nltk.book import text1 as t1
from nltk.book import text4 as t4

print('=================================')
# Download the test data (only needed once)
# nk.download()

print('=============== Search for a keyword ===============')
t1.concordance("america")

print('=============== Find similar contexts ===============')
t1.similar("america")

print('=============== Common contexts ===============')
t1.common_contexts(['in', 'of'])

print('=============== Lexical dispersion plot ===============')
t4.dispersion_plot(['citizens', 'democracy', 'freedom', 'america'])

print('=============== Most frequent words ===============')
freList = nk.FreqDist(t1)
freList.plot(50, cumulative=False)

print('=============== Words longer than 15 characters ===============')
v = set(t1)
long_words = [w for w in v if len(w) > 15][:10]  # filter() is lazy in Python 3, so use a list comprehension
print(long_words)
Example No. 19
from nltk.book import text1, text4

# text1, text2, ..., text9 are nine different text objects. More specifically, each is of class nltk.text.Text
text1[1]  # returns the second word (Python indexes start at 0) of the first book in the collection, Moby Dick.
# text1[1] is a string object
text1[1][2]  # the third letter of the second word in Moby Dick.

# You can of course define your own text (as a Python list object):
myText = [
    "This", "is", "my", "text", "and", "there", "is", "nothing", "you", "can",
    "do", "about", "it", "!"
]
myText[4]

## Searching Text ##
text1.concordance("monstrous")  # Searches for occurrences of a word and displays part of the lines where it appears
# (i.e. it also gives the context in which the word appears)

#text1.similar("whatever") # Lists the words that are similar to some word (similar in which sense?)

#text2.common_contexts(["monstrous","very"])

# Dispersion plot - To find out WHERE a word appears (requires matplotlib)
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])

# How long is the book (words and other characters (tokens) like punctuation)?
len(text1)

# How long is the second word?
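len(text1[1])  # assumed completion; the original snippet breaks off here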
Example No. 20
# https://www.nltk.org/book/ch01.html
import string
import nltk
from nltk.tokenize import word_tokenize

nltk.download('book')
nltk.download('punkt')

from nltk.book import text1, text2, text3, text4, text5, text6, text7, text8, text9

text1
text1.concordance('monstrous')
len(text1)

comment = 'With the Senate trial of Mr. Trump now underway, we deployed a team of journalists to find out. We contacted hundreds of voters who had responded to an online survey saying they would be willing to be interviewed. We reached 81 people, from nearly 30 states. They were Democrats, Republicans and independents. They were retirees and real estate agents, teachers and stay-at-home parents. The youngest was 21; the oldest was 82. Even before the opening statements at the trial had begun, most had already made up their minds on their preferred verdict. As one independent voter from Ohio put it, “maybe they should ask the people what they should do. It should be our vote.”'


def count_words(s):
    s = str(s).lower()
    s = s.translate(str.maketrans('', '', string.punctuation))
    return len(word_tokenize(s))


count_words(comment)

# Average number of words
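# A possible continuation (assumed; the original breaks off here): average words per sentence
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(comment)
print(sum(count_words(s) for s in sentences) / len(sentences))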