# Day 2 (First ).py
text = """ Dostoevsky,was the son of the doctor…..."""
# List the vocabulary (the sorted set of unique lowercase tokens)
vocab = sorted(set(text.lower().split()))
vocab
#Count the vocabulary
len(vocab)
# Count the occurrences of 'the'
text.count('the')
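# Note: str.count matches substrings, so 'the' above also matches 'there' and
# 'other'. A token-based count sketch (assuming whitespace splitting is adequate):
sum(1 for w in text.lower().split() if w == 'the')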
# Split into rough sentences and into whitespace-delimited words
text.split(".")
text.split()
#Example
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
# Decode the response using its declared charset (default utf-8)
encoding = response.info().get_param('charset', 'utf8')
text1 = response.read().decode(encoding)
text1
# Count the vocabulary size of the downloaded text
vocab1 = sorted(set(text1.lower().split()))
len(vocab1)
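# A quick look at the most frequent tokens with nltk.FreqDist; a minimal sketch,
# assuming nltk is installed (plain split() keeps punctuation attached to words):
import nltk
freq1 = nltk.FreqDist(text1.lower().split())
print(freq1.most_common(10))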
# Tokenize into sentences and words (requires the 'punkt' tokenizer models)
from nltk.tokenize import sent_tokenize, word_tokenize
print(len(sent_tokenize(text1)))
print(len(word_tokenize(text1)))
# Tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
print(sent_tokenize(EXAMPLE_TEXT))
print(word_tokenize(EXAMPLE_TEXT))
# Or word_tokenize can be used like this
for i in word_tokenize(EXAMPLE_TEXT):
    print(i)
# NLTK setup (run the install and download once)
#pip install nltk
import nltk
nltk.download('all')
#STOP WORDS
from nltk.corpus import stopwords
# List the default English stopwords
set(stopwords.words('english'))
#Example
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
print(stop_words)
word_tokens = word_tokenize(example_sent)
print(word_tokens)
#Create a filtered sentence
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
# or the loop above can be written as a list comprehension:
# filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence)
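# The same filtering written as one pipeline; this sketch also lowercases and
# drops non-alphabetic tokens, which is an extra assumption beyond the loop above:
filtered = [w for w in word_tokenize(example_sent.lower())
            if w.isalpha() and w not in stop_words]
print(filtered)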
# Remove punctuation from text
from string import punctuation
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)
text = "This is a sample sentence, showing off the stop words filtration."
print(strip_punctuation(text))
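# An equivalent sketch using str.translate, which is typically faster on long texts:
table = str.maketrans('', '', punctuation)
print(text.translate(table))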
# Remove numbers from text
text = "There was 200 people standing right next to me at 2pm."
output = ''.join(c for c in text if not c.isdigit())
print(output)
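# An equivalent regex sketch that removes whole digit runs in one pass:
import re
print(re.sub(r'\d+', '', text))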
# Remove HTML tags from text (naive regex approach)
import re
text = """<head><body>hello world!</body></head>"""
cleaned_text = re.sub('<[^<]+?>', '', text)
print(cleaned_text)
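# Regexes are fragile on real-world HTML; a stdlib sketch using html.parser instead:
from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []
    def handle_data(self, data):
        # Collect only the text between tags
        self.parts.append(data)

extractor = TextExtractor()
extractor.feed(text)
print(''.join(extractor.parts))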
# Regular expressions for detecting word patterns
import re
# List the lowercase words in NLTK's English word list
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# List the words ending in 'ed'
[w for w in wordlist if re.search('ed$', w)]
# The . wildcard matches any single character. Suppose a crossword puzzle has room
# for an 8-letter word with j as its third letter and t as its sixth letter;
# in place of each blank cell we use a period:
[w for w in wordlist if re.search('^..j..t..$', w)]
# Match words made up only of lowercase letters
[w for w in wordlist if re.match('^[a-z]+$', w)]
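# re.findall extracts every match instead of just testing for one; a sketch that
# pulls two-or-more-vowel sequences out of a word (the example word is arbitrary):
re.findall(r'[aeiou]{2,}', 'supercalifragilisticexpialidocious')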
#Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))
#Example stemming
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))
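# Porter is not the only stemmer in NLTK; a sketch comparing it with the
# Snowball (English) stemmer on the same example words:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer('english')
for w in example_words:
    print(w, '->', ps.stem(w), '/', snowball.stem(w))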
#Lemmatize
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
# Removing items from a list while iterating over it skips elements,
# so filter with a list comprehension instead
sentence_words = [word for word in sentence_words if word not in punctuations]
sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos="v")))
# POS (part-of-speech) tagging examples
nltk.pos_tag(['cat','cats'])
nltk.pos_tag(['take','took','taking','taken'])
nltk.pos_tag(['delicious'])
nltk.pos_tag(['slowly'])
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)
#EXAMPLE
from urllib import request
url = "https://en.wikipedia.org/wiki/George_Washington"
response = request.urlopen(url)
# Decode the response using its declared charset (note: this URL returns raw HTML)
encoding = response.info().get_param('charset', 'utf8')
text1 = response.read().decode(encoding)
text1
from nltk.tokenize import sent_tokenize, word_tokenize
print(len(sent_tokenize(text1)))
print(len(word_tokenize(text1)))
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Clean the downloaded page: strip punctuation and digits, then lemmatize and POS-tag
stop_words = set(stopwords.words('english'))
text2=strip_punctuation(text1)
wordnet_lemmatizer = WordNetLemmatizer()
word_tokens = nltk.word_tokenize(text2)
# Drop purely numeric tokens (joining with '' would merge the words together)
word_tokens1 = [t for t in word_tokens if not t.isdigit()]
print(word_tokens1)
lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
print (lemmatized_word)
# pos_tag expects a list of tokens; passing the raw string would tag each character
nltk.pos_tag(word_tokens1)
#brown
# In the rest of this chapter we explore ways to automatically add part-of-speech
# tags to text. The tag of a word depends on the word and its context within a
# sentence, so we work with data at the level of (tagged) sentences rather than
# words. We begin by loading the data.
from nltk.corpus import brown
brown_sents = brown.sents(categories='news')
brown_sents
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_tagged_sents
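# A minimal sketch of automatic tagging: train a unigram tagger on the tagged
# Brown sentences above (the 90/10 train/test split is an arbitrary choice):
import nltk
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print(unigram_tagger.tag(brown_sents[0]))
# On older NLTK versions use unigram_tagger.evaluate(test_sents) instead
print(unigram_tagger.accuracy(test_sents))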
#MODULE 3
from nltk.corpus import gutenberg
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
fileid = 'austen-emma.txt'
text = gutenberg.raw(fileid)
# File ids:
gutenberg.fileids()
# Raw text:
gutenberg.raw(fileid)
# Words:
gutenberg.words(fileid)
# Sentences:
gutenberg.sents(fileid)
from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)
for x in range(5):
    print(tok[x])
from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
# The Gutenberg fileid from above is not a Reuters id; pick a Reuters document
fileid = reuters.fileids()[0]
text = reuters.raw(fileid)
reuters.categories(fileid)
from nltk.corpus import movie_reviews
movie_reviews.fileids()
movie_reviews.categories()
fileid = movie_reviews.fileids()[0]
text = movie_reviews.raw(fileid)
movie_reviews.categories(fileid)
# Frequency distribution over a corpus of our own
from nltk.corpus import PlaintextCorpusReader
# PlaintextCorpusReader takes a corpus root directory plus a fileid pattern,
# not a single file path
corpus_root = 'C:/Users/arun/Desktop/ITRAIN/itrain python/Advanced/codes'
fileid = 'gaming.txt'
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
text = my_corpus.raw(fileid)
text
my_corpus.raw(fileid)
my_corpus.words(fileid)
my_corpus.sents(fileid)
# Count words, not characters: FreqDist over a raw string tallies single characters
distr = nltk.FreqDist(my_corpus.words(fileid))
print(distr.most_common(5))
#Reuters
from nltk.corpus import reuters
fileid = 'training/9865'
text = reuters.raw(fileid)
text
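# A closing sketch tying the corpus readers back to frequency counting:
# reuters.words(), unlike raw(), is already tokenized for us
import nltk
freq = nltk.FreqDist(w.lower() for w in reuters.words(fileid))
print(freq.most_common(10))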