-
Notifications
You must be signed in to change notification settings - Fork 1
/
_en-test.py
326 lines (258 loc) · 13 KB
/
_en-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#! /usr/bin/python2
# -*- coding: utf-8 -*-
import en
# This file just runs some tests to see if en is working.
# To run it, cd to the directory just above en, then
# python2 < _en-test.py
# LEXICAL CATEGORIZATION ############################################################
# The leading integer in every print() call is a running test index, so each line
# of output can be matched back to the call that produced it.
# Returns True when the given value is a number.
print(1, en.is_number(12))
print(2, en.is_number("twelve"))
# Returns True when the given string is a noun.
# You can also check for is_verb(), is_adjective() and is_adverb().
print(3, en.is_noun("banana"))
# Returns True when the given string is a tag,
# for example HTML or XML.
print(4, en.is_tag("</a>"))
# Returns True when the given string is an HTML tag,
# for example <a> or <body>.
# NOTE(review): "</person>" is not a standard HTML element, so this call
# presumably demonstrates the False case -- confirm against en.is_html_tag().
print(5, en.is_html_tag("</person>"))
# COMMONSENSE #######################################################################
# Returns True if the given word expresses a basic emotion:
# anger, disgust, fear, joy, sadness, surprise.
print(6, en.is_basic_emotion("cheerful"))
# Returns True if the given word is a persuasive ("magic") word:
# you, money, save, new, results, health, easy, ...
print(7, en.is_persuasive("money"))
# Returns True if the word is a connective:
# nevertheless, whatever, secondly, ...
# or a word like I, the, own, him, which has little semantic value.
print(8, en.is_connective("but"))
# NUMBERS ###########################################################################
# Returns the ordinal of the given number (given as an int or as a written-out string):
# 100 -> 100th, 3 -> 3rd
# twenty-one -> twenty-first
print(9, en.number.ordinal(100))
print(10, en.number.ordinal("twenty-one"))
# Writes out the given number:
# 25 -> twenty-five
print(11, en.number.spoken(25))
# QUANTIFICATION ####################################################################
# Quantifies the given word (a count plus a noun):
# 10 and chickens -> a number of chickens
# 300 and chickens -> hundreds of chickens
# NOTE(review): the call below passes 800 and the singular "chicken"; going by the
# examples above it presumably also yields "hundreds of chickens" -- confirm.
print(12, en.number.quantify(800, "chicken"))
# Quantifies a list of words:
# several chickens, a pair of geese and a duck
# Notice how goose is correctly pluralized and duck has the right article.
print(13, en.list.conjunction(["goose", "goose", "duck", "chicken", "chicken", "chicken"]))
# Quantifies the types of things in the given list:
# several integers
print(14, en.list.conjunction((1,2,3,4,5), generalize=True))
# You can also quantify a library (example only, not executed here):
# en.list.conjunction(en, generalize=True) ->
# a number of modules, a number of functions, a number of strings,
# a pair of lists, a pair of dictionaries, an en verb, an en sentence,
# an en number, an en noun, an en list, an en content, an en adverb,
# an en adjective, a None type and a DrawingPrimitives Context
# INDEFINITE ARTICLE ################################################################
# Returns the noun with its indefinite article:
# university -> a university
# owl -> an owl
# hour -> an hour
print(15, en.noun.article("university"))
# PLURALIZATION #####################################################################
# Pluralizes the given noun:
# kitchen knife -> kitchen knives
# part-of-speech -> parts-of-speech
# wolf -> wolves
# child -> children
# You can also do en.adjective.plural().
print(16, en.noun.plural("dog"))
# EMOTIONAL VALUE ###################################################################
# Guesses whether the given noun expresses an emotion,
# by checking if there are synonyms of the word that
# are basic emotions.
# Returns True or False by default.
print(17, en.noun.is_emotion("anger"))
# With boolean=False it returns a string which provides some information instead:
# anxious -> fear
# An additional optional parameter shallow=True
# speeds up the lookup process but doesn't check as many synonyms.
# You can also use verb.is_emotion(), adjective.is_emotion() and adverb.is_emotion().
print(18, en.adjective.is_emotion("anxious", boolean=False))
# WORDNET ###########################################################################
# WordNet describes semantic relations between synonym sets.
# Returns the dictionary description:
print(19, en.noun.gloss("book"))
# A word can have multiple senses,
# for example "tree" can mean a tree in a forest but also a tree diagram,
# or a person named Sir Herbert Beerbohm Tree:
print(20, en.noun.senses("tree"))
# Returns the dictionary entry for tree as in tree diagram:
print(21, en.noun.gloss("tree", sense=1))
# Returns a categorization for the given word:
# book -> communication
print(22, en.noun.lexname("book"))
# Returns examples of the given word:
# vehicle -> bumper car, craft, rocket, skibob, sled, steamroller, ...
print(23, en.noun.hyponym("vehicle"))
print(24, en.noun.hyponym("tree", sense=1))
# Returns abstractions of the given word:
# earth -> terrestrial planet
# earth as in dirt -> material
print(25, en.noun.hypernym("earth"))
print(26, en.noun.hypernym("earth", sense=1))
# You can also execute a deep query on hypernyms and hyponyms.
# Notice how returned values become more and more abstract:
# vehicle -> transport -> instrumentation -> artifact -> unit -> physical object -> entity
print(27, en.noun.hypernyms("vehicle", sense=0))
# Returns components of the given word:
# computer -> chip, diskette, keyboard, monitor, ...
# NOTE(review): in standard WordNet terminology the parts of a thing are its
# meronyms and the whole it belongs to is its holonym; the two descriptions here
# look swapped relative to that -- confirm what en.noun.holonym()/meronym() return.
print(28, en.noun.holonym("computer"))
# Returns the collection in which the given word can be found:
# tree -> forest
print(29, en.noun.meronym("tree"))
# Returns the semantic opposite of the word:
# black -> white
print(30, en.noun.antonym("black"))
# Finds out what two words have in common:
# cat and dog -> carnivore
print(31, en.noun.meet("cat", "dog", sense1=0, sense2=0))
# Returns an absurd description for the word:
# typography -> a business deal on a trivial scale
print(32, en.noun.absurd_gloss("typography"))
# The return value of a WordNet command is usually a list
# containing other lists of related words, for example:
# [['tree'], ['tree', 'tree diagram'], ['Tree', 'Sir Herbert Beerbohm Tree']]
# You can use the en.list.flatten() command to flatten the list:
print(33, en.list.flatten(en.noun.senses("tree")))
# -> ['tree', 'tree', 'tree diagram', 'Tree', 'Sir Herbert Beerbohm Tree']
# If you want a list of all nouns/verbs/adjectives/adverbs there's the
# en.wordnet.all_nouns(), en.wordnet.all_verbs() ... commands:
print(34, len(en.wordnet.all_nouns()))
# All of the commands shown here for nouns are also available for verbs, adjectives and adverbs,
# en.verb.hypernyms("run"), en.adjective.gloss("beautiful") etc. are valid commands.
# VERB CONJUGATION ##################################################################
# NodeBox English Linguistics knows the verb tenses for about 10000 English verbs.
# Returns the infinitive:
# swimming -> swim
print(35, en.verb.infinitive("swimming"))
# Returns the present tense, for the given person:
# gave -> give
# gave -> he gives
print(36, en.verb.present("gave"))
print(37, en.verb.present("gave", person=3, negate=False))
# Known values for person are 1, 2, 3, "1st", "2nd", "3rd", "plural", "*".
# Just use the one you like most.
# Returns the present participle tense:
# be -> being
print(38, en.verb.present_participle("be"))
# Returns the past tense:
# give -> gave
# be -> I wasn't
print(39, en.verb.past("give"))
print(40, en.verb.past("be", person=1, negate=True))
# Returns the past participle tense:
# be -> been
print(41, en.verb.past_participle("be"))
# Returns a list of all possible tenses:
print(42, en.verb.tenses())
# Returns the tense of the given verb:
# was -> 1st singular past
print(43, en.verb.tense("was"))
# Returns True if the given verb is in the given tense:
print(44, en.verb.is_tense("wasn't", "1st singular past", negated=True))
# NOTE(review): "does" is a 3rd-person form, so with person=1 this presumably
# demonstrates a False result -- confirm.
print(45, en.verb.is_present("does", person=1))
print(46, en.verb.is_present_participle("doing"))
print(47, en.verb.is_past_participle("done"))
# SHALLOW PARSING ###################################################################
# NodeBox English Linguistics is able to do sentence structure analysis using a
# combination of Jason Wiener's tagger and NLTK's chunker.
# The tagger assigns a part-of-speech tag to each word in the sentence using Brill's
# lexicon. A "postag" is something like NN or VBP marking words as nouns, verbs,
# determiners, pronouns, etc. The chunker is then able to group syntactic units
# in the sentence. A syntactic unit is a determiner followed by adjectives followed
# by a noun, for example, "the tasty little chicken" is a syntactic unit.
# Tags the given sentence.
# The return value is a list of (word, tag) tuples.
print(48, en.sentence.tag("this is so cool"))
# -> this/DT is/VBZ so/RB cool/JJ
# There are lots of part-of-speech tags and it takes some time getting to know them.
# This function returns a (description, examples) tuple for a given tag:
# NN -> ('noun, singular or mass', 'tiger, chair, laughter')
print(49, en.sentence.tag_description("NN"))
# Returns the chunked sentence.
# For example:
# we are going to school ->
# [['SP',
#   ['NP', ('we', 'PRP')],
#   ['AP',
#     ['VP', ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO')],
#     ['NP', ('school', 'NN')]]]]
# Now what does all this mean?
# NP are noun phrases, syntactic units describing a noun, for example: a big fish.
# VP are verb phrases, units of verbs and auxiliaries, for example: are going to.
# AP is a verb/argument structure, a verb phrase and a noun phrase being influenced.
# SP is a subject structure: a noun phrase which is the executor of a verb phrase
# or verb/argument structure.
from pprint import pprint
# The index is printed on its own line because pprint() writes the nested
# chunk list itself, spread over several lines.
print(50)
pprint( en.sentence.chunk("he is always trying to feed her with lies") )
# A handy traverse(sentence, cmd) command lets you feed a chunked sentence
# to your own command chunk by chunk:
print(51)
s = "we are going to school"
def callback(chunk, token, tag):
    """Print one line describing a single step of en.sentence.traverse().

    Intended to be passed as the command argument of
    en.sentence.traverse(sentence, cmd).

    chunk -- chunk tag for this step, or None when the step is a single token.
             When it is not None, the upper-cased description of the chunk tag
             is printed and token/tag are not used.
    token -- the word itself; printed only when chunk is None.
    tag   -- the token's part-of-speech tag; its description is printed in
             parentheses after the token when chunk is None.
    """
    # Index [0] picks the description out of the (description, examples)
    # tuple returned by en.sentence.tag_description().
    if chunk is not None:
        print(en.sentence.tag_description(chunk)[0].upper())
    else:
        # NOTE: under Python 2 without print_function this prints a 2-tuple,
        # the same as every other two-argument print() call in this script.
        print(token, "("+en.sentence.tag_description(tag)[0]+")")
# Blank lines before and after set the traverse() output apart from the
# numbered results around it.
print("")
en.sentence.traverse(s, callback)
print("")
# Finds tag patterns in sentences.
# NOTE(review): judging by the patterns below, parenthesized elements such as (JJ)
# look optional, literal words such as "and" or "very" can be mixed with tags, and
# "wild*" looks like a prefix wildcard -- confirm against en.sentence.find().
print(52, en.sentence.find("The quick brown fox jumped over the lazy dog?", "(JJ) JJ NN"))
print(53, en.sentence.find("The hairy hamsters visited the cruel dentist.", "JJ NN", chunked=False))
print(54, en.sentence.find("All sorts of strange and weird and mysterious things happened.", "JJ and JJ NN"))
print(55, en.sentence.find("All sorts of strange and weird and mysterious things happened.", "JJ and JJ (NN)"))
print(56, en.sentence.find("Hairy hamsters are animals, mammals, funny creatures, or just very cool things.", "(very) (JJ) NN", chunked=False))
print(57, en.sentence.find("Wildcards are pretty wild.", "wild*", chunked=False))
print(58, en.sentence.find("Hamsters, hairy hamsters, funny hairy hamsters!", "(JJ) (JJ) NN", chunked=False))
# Prints the chunking rules returned by en.sentence.chunk_rules().
# If you want, you could feed the chunker a list of your own regular expression
# units to chunk; mine are pretty basic as I'm not a linguist.
# NOTE(review): which command accepts custom rules is not shown here
# (presumably en.sentence.chunk()) -- confirm.
print(59, en.sentence.chunk_rules())
# SUMMARISATION #####################################################################
# NodeBox English Linguistics is able to strip keywords from a given text.
# Sample text used as input for the keyword extraction below.
txt = """
Art can describe several kinds of things: a study of creative skill, a process of
using the creative skill, a product of the creative skill, or the audience’s
experiencing of the creative skill. The creative arts (“art”’ as discipline) are
a collection of disciplines (“arts”) which produce artworks (“art” as objects) that
is compelled by a personal drive (“art” as activity) and echoes or reflects a message,
mood, or symbolism for the viewer to interpret (“art” as experience). Artworks can
be defined by purposeful, creative interpretations of limitless concepts or ideas in
order to communicate something to another person. Artworks can be explicitly made for
this purpose or interpreted based on images or objects.
Art is something that visually stimulates an individual's thoughts, emotions, beliefs
or ideas. Art is a realized expression of an idea-it can take many different forms
and serve many different purposes.
"""
print(60, en.content.keywords(txt, top=10, nouns=True, singularize=True, filters=[]))
# en.content.keywords() guesses a list of words that frequently occur in the given text.
# The return value is a list (length defined by top) of (count, word) tuples.
# When nouns is True, returns only nouns. The command also ignores connectives,
# numbers and tags.
# When singularize is True, attempts to singularize nouns in the text.
# The optional filters parameter is a list of words which the command should ignore.
# Assuming you would want to summarise web content you can use en.content.strip_tags()
# to strip out HTML and keep only textual content:
print(61, en.content.strip_tags("<a href='http://nodebox.net'>NodeBox</a>"))
# For example (not executed here, it needs network access):
# from urllib import urlopen
# html = urlopen("http://news.bbc.co.uk/").read()
# meta = ["news", "health", "uk", "version", "weather", "video", "sport", "return", "read", "help"]
# print(en.content.keywords(html, filters=meta))
# -> [(6, 'funeral'), (5, 'beirut'), (3, 'war'), (3, 'service'), (3, 'radio'), (3, 'mull'),
# (3, 'lebanon'), (3, 'islamist'), (3, 'function'), (3, 'female')]
# SPELLING CORRECTION ###############################################################
# Both inputs are deliberately misspelled ("elephant", "keyboard").
# NOTE(review): presumably suggest() returns candidate spellings and correct()
# returns a single best guess -- confirm against the en.spelling module.
print(62, en.spelling.suggest("elehpant"))
print(63, en.spelling.correct("kebyoard"))