def fun6():
    """Print the entries of the Rotokas Toolbox lexicon ('rotokas.dic').

    Lexical tools: Toolbox and Shoebox. A Toolbox file is a collection of
    entries, each made up of one or more fields. Most fields are optional
    or repeatable, so this kind of lexical resource cannot be handled as a
    table or spreadsheet. An entry is a series of attribute-value pairs,
    such as ('ps', 'V'), meaning the part of speech is 'V' (verb), and
    ('ge', 'gag'), meaning the English gloss is 'gag'. The last three
    pairs hold a Rotokas example sentence together with its Tok Pisin and
    English translations.
    """
    from nltk.corpus import toolbox

    # print() with a single argument produces the same output on Python 2
    # and Python 3; the bare print statement is a syntax error on Python 3.
    print(toolbox.entries('rotokas.dic'))
# NOTE(review): `translate` and `swadesh` are not bound anywhere in this
# excerpt -- presumably a dict built from Swadesh word pairs and
# nltk.corpus.swadesh, set up earlier; confirm against the full file.
# The bare expressions below compute a value and discard it.
translate['chien']
translate['jeter']
# Merge the 'de'/'en' and 'es'/'en' word pairs into the same lookup dict.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']
# Print the entries at indexes 139-142 for the seven listed language codes.
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
# Toolbox lexicon: fetch the entries of 'rotokas.dic' (result discarded).
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic')
# WordNet: look up synsets and lemmas for 'motorcar' / 'car' and query
# their names, definition and examples (results discarded).
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').definition()
wn.synset('car.n.01').examples()
wn.synset('car.n.01').lemmas()
wn.lemma('car.n.01.automobile')
wn.lemma('car.n.01.automobile').synset()
wn.lemma('car.n.01.automobile').name()
wn.synsets('car')
# NOTE(review): the body of this loop is not visible in this excerpt.
for synset in wn.synsets('car'):
import nltk
from nltk.corpus import toolbox

# Show only the first two entries of the Rotokas lexicon. print() is called
# as a function so the line parses on Python 3 as well as Python 2 (the bare
# print statement is a syntax error on Python 3).
print(toolbox.entries('rotokas.dic')[0:2])
print(swadesh.fileids()) fr2en = swadesh.entries(['fr','en']) print(fr2en) translate = dict(fr2en) print(translate['chien']) de2en = swadesh.entries(['de','en']) es2en = swadesh.entries(['es','en']) translate.update(dict(de2en)) translate.update(dict(es2en)) print(translate['Hund']) print(translate['perro']) languages = ['en','de','nl','es','fr','pt','la'] for i in [139,140,141,142]: print(swadesh.entries(languages)[i]) '''
# NOTE(review): the text up to the ''' above looks like the tail of a
# triple-quoted block opened before this excerpt (presumably a disabled
# Swadesh exercise); it is left untouched -- confirm against the full file.

# Lexical tools: Toolbox and Shoebox
from nltk.corpus import toolbox
# Print the entries of the Rotokas lexicon file 'rotokas.dic'.
print(toolbox.entries('rotokas.dic'))
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere') ('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere') ('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare') '''
# NOTE(review): the tuples up to the ''' above look like sample output at
# the end of a triple-quoted block opened before this excerpt; they are left
# untouched -- confirm against the full file.

# Shoebox and Toolbox Lexicons
#
# Perhaps the single most popular tool used by linguists for managing data
# is Toolbox, previously known as Shoebox since it replaces the field
# linguist's traditional shoebox full of file cards. Toolbox is freely
# downloadable from http://www.sil.org/computing/toolbox/.
#
# A Toolbox file consists of a collection of entries, where each entry is
# made up of one or more fields. Most fields are optional or repeatable,
# which means that this kind of lexical resource cannot be treated as a
# table or spreadsheet.
#
# Here is a dictionary for the Rotokas language. We see just the first
# entry, for the word kaa meaning "to gag":
from nltk.corpus import toolbox
toolbox.entries('rotokas.dic') # @UndefinedVariable
''' [('kaa', [('ps', 'V'), ('pt', 'A'), ('ge', 'gag'), ('tkp', 'nek i pas'), ('dcsv', 'true'), ('vx', '1'), ('sc', '???'), ('dt', '29/Oct/2005'), ('ex', 'Apoka ira kaaroi aioa-ia reoreopaoro.'), ('xp', 'Kaikai i pas long nek bilong Apoka bikos em i kaikai na toktok.'), ('xe', 'Apoka is gagging from food while talking.')]), ...] '''
# Entries consist of a series of attribute-value pairs, like ('ps', 'V') to
# indicate that the part-of-speech is 'V' (verb), and ('ge', 'gag') to
# indicate that the gloss-into-English is 'gag'. The last three pairs
# contain an example sentence in Rotokas and its translations into
# Tok Pisin and English.
#
# The loose structure of Toolbox files makes it hard for us to do much more
# with them at this stage. XML provides a powerful way to process this kind
# of corpus, and we will return to this topic in chapter 11.
#
# The Rotokas language is spoken on the island of Bougainville, Papua New
# Guinea. This lexicon was contributed to NLTK by Stuart Robinson.
# NOTE(review): `entries` is not bound in this excerpt; the code below
# assumes an iterable of (word, pronunciation) pairs where each
# pronunciation is a list of phone strings -- confirm against the full file.

# Words whose pronunciation ends with the four phones N IH0 K S.
syllable = ["N", "IH0", "K", "S"]
# The comprehension variable is named `w` so it no longer shadows the
# result name `word`.
word = [w for w, pron in entries if pron[-4:] == syllable]
print("syllable", word)


def stress(pron):
    """Return the digit characters found in the phones of *pron*, in order.

    Args:
        pron: an iterable of phone strings, e.g. ["N", "IH0", "K", "S"].

    Returns:
        A list of one-character strings, one per digit character
        encountered while scanning the phones left to right.
    """
    return [char for phone in pron for char in phone if char.isdigit()]


# Words whose stress digits are exactly 0 1 0 2 0.
stress_word = [w for w, pron in entries if stress(pron) == ["0", "1", "0", "2", "0"]]
print("stress_word", stress_word)

#-----------------------
from nltk.corpus import toolbox

# Rebinds `entries` to the Rotokas Toolbox lexicon.
entries = toolbox.entries("rotokas.dic")
print(entries)

#-----------------------
from nltk.corpus import wordnet as wn

lemmas = wn.lemmas("car")
synsets = wn.synsets("motorcar")
for synset in wn.synsets("car"):
    # lemma_names is a method in NLTK 3; without the call this printed the
    # bound method object instead of the list of names.
    print(synset.lemma_names())
print("lemmas", wn.lemmas("car"))
# Same fix: call the accessors so the variables hold data, not methods.
lemmas = wn.synset("car.n.01").lemmas()
lemma_name = wn.synset("car.n.01").lemma_names()
def toolbox():
    """Load and return the entries of the Rotokas Toolbox lexicon.

    Returns:
        The value of ``nltk.corpus.toolbox.entries('rotokas.dic')``.
    """
    # Import under an alias: this function is itself named ``toolbox``, so
    # inside its body a bare ``toolbox.entries`` resolves to the function
    # object and raises AttributeError instead of reaching the corpus reader.
    from nltk.corpus import toolbox as toolbox_corpus

    return toolbox_corpus.entries('rotokas.dic')