Example #1
from nltk.corpus import swadesh


def compareWordlist():

    swadesh.fileids()
    swadesh.words('en')

    fr2en = swadesh.entries(['fr', 'en'])
    fr2en

    translate = dict(fr2en)
    translate['chien']
    translate['jeter']

    de2en = swadesh.entries(['de', 'en'])    # German-English
    es2en = swadesh.entries(['es', 'en'])    # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    translate['Hund']
    translate['perro']

    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    for i in [139, 140, 141, 142]:
        print(swadesh.entries(languages)[i])
Example #2
# Swadesh core wordlist
from nltk.corpus import swadesh
print(swadesh.fileids())
print(swadesh.words('en'))

# Pass a list of languages to entries() to access cognate words across those languages
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
Example #3
# In[11]:

from nltk.corpus import swadesh
# available languages
print(swadesh.fileids())

# In[12]:

print(swadesh.words('en'))

# We translate from **French** into **Spanish**

# In[13]:

fr2es = swadesh.entries(['fr', 'es'])
print(fr2es)

# In[14]:

translate = dict(fr2es)
translate['chien']

# In[15]:

translate['jeter']

# # WordNet

# ## References
#
# Here you see a dictionary built from one language pair, German and English, covering both directions:

from nltk.corpus import swadesh

de_to_en = swadesh.entries(["de", "en"])
en_to_de = swadesh.entries(["en", "de"])

translate = dict(de_to_en)
translate1 = dict(en_to_de)

translate.update(dict(translate1))

print(translate["Hund"])
print(translate["dog"])
Example #5
from nltk.corpus import swadesh
from tkinter import *

en2de = swadesh.entries(['en', 'de'])  # English-German
translate = dict(en2de)


def translate_word():
    # Look up the lowercased English word in the Swadesh dictionary;
    # raises KeyError if the word is not in the list.
    eng_word = english_word.get().lower()
    german_words = translate[eng_word]
    list1.insert(END, german_words)


window = Tk()
window.wm_title("Translator")

# Labels for the English word and the German word
l1 = Label(window, text="English Word")
l1.grid(row=1, column=0)

l2 = Label(window, text="German Word")
l2.grid(row=1, column=2)

#Entry text
english_word = StringVar()
e1 = Entry(window, textvariable=english_word)
e1.grid(row=1, column=1)

list1 = Listbox(window, height=6, width=15)
list1.grid(row=2, column=3, columnspan=3)
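
# The original snippet is truncated here; a hedged completion that wires a button to
# translate_word() and starts the Tk event loop might look like this:
b1 = Button(window, text="Translate", command=translate_word)
b1.grid(row=1, column=3)

window.mainloop()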
# The lines below belong to a different, truncated exercise; the head of the
# ConditionalFreqDist is reconstructed here, assuming the state_union corpus as in the
# standard NLTK book exercise counting 'men', 'women', and 'people':
import nltk
from nltk.corpus import state_union

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_union.fileids()
                               for w in state_union.words(fileid)
                               for target in ['men', 'women', 'people']
                               if w.lower() == target)
cfd.tabulate()


# In[16]:

cfd.plot()


# #### 6.  In the discussion of comparative wordlists, we created an object called translate which you could look up using words in both German and Italian in order to get corresponding words in English. What problem might arise with this approach? Can you suggest a way to avoid this problem?

# In[17]:

from nltk.corpus import swadesh
de2en = swadesh.entries(['de', 'en'])
it2en = swadesh.entries(['it', 'en'])
translate2 = dict(de2en)
translate2.update(dict(it2en))
len(translate2)


# In[18]:

translate2['bianco']


# In[19]:

translate2['Hund']
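
# One way to avoid the clash (a sketch, not the book's official answer): keep a
# separate dictionary per source language instead of merging everything into one.
translators = {
    'de': dict(swadesh.entries(['de', 'en'])),
    'it': dict(swadesh.entries(['it', 'en'])),
}
translators['de']['Hund']    # German lookup
translators['it']['bianco']  # Italian lookup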
import nltk
from nltk.corpus import stopwords
stopwords.words('english')

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)

content_fraction(nltk.corpus.reuters.words())
content_fraction(nltk.corpus.inaugural.words())

# Translator
from nltk.corpus import swadesh
languages = ['en', 'ro', 'es', 'fr', 'pt', 'la']
for i in [100, 141, 143]:
    print(swadesh.entries(languages)[i])

# Wordnet 
#dictionary of English

from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').definition()
for synset in wn.synsets('car')[1:3]:
    print(synset.lemma_names())

# Depth of a synset
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
wn.synset('walk.v.01').entailments() #Walking involves stepping
Example #8
# Instead of the list of tuples, we can access the cmu dictionary as a
# python dictionary
prondict = nltk.corpus.cmudict.dict()
prondict['fire']

# 4.3 Comparative wordlist
#
# The swadesh comparative word list is a list of 200 common words in
# multiple languages.
from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')

# Use the word list to construct a translator
fr2en = swadesh.entries(['fr', 'en'])
translate = dict(fr2en)
translate['chien']
translate['jeter']

# We can also add in extra language by updating our dictionary, german
# and spanish are added.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))

# Spanish for dog
translate['perro']
# German for dog
translate['Hund']
Example #9
# Look up the pronunciation of each word in the CMU Pronouncing Dictionary
# (prondict reconstructed here, since the original snippet is truncated above this line)
import nltk
prondict = nltk.corpus.cmudict.dict()
text = ['natural', 'language', 'processing']
[ph for w in text for ph in prondict[w][0]]

# The [0] index takes just the first pronunciation, since 'natural' has two
[ph for w in text for ph in prondict[w]]
prondict['natural']

# p. 70, 4.3 Comparative wordlists (Swadesh wordlists): lists of roughly 200 common words in several languages, useful for comparing two languages or translating words between them
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']

de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))
translate['jeter']
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
Example #10
# (the snippet is truncated above this line; tree is assumed to be the 'tree.n.01' synset)
from nltk.corpus import wordnet as wn

tree = wn.synset('tree.n.01')
tree.member_meronyms()     # member meronyms: members of a tree, if any
tree.part_meronyms()       # part meronyms: structural parts such as trunk and crown
tree.substance_meronyms()  # substance meronyms: what a tree is made of
tree.member_holonyms()     # member holonyms: what trees are members of, typically a forest
tree.part_holonyms()       # part holonyms: expected to be empty here
tree.substance_holonyms()  # substance holonyms: usually empty as well, since these relations do not overlap
"""
6. ○在比较词表的讨论中, 我们创建了一个对象叫做translate, 通过它你可以使用德语
和意大利语词汇查找对应的英语词汇。这种方法可能会出现什么问题?你能提出一个办
法来避免这个问题吗?
如何知道输入的语言是德语还是意大利语呢,特别是在意大利语与德语词汇相同当语义不同的时候
"""
# Exercise 6
from nltk.corpus import swadesh

de2en = swadesh.entries(['de', 'en'])
translate = dict(de2en)
translate.update(swadesh.entries(['it', 'en']))
# Potential problem: there is no way to tell whether an input word is German or Italian,
# especially when the two languages spell a word identically but with different meanings.
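
# A hedged alternative (not from the original notes): key the merged dictionary by
# (language, word) pairs so identically spelled German and Italian entries cannot clash.
translate_by_lang = {}
translate_by_lang.update({('de', w): e for w, e in swadesh.entries(['de', 'en'])})
translate_by_lang.update({('it', w): e for w, e in swadesh.entries(['it', 'en'])})
translate_by_lang[('de', 'Hund')]  # German lookup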
"""
7. ○根据Strunk和 White 的《 Elements of Style》, 词 however在句子开头使用是“ in wh
atever way” 或“ to whatever extent” 的意思,而没有“ nevertheless” 的意思。他们给
出了正确用法的例子: However you advise him, he will probably do as he thinks bes
t.( http://www.bartleby.com/141/strunk3.html)。使用词汇索引工具在我们一直在思考的
各种文本中研究这个词的实际用法。也可以看 LanguageLog发布在 http://itre.cis.upenn.
edu/~myl/languagelog/archives/001913.html上的 “ Fossilized prejudices about ‘ however’”。
"""
"""
8. ◑在名字语料库上定义一个条件频率分布,显示哪个首字母在男性名字中比在女性名字
中更常用(见图 2-7)。
"""
Example #11
def compare_germanic_and_latin_words():
    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'it', 'la']
    for i in [139, 140, 141, 142]:
        print(swadesh.entries(languages)[i])
Example #12
from nltk.corpus import swadesh
from nltk.corpus import wordnet as wn

# print(swadesh.fileids())

es2en = swadesh.entries(['es', 'en'])

translate = dict(es2en)

words = ['cenizas', 'nieve', 'hincharse']

for w in words:
    print(w, " = ", translate[w])

# for w in translate:
# print(w)

computer = wn.synsets('computer')[0]
print("The hypernyms of computer are ", computer.hypernyms())
print("The hyponyms of computer are ", computer.hyponyms())

automobile = wn.synsets('automobile')[0]
print("The meronyms of automobile are ", automobile.part_meronyms())

bird = wn.synsets('bird')[0]
print("The holonyms of bird are ", bird.member_holonyms())
# -*-coding:utf-8-*-
"""
This module is an example for Swadesh corpus retrieval
"""

import re
import numpy as np
from nltk.corpus import swadesh

__author__ = "besnier"

germanic_languages = ["en", "de", "nl"]
roman_languages = ["fr", "es", "it"]
alphabet = list('azertyuiopqsdfghjklmwxcvbn')

to_aligner_ger = swadesh.entries(germanic_languages)
to_aligner_rom = swadesh.entries(roman_languages)


def vocabulary_retrieve(languages, normalize):
    """
    Load and normalize corpora according to chosen languages
    :param languages:
    :param normalize:
    :return:
    """
    to_align = swadesh.entries(languages)
    normalised_words = []
    characters = set()
    for i, mots in enumerate(to_align):
        normalised_words.append([])
import nltk

from nltk.corpus import swadesh

# swadesh dict
print(swadesh.fileids())
print(swadesh.words('en'))

# a sample dict
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en[:5])
translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'])

# compare Germanic with Latin words
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
Example #15
def translate(frm, to, word):
    from nltk.corpus import swadesh
    frm2to = swadesh.entries([frm, to])  # from -> to
    translate = dict(frm2to)
    return translate[word]
# The indented lines below belong to a separate, truncated CMU-dict example; the
# enclosing loop is reconstructed here, assuming cfd is a ConditionalFreqDist over
# pronunciation templates as in the NLTK book:
for template in sorted(cfd.conditions()):
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")

import nltk

prondict = nltk.corpus.cmudict.dict()

text = ['natural', 'language', 'processing']
[ph for w in text for ph in prondict[w][0]]

# Comparative wordlist
from nltk.corpus import swadesh
swadesh.fileids()

swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
fr2en

translate = dict(fr2en)
translate['chien']
translate['jeter']

de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])


def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)


content_fraction(nltk.corpus.reuters.words())
content_fraction(nltk.corpus.inaugural.words())

# Translator
from nltk.corpus import swadesh

languages = ['en', 'ro', 'es', 'fr', 'pt', 'la']
for i in [100, 141, 143]:
    print(swadesh.entries(languages)[i])

# Wordnet
#dictionary of English

from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()
wn.synset('car.n.01').definition()
for synset in wn.synsets('car')[1:3]:
    print(synset.lemma_names())

# Depth of a synset
wn.synset('whale.n.02').min_depth()
wn.synset('vertebrate.n.01').min_depth()
Example #18
# Assumed imports for this snippet (the original file is truncated above this line;
# `languages` and `country_codes` are presumably defined earlier in that file):
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.corpus import swadesh

lang2country = {languages[i]: country_codes[i] for i in range(len(languages))}

#Country_codes converted to pandas dataframe so as to use them in maps

country_codes = pd.DataFrame(country_codes)

#list of words of different languages stored
x = []

#index of the particular language in the above list 
indx = dict()

i = 0 
for lang in languages:
	a = swadesh.entries(['en',lang])
	#storing words of each language in list
	x.append(swadesh.entries([lang]))
	#storing index of each language
	indx[lang] = i
	i = i + 1 

country_codes['lang'] = languages


#Check box on sidebar to be unchecked if user wants to see map 
if  st.sidebar.checkbox('Check to see country\'s language code' ,False):
	st.subheader('Hover your mouse over countries to know the language code of various countries')
	st.markdown('Only blue coloured countries are available for translation')
	#Map with country_codes being used as locations
	fig = px.choropleth(country_codes,locations =country_codes[:][0] ,
Example #19
File: 06.py  Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words("en")
fr2en = swadesh.entries(["fr", "en"])
fr2en
translate = dict(fr2en)
translate["chien"]
translate["jeter"]
Example #20
"""2.	Використовуючи компаративний словник знайти для німецької,
 італійської та англійської мов близькі слова. Чи можуть отримані
 результати використовуватися для здійснення перекладу?"""
from nltk.corpus import swadesh
print(swadesh.entries(['de', 'en', 'it']))
# ('lang', 'long', 'lungo') - long
# 'Nase', 'nose', 'naso' - nose
# 'Name', 'name', 'nome' - name
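
# A rough way to surface such look-alike triples automatically (an illustrative sketch,
# not part of the original answer): keep only triples whose German or Italian form is
# within a small edit distance of the English one.
import nltk
for de, en, it in swadesh.entries(['de', 'en', 'it']):
    if nltk.edit_distance(de.lower(), en.lower()) <= 2 or nltk.edit_distance(it.lower(), en.lower()) <= 2:
        print(de, en, it)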
Example #21
__author__ = 'lizhifeng'
from nltk.corpus import swadesh

print(swadesh.fileids())
print(swadesh.words('en'))

fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)

translate = dict(fr2en)
print(translate["chien"])
# (fragment from a truncated WordNet example; the call that produced the first output
# line below was lost in the cut)
from nltk.corpus import wordnet as wn
# [Synset('forest.n.01')]
wn.synset('electrode.n.01').part_holonyms() # @UndefinedVariable
# [Synset('battery.n.02'), Synset('electrolytic_cell.n.01'), Synset('electronic_equipment.n.01'), Synset('tube.n.02')]
wn.synset('terminal.n.02').part_holonyms() # @UndefinedVariable
# [Synset('battery.n.02'), Synset('electrical_device.n.01')]
wn.synset('calcium_carbonate.n.01').substance_holonyms() # @UndefinedVariable
# [Synset('calcite.n.01'), Synset('chalk.n.01')]

# 6.☼ In the discussion of comparative wordlists, 
# we created an object called translate which you could look up using words 
# in both German and Spanish in order to get corresponding words in English. 
# What problem might arise with this approach? 
# Can you suggest a way to avoid this problem?
from nltk.corpus import swadesh
translate = dict()
de2en = swadesh.entries(['de', 'en'])    # German-English
es2en = swadesh.entries(['es', 'en'])    # Spanish-English
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
# 'dog'
translate['perro']
# 'dog'
# Problem: the lookup keys are used as-is, which is not robust; e.g. translate['hund']
# does not return the expected result.
# Solution: build the (key, value) pairs yourself and update the dictionary again,
# normalising the keys as needed (ignoring case, handling plural forms, stemming, etc.).
# The original capitalised entries remain in the dictionary alongside the new ones.
translate.update(dict((key.lower(), value) for key, value in de2en))
translate.update(dict((key.lower(), value) for key, value in es2en))
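
# With the lowercased keys merged in, a lowercase query now succeeds as well:
translate['hund']
# 'dog'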

# 7.☼ According to Strunk and White's Elements of Style, the word however, 
# used at the start of a sentence, means "in whatever way" or "to whatever extent", 
Example #23
import nltk
from nltk.corpus import swadesh

en2ca = swadesh.entries(['en', 'ca'])
translate = dict(en2ca)
print(translate['dog'])
from nltk.corpus import swadesh

print(swadesh.fileids())
# prints the two-letter language codes.

print()

en_to_de = swadesh.entries(["en", "de"])
print(en_to_de)
# prints a list of (English, German) word pairs.

print()

translate = dict(en_to_de)
print(translate["dog"])
# prints out a German translation of "dog".