示例#1
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent class, then register extra tokens and tables.

     All positional and keyword arguments are forwarded unchanged to the
     parent initialiser.  When ``config.transliterate`` is truthy an
     ``en_US`` enchant dictionary is created and a fixed list of extra
     symbols is registered through ``_add_tokens``; otherwise every code
     point from 2304 to 2431 (U+0900-U+097F, the Devanagari block) is
     registered instead.
     """
     super().__init__(*args, **kwargs)
     if config.transliterate:
         self.en_dict = enchant.Dict("en_US")
         extra_tokens = [
             '̄', '̣', '̐', '́', '़', "'ॉ", '̃', '_', 'ऑ', '^', '…',
             '°', '̂', '̱', 'ॅ', 'ऍ', ':'
         ]
         for token in extra_tokens:
             self._add_tokens(token)
     else:
         for code_point in range(0x0900, 0x0980):
             self._add_tokens(chr(code_point))

     # Symbol -> space-padded English wording.
     self.mappings = {
         '$': ' dollar ',
         '@': ' at the rate ',
         '+': ' plus ',
         '<': ' less than ',
         '>': ' greater than ',
         '&': ' and ',
         '%': ' percent ',
     }

     # NFKC-normalised copy of every word in the 'hindi.pos' corpus file.
     corpus_words = indian.words('hindi.pos')
     self.hindi_words = [
         unicodedata.normalize('NFKC', corpus_word)
         for corpus_word in corpus_words
     ]
示例#2
0
 def test_words(self):
     # The first three tokens returned by indian.words() must match exactly.
     expected_prefix = ['মহিষের', 'সন্তান', ':']
     self.assertEqual(indian.words()[:3], expected_prefix)
示例#3
0
    'English: Brown Corpus (Science Fiction)':
    lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)':
    lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
    lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
 def test_words(self):
     # The first three tokens returned by indian.words() must match exactly.
     expected_prefix = ['মহিষের', 'সন্তান', ':']
     self.assertEqual(indian.words()[:3], expected_prefix)
示例#5
0
 def test_words(self):
     # The first three tokens returned by indian.words() must match exactly.
     expected_prefix = ["মহিষের", "সন্তান", ":"]
     self.assertEqual(indian.words()[:3], expected_prefix)
示例#6
0
            'English: Brown Corpus (Science Fiction)':
                lambda: brown.words(categories='science_fiction'),
            'English: Brown Corpus (Romance)':
                lambda: brown.words(categories='romance'),
            'English: Brown Corpus (Humor)':
                lambda: brown.words(categories='humor'),
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    """View object that holds the queue created in its constructor."""

    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        """Create the queue stored on ``self.queue``."""
        self.queue = q.Queue()
示例#7
0
    "English: Brown Corpus (Science Fiction)":
    lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)":
    lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)":
    lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus":
    lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus":
    lambda: treebank.words(),
    "Chinese: Sinica Corpus":
    lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus":
    lambda: alpino.words(),
    "Hindi: Indian Languages Corpus":
    lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)":
    lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)":
    lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)":
    lambda: machado.words(),
    "Spanish: CESS-ESP Corpus":
    lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
示例#8
0
 def test_words(self):
     # The first three tokens returned by indian.words() must match exactly.
     expected_prefix = ["মহিষের", "সন্তান", ":"]
     self.assertEqual(indian.words()[:3], expected_prefix)
示例#9
0
Created on Mon Oct 12 11:11:06 2015

@author: suppu
"""
from nltk.corpus import indian
'''
Let us generate a file containing sentences in Indian languages. The file is generated from the Indian languages corpus available in NLTK.
'''
print "Number of charachetrs is:"
for f in indian.fileids():
    print f
    print len(indian.raw(f))
print "No of words in each language are:"
for f in indian.fileids():
    print f
    print len(indian.words(f))
print "Number of sentences in each language:"
for f in indian.fileids():
    print f
    print len(indian.sents(f))
'''POS for hindi
'''
hindi_sent = indian.sents("hindi.pos")
hsent = file("hws.txt", 'w')
for i in hindi_sent:
    hsent.write(" ".join(i))
hpos = indian.tagged_sents("hindi.pos")
hpossent = open("hpossent.txt", 'w')
hpossent.seek(0)
for i in hpos:
    for j in i:
示例#10
0
from nltk.corpus import indian

print("Files of Indian languages:-")
# List the file ids available in the NLTK Indian languages corpus.
print(indian.fileids())
print()

print("Language details :-")
# Report the character, word and sentence counts of every corpus file.
for f in indian.fileids():
    print("Language :-", f)
    char_count = len(indian.raw(f))
    print("     No of Characters", char_count)
    word_count = len(indian.words(f))
    print("     No of words :-", word_count)
    sentence_count = len(indian.sents(f))
    print("     No of Sentences :-", sentence_count)
print()

print("Checking raw sentences of languages:-")
# print(indian.raw(indian.raw('bangla.pos'))
# print(indian.raw(indian.raw('hindi.pos'))
# print(indian.raw(indian.raw('marathi.pos'))
# print(indian.raw(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file,  from Marathi language")
# Marathi text is non-ASCII, so pin the encoding explicitly instead of
# relying on the platform default, which cannot encode it on some systems.
sentencesMarathi = open("marathiSentences.txt", "w", encoding="utf-8")
# Each sentence is a list of words; it is written as one space-joined string.
# NOTE(review): sentences are written back-to-back with no separator between
# them, and the handle is not closed in this section - confirm both are intended.
for sentence in indian.sents('marathi.pos'):
    sentencesMarathi.write(" ".join(sentence))
示例#11
0
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """View object that owns the queue, the model and the ``Tk`` instance."""

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, the model fed by it, and the ``Tk`` top-level.

        The freshly created ``Tk`` instance is handed to ``_init_top`` for
        further setup.
        """
        self.queue = message_queue = q.Queue()
        self.model = CollocationsModel(message_queue)
        self.top = root = Tk()
        self._init_top(root)
示例#12
0
# Untagged corpora: print the words of the inaugural, state_union and webtext
# corpora.  The commented-out nltk.download(...) lines presumably name the
# data package each following call needs - confirm before enabling them.
# NOTE(review): the trailing "# doctest: ..." directives look like leftovers
# from a doctest; in a plain script they are ordinary comments.
print(nltk.corpus.inaugural.words())
# nltk.download('state_union')
print(nltk.corpus.state_union.words())
# nltk.download('webtext')
print(nltk.corpus.webtext.words())
# tagged corpora
# Brown corpus: plain and tagged words, sentences and (for the 'reviews'
# category) paragraphs.
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.tagged_paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('indian')
# Indian languages corpus: plain and tagged words.
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP
# nltk.download('universal_tagset')
# Same tagged views, requested with tagset='universal'.
print(brown.tagged_sents(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2000.tagged_words(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# chunked corpora
# Print the sentences, then the first two chunked sentences, of each corpus.
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('semcor')
示例#13
0
#This is for my corpus (indian)

import nltk
from nltk.corpus import indian
import matplotlib as cdf

# Dump the raw text, the file ids and the sentences of the Indian corpus.
print(indian.raw())
print(indian.fileids())
print(indian.sents())

import matplotlib

word1 = 'country'
word2 = 'city'


def _prefix_matches(prefixes):
    """Yield (prefix, first four chars of the file id) for each matching word.

    A word matches a prefix when its lower-cased form starts with it.
    """
    for fileid in indian.fileids():
        label = fileid[:4]
        for word in indian.words(fileid):
            lowered = word.lower()
            for prefix in prefixes:
                if lowered.startswith(prefix):
                    yield prefix, label


# Count, per prefix, how often it occurs in each corpus file, then plot it.
cfd = nltk.ConditionalFreqDist(_prefix_matches([word1, word2]))
cfd.plot()
示例#14
0
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """View object that owns the queue, the model and the ``Tk`` instance."""

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, the model fed by it, and the ``Tk`` top-level.

        The freshly created ``Tk`` instance is handed to ``_init_top`` for
        further setup.
        """
        self.queue = message_queue = q.Queue()
        self.model = CollocationsModel(message_queue)
        self.top = root = Tk()
        self._init_top(root)