def __init__(self, *args, **kwargs):
    # Forward every argument untouched to the parent initialiser.
    super().__init__(*args, **kwargs)

    if config.transliterate:
        # Transliteration mode: create an en_US dictionary and register a
        # fixed set of extra symbols as tokens.
        self.en_dict = enchant.Dict("en_US")
        extra_symbols = [
            '̄', '̣', '̐', '́', '़', "'ॉ", '̃', '_', 'ऑ', '^', '…', '°', '̂',
            '̱', 'ॅ', 'ऍ', ':'
        ]
        for symbol in extra_symbols:
            self._add_tokens(symbol)
    else:
        # Register one token per code point in 2304..2431 (U+0900..U+097F).
        for code_point in range(2304, 2432):
            self._add_tokens(chr(code_point))

    # NOTE(review): the two attributes below are assumed to be set on both
    # branches -- confirm against the intended layout of this method.
    # Symbol -> spelled-out replacement; the surrounding spaces are part of
    # each replacement string.
    self.mappings = {
        '$': ' dollar ',
        '@': ' at the rate ',
        '+': ' plus ',
        '<': ' less than ',
        '>': ' greater than ',
        '&': ' and ',
        '%': ' percent '
    }
    # Every word of the 'hindi.pos' corpus file, NFKC-normalised.
    self.hindi_words = [
        unicodedata.normalize('NFKC', corpus_word)
        for corpus_word in indian.words('hindi.pos')
    ]
def test_words(self):
    # The first three items returned by indian.words() must match exactly.
    expected_prefix = ['মহিষের', 'সন্তান', ':']
    self.assertEqual(indian.words()[:3], expected_prefix)
'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'), 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'), 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'), 'English: NPS Chat Corpus': lambda: nps_chat.words(), 'English: Wall Street Journal Corpus': lambda: treebank.words(), 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(), 'Dutch: Alpino Corpus': lambda: alpino.words(), 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(), 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(), 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words() } class CollocationsView: _BACKGROUND_COLOUR = '#FFF' # white def __init__(self):
def test_words(self):
    # The first three items returned by indian.words() must match exactly.
    expected_prefix = ["মহিষের", "সন্তান", ":"]
    self.assertEqual(indian.words()[:3], expected_prefix)
'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'), 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'), 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'), 'English: NPS Chat Corpus': lambda: nps_chat.words(), 'English: Wall Street Journal Corpus': lambda: treebank.words(), 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(), 'Dutch: Alpino Corpus': lambda: alpino.words(), 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(), 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(), 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words() } class CollocationsView: _BACKGROUND_COLOUR='#FFF' #white def __init__(self): self.queue = q.Queue()
"English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self):
Created on Mon Oct 12 11:11:06 2015 @author: suppu """ from nltk.corpus import indian ''' Let us generate a file having sentences in indian languages. The file is generated from the indian languages scorpus available ''' print "Number of charachetrs is:" for f in indian.fileids(): print f print len(indian.raw(f)) print "No of words in each language are:" for f in indian.fileids(): print f print len(indian.words(f)) print "Number of sentences in each language:" for f in indian.fileids(): print f print len(indian.sents(f)) '''POS for hindi ''' hindi_sent = indian.sents("hindi.pos") hsent = file("hws.txt", 'w') for i in hindi_sent: hsent.write(" ".join(i)) hpos = indian.tagged_sents("hindi.pos") hpossent = open("hpossent.txt", 'w') hpossent.seek(0) for i in hpos: for j in i:
from nltk.corpus import indian

print("Files of Indian languages:-")
# List the corpus files NLTK provides for each language.
print(indian.fileids())
print()

print("Language details :-")
# Report the number of characters, words and sentences in each corpus file.
for f in indian.fileids():
    print("Language :-", f)
    print(
        " No of Characters",
        len(indian.raw(f)),
    )
    print(" No of words :-", len(indian.words(f)))
    print(" No of Sentences :-", len(indian.sents(f)))
print()

print("Checking raw sentences of languages:-")
# Uncomment any of these to dump the raw text of one corpus file:
# print(indian.raw('bangla.pos'))
# print(indian.raw('hindi.pos'))
# print(indian.raw('marathi.pos'))
# print(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file, from Marathi language")
# The text written below is Marathi, so the encoding is pinned to UTF-8;
# the platform default encoding may be unable to represent it.
# NOTE(review): the handle is not closed in this part of the script --
# confirm that it is closed further down.
sentencesMarathi = open("marathiSentences.txt", "w", encoding="utf-8")

# Each sentence is a list of words; join them with spaces before writing.
# NOTE(review): no separator is written between sentences here -- confirm
# that this is intended.
for sentence in indian.sents('marathi.pos'):
    # print(sentence)
    sentencesMarathi.write(" ".join(sentence))
# Key of the _CORPORA entry that is used as the default selection.
_DEFAULT = "English: Brown Corpus (Humor)"

# Maps a human-readable corpus name to a zero-argument callable that returns
# the words of that corpus.  Wrapping each call in a lambda defers the corpus
# access until the entry is actually invoked.
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    # The file id is passed positionally: the reader's parameter is named
    # `fileids`, so the former `files=` keyword raised TypeError.
    "Hindi: Indian Languages Corpus": lambda: indian.words("hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
print(nltk.corpus.inaugural.words()) # nltk.download('state_union') print(nltk.corpus.state_union.words()) # nltk.download('webtext') print(nltk.corpus.webtext.words()) # tagged corpora print(brown.words()) print(brown.tagged_words()) print(brown.sents()) # doctest: +ELLIPSIS print(brown.tagged_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(brown.paras( categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(brown.tagged_paras( categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # nltk.download('indian') print(indian.words()) # doctest: +SKIP print(indian.tagged_words()) # doctest: +SKIP # nltk.download('universal_tagset') print(brown.tagged_sents( tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(conll2000.tagged_words( tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # chunked corpora print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE for tree in conll2000.chunked_sents()[:2]: print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # nltk.download('conll2002') print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE for tree in conll2002.chunked_sents()[:2]: print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # nltk.download('semcor')
# Exploration script for the NLTK "indian" corpus.
import nltk
from nltk.corpus import indian
import matplotlib as cdf

print(indian.raw())
print(indian.fileids())
print(indian.sents())

import matplotlib

word1 = 'country'
word2 = 'city'


def _prefix_hits():
    # Yield (target, first four characters of the file id) once for every
    # corpus word whose lowercase form starts with one of the two targets.
    # NOTE(review): both targets are English prefixes -- confirm that they
    # can match anything in this corpus.
    for fileid in indian.fileids():
        for w in indian.words(fileid):
            for target in [word1, word2]:
                if w.lower().startswith(target):
                    yield target, fileid[:4]


cfd = nltk.ConditionalFreqDist(_prefix_hits())
cfd.plot()
"English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words( categories=["news", "editorial", "reviews"] ), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words( categories="science_fiction" ), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top)