Пример #1
0
    'English: Brown Corpus (Romance)':
    lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
    lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    """Holds a queue and the ``CollocationsModel`` constructed with that queue.

    NOTE(review): only the start of the class is visible in this excerpt;
    further attributes and methods may follow.
    """

    _BACKGROUND_COLOUR = '#FFF'  # white

    def __init__(self):
        """Create the queue and pass it to a new ``CollocationsModel``."""
        # The same queue object is stored on the view and given to the model.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
Пример #2
0
 def test_words(self):
     """Check that the first ten Floresta tokens are the expected ones."""
     expected = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a".split()
     self.assertEqual(floresta.words()[:10], expected)
Пример #3
0
            'English: Brown Corpus (Romance)':
                lambda: brown.words(categories='romance'),
            'English: Brown Corpus (Humor)':
                lambda: brown.words(categories='humor'),
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    """Holds a queue, a ``CollocationsModel`` built on it, and a ``Tk`` object.

    NOTE(review): only the start of the class is visible in this excerpt;
    further attributes and methods may follow.
    """

    _BACKGROUND_COLOUR='#FFF'  # white

    def __init__(self):
        """Create the queue, the model that receives it, and the ``Tk`` object."""
        # The same queue object is stored on the view and given to the model.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        # Presumably tkinter's root window -- confirm against the file's imports.
        self.top = Tk()
 def test_words(self):
     """Check that the first ten Floresta tokens are the expected ones."""
     expected = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a".split()
     self.assertEqual(floresta.words()[:10], expected)
Пример #5
0
from nltk.corpus import mac_morpho, floresta
from collections import Counter
import pickle, re

# Tokenize the lower-cased word list. The file is opened in a ``with`` block
# so the handle is closed as soon as its contents have been read.
with open('wordlists/palavras.txt', encoding='utf8') as arq:
    palavras = re.findall(r"[\w'-]+", arq.read().lower())

# Lower-cased tokens of both corpora, MAC-MORPHO first and Floresta second.
# Lower-casing while iterating avoids materializing two extra full-size lists.
corpus = [x.lower() for x in mac_morpho.words()]
corpus.extend(x.lower() for x in floresta.words())

# Corpus tokens followed by the word-list tokens.
tudo = corpus + palavras

# Frequency of every token across both sources.
contagem = Counter(tudo)

# Persist the frequency table for later use.
with open("wordlists/dicionario.bin", 'wb') as arq:
    pickle.dump(contagem, arq)
def dicionario():
    """Append every Floresta word to the global ``dic``.

    Each word is lower-cased and passed through ``unidecode`` before it is
    added, in corpus order.
    """
    global dic

    dic.extend(unidecode(palavra.lower()) for palavra in floresta.words())
Пример #7
0
# ◑ Obtain some tagged data for another language, and train and evaluate a variety of taggers on it. If the language is morphologically complex, or if there are any orthographic clues (e.g. capitalization) to word classes, consider developing a regular expression tagger for it (ordered after the unigram tagger, and before the default tagger). How does the accuracy of your tagger(s) compare with the same taggers run on English data? Discuss any issues you encounter in applying these methods to the language.

import nltk
from nltk.corpus import floresta

text = floresta.words()
floresta_tagged_sents = floresta.tagged_sents()
floresta_tagged_words = floresta.tagged_words()
fd = nltk.FreqDist(text)
cfd = nltk.ConditionalFreqDist(floresta_tagged_words)
most_freq_words = fd.most_common(100)

# Lookup tagger: map each of the 100 most frequent words to its most likely tag.
likely_tags = {word: cfd[word].max() for (word, _) in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

# Trained unigram tagger: the first 90% of the tagged sentences are used for
# training and the remaining 10% for evaluation.
size = int(len(floresta_tagged_sents) * 0.9)
# The split must slice the tagged sentences loaded above; the previously used
# name ``tagged_text`` was never defined.
training_data = floresta_tagged_sents[:size]
test_data = floresta_tagged_sents[size:]

# Training sentences go in the first (``train``) argument; ``model=`` expects
# a ready-made word -> tag mapping like ``likely_tags`` above.
uni_tagger = nltk.UnigramTagger(training_data)
uni_tagger.evaluate(test_data)
Пример #8
0
# ◑ Obtain some tagged data for another language, and train and evaluate a variety of taggers on it. If the language is morphologically complex, or if there are any orthographic clues (e.g. capitalization) to word classes, consider developing a regular expression tagger for it (ordered after the unigram tagger, and before the default tagger). How does the accuracy of your tagger(s) compare with the same taggers run on English data? Discuss any issues you encounter in applying these methods to the language.

import nltk
from nltk.corpus import floresta

text = floresta.words()
floresta_tagged_sents = floresta.tagged_sents()
floresta_tagged_words = floresta.tagged_words()
fd = nltk.FreqDist(text)
cfd = nltk.ConditionalFreqDist(floresta_tagged_words)
most_freq_words = fd.most_common(100)

# Lookup tagger: map each of the 100 most frequent words to its most likely tag.
likely_tags = {word: cfd[word].max() for (word, _) in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)


# Trained unigram tagger: the first 90% of the tagged sentences are used for
# training and the remaining 10% for evaluation.
size = int(len(floresta_tagged_sents) * 0.9)
# The split must slice the tagged sentences loaded above; the previously used
# name ``tagged_text`` was never defined.
training_data = floresta_tagged_sents[:size]
test_data = floresta_tagged_sents[size:]

# Training sentences go in the first (``train``) argument; ``model=`` expects
# a ready-made word -> tag mapping like ``likely_tags`` above.
uni_tagger = nltk.UnigramTagger(training_data)
uni_tagger.evaluate(test_data)
Пример #9
0
# Keep only the tokens that are not Portuguese stop words.
texto_tok_nosw = [token for token in texto_tok if token not in stopwords_pt]

print('Lista de tokens com stop words removidas:', texto_tok_nosw)

# Stemming with NLTK: RSLPStemmer is the stemmer used here for Portuguese.
stemmer = nltk.stem.RSLPStemmer()
print(stemmer.stem('amor'))
print(stemmer.stem('amar'))
print(stemmer.stem('amaria'), '\n')

# Frequency counting with NLTK: build a FreqDist over the words that
# floresta.words() returns for the corpus.
fd = nltk.FreqDist(floresta.words())
print('Objeto FrequencyDist construído:', fd, '\n')
print('Dez palavras mais comuns no córpus:', fd.most_common(10), '\n')
print('Número total de palavras no córpus:', fd.N(), '\n')
print(
    'Número de tipos do córpus, ou número total de palavras no seu vocabulário:',
    len(fd),
    '\n',
)
# Plot the frequency distribution of the 50 most common words in the corpus.
fd.plot(50)

# Example use of the Text class provided by NLTK.
text = nltk.Text(floresta.words())
print('Objeto Text criado:', text, '\n')

print("Busca de ocorrências pala palavra 'lugar' no córpus:")
Пример #10
0
# Report the tree total accumulated for the treebank corpus (the accumulation
# itself happens before this excerpt).
print('total_arvores treebank=',acc)


# Count the parse trees in the first file id of the 'floresta' corpus
# (per the original note, the only text available in that corpus).
acc=0
for i in floresta.fileids()[:1]:
  lf = len(floresta.parsed_sents(i))
  acc = acc+lf
print('total_arvores floresta=',acc)

# Re-enable stderr so that error messages are printed again.
enable_stderr(r)

# Notebook note (in Portuguese): next, inspect the total number of words in
# each corpus; it remarks that floresta has considerably more words.
"""Vamos agora inspecionar os totais de palavras de cada corpus. Percebam que o corpus floresta é bem mais rico em número de palavras."""

print("floresta.words=",len(floresta.words()), "\ntreebank.words=", len(treebank.words()))

"""Vamos agora aprender a percorrer as árvores de parsing do corpus 'floresta' e normalizar as regras de produção, evitando aquelas que não podem ser normalizadas.  

Para fazer isso, vamos empregar tratamento de exceções. Ao executar o código, é possível perceber que apenas uma pequena quantidade de árvores de  parsing não pode ser normalizada.
"""

# Disable stderr; to turn it back on, call enable_stderr(r).
r=disable_stderr()

from nltk import treetransforms
from nltk import induce_pcfg
from nltk import Nonterminal

# Counter for trees processed successfully (the original note also mentions a
# counter for failed trees, which is not visible in this excerpt).
ok=0;
Пример #11
0
 def get_words(self, document_id):
     """Return the words of the Floresta corpus.

     ``document_id`` is accepted but not used: the same corpus-wide word
     sequence is returned whatever its value.
     """
     corpus_words = floresta.words()
     return corpus_words
Пример #12
0
# Label of the corpus entry treated as the default; it must be a key of _CORPORA.
_DEFAULT = "English: Brown Corpus (Humor)"

# Display label -> zero-argument loader. Each loader is a lambda, so the
# corpus reader is only touched when the loader is actually called.
_CORPORA = {
    # Catalan
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    # English: Brown corpus, whole and by category
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(
        categories="religion"
    ),
    "English: Brown Corpus (Learned)": lambda: brown.words(
        categories="learned"
    ),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(
        categories="romance"
    ),
    "English: Brown Corpus (Humor)": lambda: brown.words(
        categories="humor"
    ),
    # English: other corpora
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    # Other languages
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(
        files="hindi.pos"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """Holds a queue, a ``CollocationsModel`` built on it, and a ``Tk`` object.

    NOTE(review): only the start of the class is visible in this excerpt;
    ``_init_top`` and ``_init_menubar`` are defined outside the visible part.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, model and ``Tk`` object, then run the init helpers."""
        # The same queue object is stored on the view and given to the model.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        # Presumably tkinter's root window -- confirm against the file's imports.
        self.top = Tk()
        # The Tk object is passed to _init_top before _init_menubar is called.
        self._init_top(self.top)
        self._init_menubar()
Пример #13
0
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """Holds a queue, a ``CollocationsModel`` built on it, and a ``Tk`` object.

    NOTE(review): only the start of the class is visible in this excerpt;
    ``_init_top`` and ``_init_menubar`` are defined outside the visible part.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Create the queue, model and ``Tk`` object, then run the init helpers."""
        # The same queue object is stored on the view and given to the model.
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        # Presumably tkinter's root window -- confirm against the file's imports.
        self.top = Tk()
        # The Tk object is passed to _init_top before _init_menubar is called.
        self._init_top(self.top)
        self._init_menubar()