def __init__(self):
    tsents = mac_morpho.tagged_sents()
    tsents = [[(w.lower(), t) for (w, t) in sent] for sent in tsents
              if sent]
    tagger0 = nltk.DefaultTagger('N')
    tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
    self.tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)
def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
    tagged_unigrams_by_topic = {}
    train_sents = mac_morpho.tagged_sents()[:5000]
    tagger = UnigramTagger(train_sents)
    for k, v in dict_of_sentences_by_topic.items():
        # batch_tag() was renamed tag_sents() in NLTK 3
        tagged_unigrams_by_topic[k] = tagger.tag_sents(v)
    return tagged_unigrams_by_topic
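
A minimal usage sketch for a backoff chain like the one above (assumes nltk and the mac_morpho corpus data are installed; the sentence is an arbitrary illustration):

import nltk
from nltk.corpus import mac_morpho

tsents = [[(w.lower(), t) for (w, t) in sent]
          for sent in mac_morpho.tagged_sents() if sent]
tagger0 = nltk.DefaultTagger('N')
tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)
# tokens are lowercased to match the lowercased training data
print(tagger.tag('o menino comprou um livro novo'.split()))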
Example #3
    def __preparar_tagger(self):

        nome_arquivo_tagger = './cache/postagger.pickle'

        if os.path.exists(nome_arquivo_tagger):
            logging.debug("Loading the already-trained POS tagger from " +
                          nome_arquivo_tagger)
            with open(nome_arquivo_tagger, 'rb') as arquivo:
                self.tagger = pickle.load(arquivo)

        else:
            logging.debug("Training the POS tagger.")
            # tsents = floresta.tagged_sents()
            tsents = mac_morpho.tagged_sents()
            tsents = [[(w.lower(), self.__simplify_tag(t)) for (w, t) in sent]
                      for sent in tsents if sent]
            tagger0 = nltk.DefaultTagger('n')
            tagger1 = nltk.UnigramTagger(tsents, backoff=tagger0)
            tagger2 = nltk.BigramTagger(tsents, backoff=tagger1)
            # tagger3 = nltk.PerceptronTagger(tsents)
            self.tagger = tagger2

            logging.debug("Saving the trained POS tagger to " +
                          nome_arquivo_tagger)
            with open(nome_arquivo_tagger, 'wb') as arquivo:
                pickle.dump(self.tagger, arquivo)
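
One caveat worth noting: the save branch opens './cache/postagger.pickle' for writing, which fails if the cache directory does not yet exist; a minimal guard using only the standard library:

import os
os.makedirs('./cache', exist_ok=True)  # ensure the cache directory exists before pickling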
Example #4
def accuracy_measure():
    basicConfig(format='%(levelname)s %(message)s', level=INFO)
    info('reading tagged sentences')
    info('simplifying tags')

    # tagged sentences
    flo_tsents = simplified_sents_floresta(floresta.tagged_sents())
    mac_tsents = mac_morpho.tagged_sents()

    # FLORESTA test and train data
    flo_size = int(len(flo_tsents) * 0.9)
    flo_train = flo_tsents[:flo_size]
    flo_test = flo_tsents[flo_size:]

    # MAC MORPHO test and train data
    mac_size = int(len(mac_tsents) * 0.9)
    mac_train = mac_tsents[:mac_size]
    mac_test = mac_tsents[mac_size:]

    no_backoff_taggers(flo_test, flo_train)
    no_backoff_taggers(mac_test, mac_train, corpus='macmorpho')

    save = not pt.check_for_taggers()

    backoff_taggers(flo_test, flo_train, save)
    backoff_taggers(mac_test, mac_train, save, corpus='macmorpho')
def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
    tagged_unigrams_by_topic = {}
    train_sents = mac_morpho.tagged_sents()[:5000]
    tagger = UnigramTagger(train_sents)
    for k, v in dict_of_sentences_by_topic.items():
        # batch_tag() was renamed tag_sents() in NLTK 3
        tagged_unigrams_by_topic[k] = tagger.tag_sents(v)
    return tagged_unigrams_by_topic
Example #6
    def run(self, corpus=Corpus.FLORESTA, force=False):
        self.should_force = force

        if corpus == Corpus.FLORESTA:
            print("\n##### Floresta Corpus #####")
            floresta_sent = floresta.tagged_sents()
            self.train("floresta", floresta_sent)
        elif corpus == Corpus.MAC_MORPHO:
            print("\n###### Mac Morpho Corpus #####")
            mac_morpho_sent = mac_morpho.tagged_sents()
            self.train("mac_morpho", mac_morpho_sent)
Example #7
def get_unigram_tagger():
    # use the mac_morpho corpus to train the tagger
    # unigram tagging only (maybe other models in the future)
    p_train = 0.9
    print("Training unigram tagger using mac_morpho corpus... %.2f train" %
          p_train)
    tagged_sents = mac_morpho.tagged_sents()
    size = int(len(tagged_sents) * p_train)
    train_sents = tagged_sents[:size]
    test_sents = tagged_sents[size:]
    uni_tagger = nltk.UnigramTagger(train_sents)
    print("Test accuracy =", uni_tagger.evaluate(test_sents))
    return uni_tagger
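
Hedged note: newer NLTK releases (3.6+) deprecate evaluate() in favor of accuracy(); a version-tolerant variant of the accuracy line above:

score = (uni_tagger.accuracy(test_sents)
         if hasattr(uni_tagger, "accuracy") else uni_tagger.evaluate(test_sents))
print("Test accuracy =", score)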
def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
    # dict() collapses each tagged sentence into a word -> tag lookup table
    # (a list comprehension, so it also works on Python 3, where map() is lazy)
    pre_model = {k: [dict(s) for s in v]
                 for k, v in tagged_unigrams_by_topic.items()}
    for k, v in pre_model.items():
        reference_model_by_topic = {}
        for i in v:
            reference_model_by_topic.update(i)
        pre_model[k] = reference_model_by_topic
    dict_model_by_topic = pre_model
    test_sents = mac_morpho.tagged_sents()[:5000]
    tagger_accuracy_by_topic = {}
    for k, v in pre_model.items():
        tagger_accuracy_by_topic[k] = UnigramTagger(
            model=pre_model[k]).evaluate(test_sents)
    return dict_model_by_topic, tagger_accuracy_by_topic
def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = []
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except Exception:
        # train the tagger on the mac_morpho corpus
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        # cache the trained tagger so later runs can reuse it
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    for iDoc in range(nDocs):  # tag each document in order
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
Example #10
def train_tagger():
    tagged_sents = mac_morpho.tagged_sents()
    tagged_sents = [[(w, simplify_tag(t)) for (w, t) in sent]
                    for sent in tagged_sents if sent]
    tagger0 = nltk.DefaultTagger("N")
    tagger1 = nltk.UnigramTagger(tagged_sents, backoff=tagger0)
    tagger2 = nltk.BigramTagger(tagged_sents, backoff=tagger1)

    with open("tagger.pkl", "wb") as output:
        pickle.dump(tagger2, output, -1)

    return tagger2
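
Loading the pickled tagger back is the mirror image; a minimal sketch, assuming tagger.pkl was produced by train_tagger() above:

import pickle

with open("tagger.pkl", "rb") as f:
    tagger = pickle.load(f)
print(tagger.tag(["isso", "é", "um", "teste"]))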
Example #11
def train_tagger():
    '''
    Train a tagger on the mac_morpho tagged corpus (must be
    available locally) and save it for future use. A total
    of 4 taggers are trained to provide backoff (1) and
    guarantee an assignment to every term.
    (1) 3-gram -> 2-gram -> 1-gram -> N (noun)
    '''
    tagged_sents = mac_morpho.tagged_sents()

    tagger0 = tag.DefaultTagger('N')
    tagger1 = tag.UnigramTagger(tagged_sents, backoff=tagger0)
    tagger2 = tag.BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = tag.NgramTagger(3, tagged_sents, backoff=tagger2)

    save_tagger(tagger3)
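
A minimal sketch of the backoff fallback described in the docstring: an out-of-vocabulary token (a made-up word here) falls all the way through to the DefaultTagger and receives 'N':

from nltk import tag
print(tag.DefaultTagger('N').tag(['palavra-inexistente']))  # [('palavra-inexistente', 'N')]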
Example #12
def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
    # dict() collapses each tagged sentence into a word -> tag lookup table
    # (a list comprehension, so it also works on Python 3, where map() is lazy)
    pre_model = {k: [dict(s) for s in v]
                 for k, v in tagged_unigrams_by_topic.items()}
    for k, v in pre_model.items():
        reference_model_by_topic = {}
        for i in v:
            reference_model_by_topic.update(i)
        pre_model[k] = reference_model_by_topic
    dict_model_by_topic = pre_model
    test_sents = mac_morpho.tagged_sents()[:5000]
    tagger_accuracy_by_topic = {}
    for k, v in pre_model.items():
        tagger_accuracy_by_topic[k] = UnigramTagger(
            model=pre_model[k]).evaluate(test_sents)
    return dict_model_by_topic, tagger_accuracy_by_topic
Example #13
def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = []
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
        print(unigram_tagger)
    except Exception:
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)

    for iDoc in range(nDocs):
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
Example #14
def filtra_palavras(palavras):
    # check whether a trained tagger already exists
    if os.path.exists("etiquetador.bin"):  # if so, just load it
        etiquetador = pickle.load(open("etiquetador.bin", "rb"))
    else:  # otherwise, train one and save it
        palavras_treinar = mac_morpho.tagged_sents()
        etiquetador = nltk.tag.UnigramTagger(palavras_treinar)
        pickle.dump(etiquetador, open("etiquetador.bin", "wb"))

    palavras_etiquetadas = etiquetador.tag(palavras)

    # build the stopword set once instead of on every iteration
    stopwords_pt = set(stopwords.words('portuguese'))
    palavras_filtradas = []
    for palavra in palavras_etiquetadas:
        if palavra[0] not in stopwords_pt:
            # drop conjunctions (KS/KC), articles (ART) and prepositions (PREP)
            if palavra[1] not in ('KS', 'KC', 'ART', 'PREP'):
                palavras_filtradas.append(palavra[0])

    return palavras_filtradas
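
A hypothetical call, assuming the NLTK stopwords and mac_morpho data are already downloaded:

import nltk
tokens = nltk.word_tokenize("o gato preto pulou sobre o muro", language="portuguese")
print(filtra_palavras(tokens))  # content words only; stopwords, articles and prepositions removed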
Example #15
def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = []
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except Exception:
        # train the tagger on the mac_morpho corpus
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        # cache the trained tagger so later runs can reuse it
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    for iDoc in range(nDocs):  # tag each document in order
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
Example #16
def get_pos_tagger(model_path, lang='pt'):
    if os.path.isfile(model_path):
        logger.info("Loading POS tagger at %s" % model_path)
        with open(model_path, 'rb') as f:
            pos_tagger = pickle.load(f)
    else:
        if lang == 'pt':
            logger.info("Training and saving portuguese POS tagger to %s" % model_path)
            tagged_sentences = mac_morpho.tagged_sents()
            tagged_sentences = [[(w, t) for (w, t) in s] for s in tagged_sentences if s]
            train = tagged_sentences
            tagger_default = nltk.DefaultTagger('N')
            tagger_unigram = nltk.UnigramTagger(train, backoff=tagger_default)
            pos_tagger = nltk.BigramTagger(train, backoff=tagger_unigram)
            with open(model_path, "wb") as f:
                pickle.dump(pos_tagger, f)
        else:
            logger.warning("Using default english POS tagger for '%s'" % lang)
            pos_tagger = EnglishPOSTagger()

    return pos_tagger
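
A hypothetical call for the function above; "postagger.pkl" is an arbitrary local path, not one mandated by the snippet:

pos_tagger = get_pos_tagger("postagger.pkl", lang="pt")
print(pos_tagger.tag(["o", "menino", "joga", "bola"]))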
Example #17
    def tagging(self, documents, savePath, language):
        nDocs = len(documents)
        documentsProcessed = []
        unigram_tagger = []
        from data_core.file_utils import FileUtils
        file_utils = FileUtils(savePath)
        try:
            unigram_tagger = file_utils.load_object('tagger_' + language,
                                                    'tagger')
        except Exception:
            if language == "pt":
                # assign train_set_lower here too so the training call
                # below also works for Portuguese
                train_set_lower = mac_morpho.tagged_sents()
            elif language == "en":
                train_set = brown.tagged_sents(tagset='universal')
                nSents = len(train_set)
                train_set_lower = []
                for iSent in range(nSents):
                    nWords = len(train_set[iSent])
                    words = []
                    for iWord in range(nWords):
                        # lowercase each word, keep its tag
                        words.append(
                            (self.text_lower_one([train_set[iSent][iWord][0]])[0],
                             train_set[iSent][iWord][1]))
                    train_set_lower.append(words)

            unigram_tagger = nltk.UnigramTagger(train_set_lower)
            file_utils.save_object(unigram_tagger, 'tagger_' + language,
                                   'tagger')

        for iDoc in range(nDocs):
            documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
        return documentsProcessed
Example #18
def train_tagger():
    '''
    An example of training a POS tagger using a
    probability-based trigram model.

    A POS tagger identifies the word class of each word.
    E.g.: "Isso é um teste" = Isso-PROSUB é-V um-ART teste-N
    (pronoun, verb, article, noun)
    '''

    # Load a Portuguese dataset whose sentences have been
    # manually tagged
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default word class; N means noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
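
A minimal usage sketch (assumes the mac_morpho data is installed; training the three n-gram models takes a while on first run):

tagger = train_tagger()
print(tagger.tag('isso é um teste'.split()))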
Example #19
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
Example #20
def TaggerOnline(tokens):
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    tagsTokens = etiq2.tag(tokens)
    return tagsTokens
Example #21
            myPhrases.append(word)
    # Repeat the process recursively so the pattern (phrase) is searched
    # for in each child of the tree, concatenating any favorable results
    for child in myTree:
        if type(child) is Tree:
            list_of_phrases = ExtractPhrases(child, phrase)
            if len(list_of_phrases) > 0:
                myPhrases.extend(list_of_phrases)
    # Return the list of patterns found
    return myPhrases

# Default tagger so that unknown words are treated as nouns (N)
etiqPadrao = DefaultTagger('N')
# Take the training set from mac_morpho's tagged_sents()
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Build the UnigramTagger with the default tagger as backoff and train it
# on the tagged mac_morpho sentences
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = input("Enter the text: ")
if coment == "default":
    coment = open("default.txt", "r").read().replace("\n", " ")
# Convert the text into tokens
tokens = nltk.word_tokenize(coment.lower())
# Tag each token of the text
tags = etiq.tag(tokens)

# Build the regular-expression parser with the patterns to look for
analiseGramatical = RegexpParser(r"""
        PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
Example #22
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(tagset='universal'),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='universal'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
Example #23
ilusao_re = re.compile(
    r"(^|[ (\[{“'])(d+-?e+-?s+-?)?i+-?l+-?u+-?s+-?[aeiouà-ü]*-?o+-?([- .!?,;\)\]\}”'_:]|$)",
    re.MULTILINE | re.IGNORECASE)
chateacao_re = re.compile(
    r"(^|[ (\[{“'])c+-?h+-?a+-?t+-?e+-?a+-?[cç]+-?[aeiouà-ü]*-?o+-?([- .!?,;\)\]\}”'_:]|$)",
    re.MULTILINE | re.IGNORECASE)
pizza_re = re.compile(
    r"(^|[ (\[{“'])p+-?i+-?(z-?)+a+-?([- .!?,;\)\]\}”'_:]|$)",
    re.MULTILINE | re.IGNORECASE)
chope_re = re.compile(
    r"(^|[ (\[{“'])c+-?h+-?o+-?p+-?e*([- .!?,;\)\]\}”'_:]|$)",
    re.MULTILINE | re.IGNORECASE)
garcom_re = re.compile(r"(^|[ (\[{“'])gar[sç]+o[nm]([- .!?,;\)\]\}”'_:]|$)",
                       re.MULTILINE | re.IGNORECASE)

tsents = mac_morpho.tagged_sents()
tagger0 = nltk.DefaultTagger('N')
tagger1 = nltk.UnigramTagger(tsents, backoff=tagger0)
tagger2 = nltk.BigramTagger(tsents, backoff=tagger1)

o = ""
with open('c:\\dev\\estagio\\analise.txt', encoding='UTF-8') as f:
    o = f.readlines()

dados = []
for line in o:
    while (link_re.search(line)):
        line = re.sub(link_re, '. ', line)

    while (data_re.search(line)):
        line = re.sub(data_re, ' (data) ', line)
Example #24
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(simplify_tags=True),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(simplify_tags=True),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(simplify_tags=True),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(simplify_tags=True),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(simplify_tags=True),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
Example #25
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(simplify_tags=True),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(simplify_tags=True),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
Example #26
    textoBrownTagSentNew.append(sentNew)

#print('segundo',textoBrownTagSentNew)
fdTag2 = nltk.FreqDist(tag for m in textoBrownTagSentNew for (word, tag) in m)
print('tags2 TAG', fdTag2.most_common())

fdTag3 = nltk.FreqDist(word for m in textoBrownTagSentNew for (word, tag) in m)
#print('tags2 WORD',fdTag3.most_common())

tagTexto(textoBrownTagSentNew)
'''
1. Extend the tagger example to a TrigramTagger and
analyze the model's accuracy.
'''

treino = mac_morpho.tagged_sents()[1000:]
teste = mac_morpho.tagged_sents()[:1000]
etiq0 = nltk.DefaultTagger('N')
etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
print('UnigramTagger', etiq1.evaluate(teste))
etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
print('BigramTagger', etiq2.evaluate(teste))
etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
print('TrigramTagger', etiq3.evaluate(teste))

doc = open('textoPT.txt', encoding='utf8')
raw = doc.read()

#texto = nltk.word_tokenize('O  mundo atual possui diversos idiomas.')
texto = nltk.word_tokenize(raw)
#print('etiq2', etiq2.tag(texto))
Example #27
    def tagging(self, documents, savePath, language):
        nDocs = len(documents)
        unigram_tagger = []
        train_set_lower = []

        try:
            unigram_tagger = file_utils.load_object('tagger_' + language,
                                                    'tagger')
        except Exception:
            if language == "pt":
                train_set = mac_morpho.tagged_sents()

            # backoff chain: default 'n' -> unigram -> bigram
            tagger0 = nltk.DefaultTagger('n')
            tagger1 = nltk.UnigramTagger(train_set, backoff=tagger0)
            tagger2 = nltk.BigramTagger(train_set, backoff=tagger1)

        # caveat: tagger2 is only defined when the except branch ran
        for iDoc in range(nDocs):
            print(tagger2.tag(documents[iDoc]))

        return ''
Example #28
def initialize_dataset(self):
    tagged_sentences = mac_morpho.tagged_sents()
    self.train = tagged_sentences[100:]
    self.test = tagged_sentences[:100]
Example #29
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
Example #30
def Tagger():
    # Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
Example #31
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(tagset='simple'),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='simple'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='simple'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='simple'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='simple'),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = '#FFF'  #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = '#F00'  #red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
Example #32
def pos_concordance(word):
    for sent_pos in mac_morpho.tagged_sents():
        sent = [s for s in sent_pos if word in s[0]]
Example #33
			#print("TESTE",pos_simple)
			candidates_simple = set(itertools.product(*pos_simple))
			candidates_med = set(itertools.product(*pos_med))
			candidates_full = set(itertools.product(*pos_full))
		else:
			candidates_simple = candidates_simple.intersection(set(itertools.product(*pos_simple)))
			candidates_med = candidates_med.intersection(set(itertools.product(*pos_med)))
			candidates_full = candidates_full.intersection(set(itertools.product(*pos_full)))
		#print("ITERTOOLS")
		#print(candidates_simple)
	return candidates_simple, candidates_med, candidates_full

sentences = [s[1] for s in inputs]

log.info("Loading Mac-Morpho Tagged Sents...")
tsents = list(mac_morpho.tagged_sents())


def simplify_tag(t):
	if "+" in t:
		t = t[t.index("+")+1:]
	
	if t == "ART":
		return "DET"
	
	return t

log.info("Simplifyng POS Tags...")
tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]

train = tsents
Example #34
    },
    'handlers': {
        'mail_admins': {
            'level': 'ERROR',
            'filters': ['require_debug_false'],
            'class': 'django.utils.log.AdminEmailHandler'
        }
    },
    'loggers': {
        'django.request': {
            'handlers': ['mail_admins'],
            'level': 'ERROR',
            'propagate': True,
        },
    }
}

import nltk
NLTK_DATAPATH = os.path.join(os.path.dirname(nltk.__file__), "data")
nltk.data.path = [NLTK_DATAPATH]
ETIQUETADOR = None

try:
    from nltk.tag import UnigramTagger
    from nltk.corpus import mac_morpho
    sentencas_treinadoras = mac_morpho.tagged_sents()[0:100]
    ETIQUETADOR = UnigramTagger(sentencas_treinadoras)
except LookupError:
    pass
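
Hedged note: the LookupError branch above fires when the mac_morpho data is absent from nltk.data.path; a one-time download into the custom path used here:

nltk.download('mac_morpho', download_dir=NLTK_DATAPATH)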

Example #35
def Tagger():
    #Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
Example #36
def TaggerOnline(tokens):
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    tagsTokens = etiq2.tag(tokens)
    return tagsTokens
Example #37
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='simple'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='simple'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='simple'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='simple'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='simple'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
Example #38
def __init__(self):
    tsents = mac_morpho.tagged_sents()
    tsents = [[(w.lower(), t) for (w, t) in sent] for sent in tsents if sent]
    tagger0 = nltk.DefaultTagger('N')
    tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
    self.tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)
Example #39
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='universal'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='universal'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='universal'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='universal'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='universal'),
           }

class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR='#FFF' #white

    #Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR='#F00' #red
    _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'

    _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey
    _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
Example #40
# pylint: disable=C0111
# pylint: disable=C0103
from nltk.corpus import mac_morpho
import nltk

tags = [t for s in mac_morpho.tagged_sents() for (p, t) in s]
frequencia = nltk.FreqDist(tags)

print(frequencia.most_common(5))
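
A short extension sketch in the same spirit: a conditional distribution of words per tag, e.g. the five most frequent nouns (assumes the corpus data is downloaded):

cfd = nltk.ConditionalFreqDist(
    (t, w.lower()) for s in mac_morpho.tagged_sents() for (w, t) in s)
print(cfd['N'].most_common(5))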
Example #41
    "Chinese: Sinica Corpus (simplified)":
    lambda: sinica_treebank.tagged_sents(tagset="universal"),
    "Dutch: Alpino Corpus":
    lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)":
    lambda: alpino.tagged_sents(tagset="universal"),
    "Hindi: Indian Languages Corpus":
    lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)":
    lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
    "Portuguese: Floresta Corpus (Portugal)":
    lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)":
    lambda: floresta.tagged_sents(tagset="universal"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)":
    lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
    lambda: mac_morpho.tagged_sents(tagset="universal"),
    "Spanish: CESS-ESP Corpus (simplified)":
    lambda: cess_esp.tagged_sents(tagset="universal"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
Example #42
        else:
            candidates_simple = candidates_simple.intersection(
                set(itertools.product(*pos_simple)))
            candidates_med = candidates_med.intersection(
                set(itertools.product(*pos_med)))
            candidates_full = candidates_full.intersection(
                set(itertools.product(*pos_full)))
        #print("ITERTOOLS")
        #print(candidates_simple)
    return candidates_simple, candidates_med, candidates_full


sentences = [s[1] for s in inputs]

log.info("Loading Mac-Morpho Tagged Sents...")
tsents = list(mac_morpho.tagged_sents())


def simplify_tag(t):
    if "+" in t:
        t = t[t.index("+") + 1:]

    if t == "ART":
        return "DET"

    return t


log.info("Simplifyng POS Tags...")
tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent] for sent in tsents
          if sent]