def __init__(self, review_data, review_key, id_key, aspect_manager):
    self.__name = 'produto'
    self.__aspect_manager = aspect_manager
    self.__data = {}
    self.__aspect_frequency = {}
    self.__tagger = nlpnet.POSTagger()
    self.__read_files(review_data, review_key, id_key)
Example #2
def __init__(self, name, opinions_path, aspect_manager):
    self.__name = name
    self.__aspect_manager = aspect_manager
    self.__data = {}
    self.__aspect_frequency = {}
    self.__tagger = nlpnet.POSTagger()
    self.__read_files(opinions_path)
Example #3
def represent_doc(doc):
    tagger = nlpnet.POSTagger('./pos-pt', language='pt')
    strings = []
    for sent in doc.sents:
        if sent.text.strip():
            sentence = represent_sentence_nlpnet(sent, tagger)
            strings.append(sentence)
    return '\n'.join(strings) + '\n' if strings else ''
Example #4
def load(self, filename):
    self.filename = filename
    if self.type == 'nlpnet':
        import nlpnet
        self.pos_tagger = nlpnet.POSTagger(filename)
        self.vocabulary = dict(zip(self.pos_tagger.itd.values(), range(1, len(self.pos_tagger.itd) + 1)))
        self._save_POS(filename)
    else:
        self.pos_tagger = pickle.load(open(filename, 'rb'))
        self.vocabulary = self.pos_tagger.vocabulary()
Example #5
def pos(text):
    """
    Get the part-of-speech tags for the tokens of the given text.
    :param text: str
    :return: list of sentences, each a list of (token, tag) tuples
    """
    taggerPOS = nlpnet.POSTagger()
    pos_result = taggerPOS.tag(text)

    return pos_result
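
The snippets above assume nlpnet's trained model is already available on disk. A minimal, self-contained usage sketch (the 'pos-pt' directory and the sample sentence are placeholders, not part of any of the quoted repositories):

import nlpnet

# 'pos-pt' is a placeholder path to a local copy of the Portuguese POS model
# distributed at http://nilc.icmc.usp.br/nlpnet/
tagger = nlpnet.POSTagger('pos-pt', language='pt')

result = tagger.tag('O menino jogou a bola azul no gol')
# tag() returns one list per sentence; each sentence is a list of
# (token, tag) tuples, e.g. [[('O', 'ART'), ('menino', 'N'), ...]]
print(result)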
Example #6
    def tokenize(self, tokenize_string):
        """
        Returns the POS-tagged tokens of tokenize_string, which is just
        a normal English sentence.
        """

        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        pos_parser = nlpnet.POSTagger()

        return pos_parser.tag(tokenize_string)
Example #7
def load_tagger(language):
    print('LOAD TAGGER....')
    global tagger
    if (language == "en"):
        tagger = PerceptronTagger()
    elif (language == "pt-br"):
        print('tagger pt-br')

        import nlpnet
        path = os.path.dirname(__file__) + '/resources/pos-pt/'
        #print('directory: %s' % path)
        tagger = nlpnet.POSTagger(path, language='pt')
        """
def query_process(texto_entrada):
    # Directory containing the POS-tagging models
    data_dir = 'pos-pt'
    # Set the model directory and the language to use
    tagger = nlpnet.POSTagger(data_dir, language='pt')
    tagged_str = tagger.tag(texto_entrada)
    #print(tagged_str)

    # Keep only tokens tagged as proper noun, number, verb, adjective or noun
    tags = ['NPROP', 'NUM', 'V', 'ADJ', 'N']
    texto_consulta = []
    for sentenca in tagged_str:
        for entrada in sentenca:
            if entrada[1] in tags:
                texto_consulta.append(entrada[0])

    return str(texto_consulta)
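
An illustrative call, assuming the 'pos-pt' model is installed (the sentence and the expected output are only indicative, since the exact tags depend on the trained model):

consulta = query_process('Maria comprou dois livros caros ontem')
# Only NPROP, NUM, V, ADJ and N tokens pass the filter, so the result is
# roughly "['Maria', 'comprou', 'dois', 'livros', 'caros']"; adverbs such
# as 'ontem' are dropped.
print(consulta)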
Example #9
    def __search_representative_words(self, aspect):
        ''' Search representative words using POS tags or word frequencies '''
        tagger = nlpnet.POSTagger()

        for id_cluster, data in self.__clusters[aspect].items():
            size_cluster = len(data['sentences'])
            if size_cluster == 1:
                words = self.__sentence_list[data['sentences']
                                             [0]]['clean_text']
                words_tags = tagger.tag(" ".join(words))[0]
                representative_words = [
                    word for (word, tag) in words_tags
                    if tag == "N" or tag == "ADJ"
                ]
                self.__clusters[aspect][id_cluster][
                    'representative_words'] = representative_words
            else:
                words = []
                size_cluster /= 2

                for id_sentence in data['sentences']:
                    words += self.__sentence_list[id_sentence]['clean_text']

                frequency_words = FreqDist(words)

                # iterate in descending frequency order so the early break is valid
                for word, frequency in frequency_words.most_common():
                    if frequency > size_cluster:
                        self.__clusters[aspect][id_cluster][
                            'representative_words'].append(word)
                    else:
                        break

            self.__clusters[aspect][id_cluster][
                'representative_words'] = self.__search_top_words(
                    data['sentences'], self.__clusters[aspect][id_cluster]
                    ['representative_words'])
        self.__join_clusters(aspect)
Example #10
from nltk.tokenize import RegexpTokenizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import nlpnet
import os
import codecs
import utils
import re
import networkx
import json
#import matplotlib.pyplot as plt

depth_adt = -1
tokenizer = RegexpTokenizer(r'\w+')
nlpnet.set_data_dir(str("../resource//nlpnet_data/"))
tagger = nlpnet.POSTagger()


class Opizera_Summarizer(object):
    '''
    Class that implements Opizera method
    '''
    def __init__(self, name, opinions_path, aspect_manager):
        self.__name = name
        self.__aspect_manager = aspect_manager
        self.__graph = networkx.MultiDiGraph()
        self.__aspect_tuple_list = []
        self.__qualifier_list = {}
        self.__dirmoi_list = {}
        self.__moi_list = {}
        self.__cluster_list = {}
Example #11
def __init__(self):
    self.tagger = nlpnet.POSTagger(
        os.path.dirname(os.path.realpath(__file__)) + "/pos-pt",
        language='pt')
Example #12
"""

from sklearn.linear_model import LinearRegression
from utils.assin_eval import read_xml, eval_similarity
from gensim.models import KeyedVectors
from xml.dom import minidom
from numpy import array
from os import path
import pickle
import argparse
import nlpnet

DATA_DIR = '../../../datasets/sentence_similarity/data/'
TEST_DIR = path.join(DATA_DIR, 'assin-test-gold/')

tagger = nlpnet.POSTagger('../../../corpora/pos-pt', language='pt')


def set_part_of_speech_tag(sent):
    sent = str(sent).replace("[", "")
    sent = sent.replace("]", "")
    sent = sent.replace("\'", "")
    sent = sent.replace(",", "")
    tags = tagger.tag(str(sent))
    tags = str(tags).replace("\', \'", "|")
    tags = tags.replace("(", "")
    tags = tags.replace(")", "")
    tags = tags.replace("[[", "")
    tags = tags.replace("]]", "")
    tags = tags.replace(",", "")
    tags = tags.replace("\'", "")
Example #13
def make_pos(self, path='./data/tweentsentbr/resources/pos-pt'):
    nlpnet.set_data_dir(path)
    self.tagger = nlpnet.POSTagger()
Example #14
import nltk
import nlpnet
import streamlit as st
from nltk.stem import SnowballStemmer


def convert(items):
    return tuple(items)


nltk.download('punkt')
nltk.download('stopwords')
example_sent = ''

ps = SnowballStemmer("portuguese")

st.write('Olá, cidadão!')

#example_sent = input("O que voce precisa?").lower()
example_sent = st.text_input("O que voce precisa?")
if example_sent:
    tagger = nlpnet.POSTagger('pos-pt', language='pt')

    tags = tagger.tag(example_sent)
    tags_final = []
    print('--------------')
    for item in convert(tags):
        for i in item:
            # print(i)
            if i[1] == 'N':
                #tags_final.append(ps.stem(i[0]))
                if i[0].lower() not in ("brasil", "brasileiro",
                                        "brasileira"):
                    if i[0].endswith('s'):
                        print("----- tirando plural")
                        stp = i[0][:-1]
                        tags_final.append(stp)
Example #15
def clusterArgInicial(idtese):

    #Cursors and functions for connecting to the Debate de Teses database
    cursor = connection.cursor()
    cursor2 = connection.cursor()

    cursor.execute(
        "select distinct `usr`.`primeironome` as `name`, `arg`.`argumento` AS `posicionamentoinicial` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr`  where ((`arg`.`tese_idtese` = "
        + idtese +
        "  ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))"
    )
    cursor2.execute("select tese from tese where idtese=" + idtese)

    #Variable and function to handle HTML tags and ISO-encoded accented characters
    h = HTMLParser.HTMLParser()

    #data returned by the database query
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()

    #lists for handling the initial data
    usu = []
    posInicial = []
    dados = []
    tese = []

    #lists with POS-tagged data
    tag_posInicial = []
    tag_comAce_posInicial = []

    #lists with the data after stopword removal
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_tagPosInicial = []  #tagged text without stopwords
    sw_tagcomAce_posInicial = []  #tagged text WITH ACCENTS and without stopwords

    #lists with the data after stemming
    st_posInicial = []
    st_tese = []
    st_tagPosInicial = []  #tagged text, without stopwords and stemmed
    st_tagcomAce_posInicial = []  #tagged text WITH ACCENTS, without stopwords and stemmed

    #############################################################################################################
    #LIST WITH THE INITIAL POSITIONS AFTER NORMALIZATION
    posInicial_Normalizado = []
    normalizacao = []

    #############################################################################################################
    #Case folding

    for d in dadosSql:
        dados.append([
            re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
            re.sub('<[^>]*>', '', h.unescape(d[1])).lower()
        ])

    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())

    #Put the initial-position texts into a separate list
    for i in dados:
        usu.append(i[0].upper())
        #list with the initial positions fully lowercased
        posInicial.append(i[1].lower())

#############################################################################################################
### Classify the words according to their part of speech
### using the NLPNET POS tagger
### http://nilc.icmc.usp.br/nlpnet/index.html#

    tagger = nlpnet.POSTagger()

    semAce_posInicial = []  #stores posInicial without accents, punctuation, web addresses or numbers
    comAce_posInicial = []  #stores posInicial WITH accents, without punctuation, web addresses or numbers

    for i in posInicial:
        semAce_posInicial.append(
            removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i)))))))

    for i in semAce_posInicial:
        tag_posInicial.append(tagger.tag(i))

    for i in posInicial:
        comAce_posInicial.append(
            removePontuacao(removeNum(removeSE(removeEndWeb((i))))))

    for i in comAce_posInicial:
        tag_comAce_posInicial.append(tagger.tag(i))

#############################################################################################################
#ONLY FOR TESTING AND FOR USE IN THE DISSERTATION

#     pprint(semAce_posInicial)
#     pprint(comAce_posInicial)
#     exit()

#     tagg_posInicial = []
#     for texto in posInicial:
#         tagg_posInicial.append(tagger.tag(texto))
#
#     print "posInicial"
#     pprint(posInicial)
#
#     print "tagg_posInicial"
#     pprint(tagg_posInicial)

#############################################################################################################

#############################################################################################################
### STOPWORD REMOVAL
### Removal of terms according to the NLTK stopword list
### Removal of terms tagged as articles, verbs, adverbs, etc...

    for i in usu:
        aux_usu.append(removeStopWords(i))

    for i in tese:
        sw_tese.append(removeStopWords(i))

    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))

    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))

    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))

####################################################################################################################################
# Apply the RSLP stemmer to strip affixes from Portuguese words
# Stripping affixes from the posInicial and tese texts

    stemmer = RSLPStemmer()

    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)

        st_posInicial.append(string_aux)

    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)

        st_tese.append(string_aux)

    for i in range(len(sw_tagPosInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagPosInicial[i])):
            aux = stemmer.stem(sw_tagPosInicial[i][j][0])
            etiqueta = sw_tagPosInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)

        st_tagPosInicial.append(auxST)

    for i in range(len(sw_tagcomAce_posInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagcomAce_posInicial[i])):
            aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0])
            etiqueta = sw_tagcomAce_posInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)

        st_tagcomAce_posInicial.append(auxST)

####################################################################################################################################
### TERM NORMALIZATION REFERS TO THE TECHNIQUE OF REPLACING SYNONYMOUS WORDS, I.E. WORDS WITH SIMILAR MEANING,
### BY A SINGLE REPRESENTATIVE TERM IN THE ANALYSIS CORPUS. THIS INCREASES THE DEGREE OF SIMILARITY BETWEEN
### THE ANALYZED TEXTS WHEN STATISTICAL MEASURES SUCH AS COSINE SIMILARITY OR EUCLIDEAN DISTANCE ARE USED.
####################################################################################################################################
### THE NORMALIZATION WAS BUILT ON THE DATA MADE AVAILABLE BY THE TeP 2.0 PROJECT FROM NILC/USP
### http://143.107.183.175:21480/tep2/index.htm
###
### FILE FORMAT
### NUM1. [Type] {synonymous terms} <NUM2>
### 263. [Verbo] {consentir, deixar, permitir} <973>
### NUM1 = LINE NUMBER REFERENCE FOR THE SYNONYM ENTRY
### NUM2 = LINE NUMBER REFERENCE FOR THE ANTONYM ENTRY (OPPOSITE MEANING)
####################################################################################################################################

#open the file with the synonymy relations (linhaWordNet terms) and antonymy relations (opposite terms)
#the file contains only terms tagged as nouns, adjectives and verbs
    base_tep = codecs.open(
        os.path.join(os.path.dirname(__file__), '../base_tep2/base_tep.txt'),
        'r', 'UTF8')
    #     dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    #variable holding the whole file contents in memory
    #do not print this variable, it is HUGE!!!
    wordNet = base_tep.readlines()

    #close the file
    base_tep.close()

    ####################################################################################################################################
    ## NORMALIZATION IS DONE BASED ON THE WORD-FORMATION STEMS
    ## RSLP IS APPLIED FIRST, AND THE SIMILAR TERMS ARE THEN LOOKED UP IN THE BASE
    ## WITHIN BASE_TEP THE TERMS WERE ALSO REDUCED TO THEIR FORMATION STEMS
    ## THE DICTIONARY HOLDS THE REFERENCE TO THE LINE WHERE THE SYNONYMOUS TERMS ARE
    ## THE TERMS ARE ANALYZED WITH THEIR ACCENTS, SO THAT RSLP IS APPLIED CORRECTLY
    ####################################################################################################################################

    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    st_WordNetV = []  ##stores number, type and stems of the synonyms - VERBS ONLY
    st_WordNetN = []  ##stores number, type and stems of the synonyms - NOUNS ONLY
    st_WordNetA = []  ##stores number, type and stems of the synonyms - ADJECTIVES ONLY
    st_WordNetO = []  ##stores number, type and stems of the synonyms - OTHERS ONLY

    for linhaWordnet in wordNet:
        listaAux = []
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)

        if tipo[0] == "Substantivo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetN.append(listaAux)

        elif tipo[0] == "Verbo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetV.append(listaAux)

        elif tipo[0] == "Adjetivo":
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetA.append(listaAux)
        else:
            listaAux.append(num)
            listaAux.append(tipo)

            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetO.append(listaAux)

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type='callgrind')

    ####################################################################################################################################
    ### THE ANALYSIS IS DONE ON THE TEXT WITHOUT REMOVING THE ACCENTS,
    ### BECAUSE REMOVING THEM HARMS THE REDUCTION TO THE FORMATION STEM (THE RSLP STEP)
    ### THE TESTS SHOWED THIS TO BE THE BETTER APPROACH, SINCE OUR TEXTS ARE SHORT
    ### AND WE NEED TO GET AS CLOSE AS POSSIBLE WITHOUT CONSIDERING THEIR MEANINGS AND/OR CONTEXTS
    ####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV,
                                       st_WordNetO, st_tagcomAce_posInicial)

    ###############################################################
    # Put the normalized texts into a one-dimensional list
    ###############################################################
    stringNorm = ""
    auxNorm = []

    for i in range(len(normalizacao)):
        auxNorm = normalizacao[i]

        for x in range(len(auxNorm)):
            stringNorm = stringNorm + " " + auxNorm[x]

        posInicial_Normalizado.append(stringNorm)
        stringNorm = ""

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type='callgrind')

    ####################################################################################################################################

    #     print "posInicial"
    #     pprint(posInicial)
    #
    #     print "comAce_posInicial"
    #     pprint(comAce_posInicial)
    #
    #     print "tag_comAce_posInicial"
    #     pprint(tag_comAce_posInicial)
    #
    #     print "sw_tagcomAce_posInicial"
    #     pprint(sw_tagcomAce_posInicial)
    #
    #     print "st_tagcomAce_posInicial"
    #     pprint(st_tagcomAce_posInicial)

    #     print "posInicial_Normalizado"
    #     print len(posInicial_Normalizado)
    #     pprint(posInicial_Normalizado)

    #     exit()
    ####################################################################################################################################

    #function return value - used in views.py to feed the debate.html template
    #parameters that should be displayed in the debate.html template
    return [
        st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese,
        posInicial_Normalizado
    ]
Example #16
    def use_nlpnet(self, base_string, test_string, pattern_arg):
        """
        Main interface method from the NLPNET class to the rest of
        the program.
        """

        # Setting up the nlpnet parser
        nlpnet.set_data_dir(self.get_data_dir_path())
        dependency_parser = nlpnet.DependencyParser()
        pos_parser = nlpnet.POSTagger()

        # Getting the passed patterns
        patterns = pattern_arg

        # Parsing the base_string
        base_parse = dependency_parser.parse(base_string)
        base_blob = TextBlob(base_string)
        base_sentences = base_blob.sentences
        base_sentence_info = []

        for index in range(0, len(base_parse)):
            # Grabbing sentence information
            raw_data = str(base_sentences[index])
            pos_sentence = pos_parser.tag(str(base_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(
                base_parse[index].tokens, base_parse[index].labels)
            """
            # Displaying information for debugging purposes
            #print "***BASE***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence    : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( base_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( base_parse[ index ].labels )
            #print "[ Subject ]     : " + subject
            #print "[ Verb ]        : " + verb
            #print "[ Object ]      : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in base_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False

                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                base_sentence_info.append(
                    [subject, verb, object, [], raw_data])

        # Parsing the test_string
        test_parse = dependency_parser.parse(test_string)
        test_blob = TextBlob(test_string)
        test_sentences = test_blob.sentences
        test_sentence_info = []

        for index in range(0, len(test_parse)):
            # Grabbing sentence information
            raw_data = str(test_sentences[index])
            pos_sentence = pos_parser.tag(str(test_sentences[index]))
            subject, verb, object, prepositional_phrases = self.identify_sentence_parts_nlpnet(
                test_parse[index].tokens, test_parse[index].labels)
            """
            #print "***TEST***"
            #print "Raw Sentence     : " + raw_data
            #print "POS Sentence    : " + str( pos_sentence )
            #print "[ Tokens ]       : " + str( test_parse[ index ].tokens )
            #print "[ Labels ]       : " + str( test_parse[ index ].labels )
            #print "[ Subject ]     : " + subject
            #print "[ Verb ]        : " + verb
            #print "[ Object ]      : " + object
            #print "[ Prep Phrases ] : " + str( prepositional_phrases )
            """

            # Deciding whether the sentence/pattern should be added
            add_sentence = True
            for sentence in test_sentence_info:
                if sentence != []:
                    if sentence[len(sentence) - 1] == raw_data:
                        add_sentence = False

                        break

            # If the sentence should be added to the possible patterns, add it
            if add_sentence:
                test_sentence_info.append(
                    [subject, verb, object, [], raw_data])

        # Returning the patterns found in the text
        return self.identify_common_patterns(base_sentence_info,
                                             test_sentence_info, patterns)
Example #17
    def __init__(self, nlpnet_model_dir=''):

        if nlpnet_model_dir != '':
            nlpnet.set_data_dir(nlpnet_model_dir)
            self.tagger = nlpnet.POSTagger()
Example #18
    def load_tagger(self):
        if not self._data_dir:
            self._data_dir = config['NLPNET_DATA_DIR']

        nlpnet.set_data_dir(self._data_dir)
        self._tagger = nlpnet.POSTagger()
Example #19
import nltk
import unidecode
from nltk.tokenize import TweetTokenizer
import nlpnet

nltk.download('averaged_perceptron_tagger')

tknzr = TweetTokenizer()

text = "CARTAO DE PULSO TELEFONICO (FIXO) CEDULA DE DINHEIRO NACIONAL Celular CRLV PENEIRA  Substância:COCAINA TESOURA  Veículo:HONDA BIZ 125 ES Placa:HSU2058  Veículo:VOLKSWAGEN GOL 1.6 RALLYE Placa:NRN0843"
unaccented_string = unidecode.unidecode(text)
tokenizado = tknzr.tokenize(unaccented_string)
print(tokenizado)

#textoParaPos = word_tokenize(unaccented_string)
tageado = nltk.pos_tag(tokenizado)
print('tagueado', tageado)

tagger = nlpnet.POSTagger('C:/Users/mateu/AppData/Roaming/nltk_data',
                          language='pt')
print('nlpnet', tagger.tag(unaccented_string))
Example #20
import getopt
import sys

import nlpnet

nlpnet.set_data_dir(CONFIG.get('attributes', 'setdatadir'))

TEXT = ''
METHOD = ''

try:
    OPTS, ARGS = getopt.getopt(sys.argv[1:], "ht:m:", ["text=", "method="])
except getopt.GetoptError:
    sys.exit(1)
for opt, arg in OPTS:
    if opt == '-h':
        print('nlpnet2go.py -t <"text to be analyzed"> -m <method [\'pos\'] OR [\'srl\']>')
        print('Eg.: python nlpnet2go.py -t "teste do edward" -m pos')
        sys.exit()
    elif opt in ("-t", "--text"):
        TEXT = arg
    elif opt in ("-m", "--method"):
        METHOD = arg


if METHOD == "pos":
    TAGGER = nlpnet.POSTagger()
    print TAGGER.tag(TEXT)
elif METHOD == "srl":
    TAGGER = nlpnet.SRLTagger()
    SENT = TAGGER.tag(TEXT)[0]
    print SENT.arg_structures
else:
    print sys.argv[1:], "Invalid Tagger method operator. Only 'pos' OR 'srl' allowed."
# print sys.argv[1:]
Example #21
def call_nlpnet(snt):
    tagger = nlpnet.POSTagger(ROOT+'/model_nlpnet/', language='pt')
    return tagger.tag(snt)[0]
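
Example #21 reloads the model on every call, which is expensive. A minimal variant that builds the tagger once and reuses it (ROOT and the model path are taken from the snippet above; the caching helper itself is an illustrative addition, not part of the original repository):

_TAGGER = None

def call_nlpnet_cached(snt):
    global _TAGGER
    # Build the POSTagger only on the first call, then reuse it.
    if _TAGGER is None:
        _TAGGER = nlpnet.POSTagger(ROOT + '/model_nlpnet/', language='pt')
    return _TAGGER.tag(snt)[0]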