Example #1
def test_singleton(self):
    excpt_thrown = False
    try:
        Cogroo()
    except TypeError as e:
        excpt_thrown = True
        self.assertEqual(Cogroo.MSG_CALL_INSTANCE, '%s' % e)
    self.assertTrue(excpt_thrown)
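
For reference, a minimal sketch of the Instance()-style singleton this test exercises; the message text and internals below are assumptions, only the names Instance() and MSG_CALL_INSTANCE come from the examples on this page:

class Singleton:
    # assumed pattern: direct construction is forbidden and raises TypeError
    MSG_CALL_INSTANCE = 'Call Instance() instead of the constructor'  # hypothetical text

    _instance = None

    def __init__(self):
        raise TypeError(self.MSG_CALL_INSTANCE)

    @classmethod
    def Instance(cls):
        # create the single instance lazily, bypassing __init__
        if cls._instance is None:
            cls._instance = cls.__new__(cls)
        return cls._instance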
Example #2
def test_lemmatize():
    cogroo = Cogroo.Instance()
    phrase_to_lemmatize = 'o entendimento das metas propostas oferece uma interessante oportunidade para verificação ' \
                          'do impacto na agilidade decisória '
    expected_result = 'o entender de o meta propor oferecer um interessante oportunidade para verificação de o ' \
                      'impacto em o agilidade decisório'
    assert expected_result == cogroo.lemmatize(phrase_to_lemmatize)
Example #3
class CogrooSemanticizer:
    # n_tags / v_tags and the Agglutinator and ec (entity) modules are
    # assumed to be provided by the host project
    cogroo = Cogroo.Instance()

    def __init__(self, text):
        self.input_text = text
        self.entities_list = []
        self.pos_tagged_text = self.cogroo.analyze(text).sentences[0]

    def get_entities(self):
        self.entities_list = self.search_chunks()
        agglutinator = Agglutinator.Agglutinator(self.input_text,
                                                 self.entities_list)
        self.entities_list = agglutinator.agglutinate()
        self.clean_entities()

        print("\n", "-" * 20, "> CogrooSemanticizer")
        print(self.pos_tagged_text.chunks)
        for entity in self.entities_list:
            print(entity)

        return self.entities_list

    def search_chunks(self):
        for chunk in self.pos_tagged_text.chunks:
            if chunk.tag in n_tags or chunk.tag in v_tags:
                self.entities_list = self.filter_chunk(chunk)
        return self.entities_list

    def filter_chunk(self, chunk):
        '''
        Filters a chunk's tokens into entities.
        :param chunk: chunk whose tokens are inspected
        :return: the updated entities_list
        '''
        for token in chunk.tokens:

            if token.pos in n_tags:
                entity = ec.Entity(text=token.lexeme,
                                   start=token.start,
                                   end=token.end,
                                   tag='NP',
                                   pos=token.pos)
                self.entities_list.append(entity)

            if token.pos in v_tags and token.lexeme == 'vem':
                entity = ec.Entity(text=token.lexeme,
                                   start=token.start,
                                   end=token.end,
                                   tag='VP',
                                   pos=token.pos)
                self.entities_list.append(entity)

        return self.entities_list

    def clean_entities(self):
        for entity in self.entities_list:
            text = entity.text.replace('_', ' ')
            entity.text = text
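
A hedged usage sketch; illustrative only, since n_tags, v_tags and the Agglutinator and ec modules must be supplied by the host project:

semanticizer = CogrooSemanticizer('A diretoria vem acompanhando as metas propostas.')
for entity in semanticizer.get_entities():
    print(entity.text, entity.tag, entity.pos)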
Example #4
import gzip
import pickle

from cogroo_interface import Cogroo

# gap() and consecutive() are context-window feature helpers assumed to be
# defined elsewhere in the same project
def main(name):

    file = open(name, mode='r')

    cogroo = Cogroo.Instance()
    sentences = []

    for line in file:

        aux = line.strip().split("\t")

        frase = aux[0]
        sentence = aux[0].replace("'", '"').replace(":", "#").replace(
            ";", "$").replace(".", "%")
        en1 = aux[2].replace("=", "_").replace(".", "%")
        en2 = aux[5].replace("=", "_").replace(".", "%")

        if int(aux[1].split(",")[0]) > int(aux[4].split(",")[0]):

            small = en2
            big = en1

        else:

            small = en1
            big = en2

        sentence = sentence.replace(en1, "en1").replace(en2, "en2")

        aux_features = []
        pos = ""
        check_en = 0
        aux_lexeme = []

        analyzer = cogroo.analyze(sentence).sentences[0].tokens
        words = []

        def pos_at(i):
            # POS tag of the token at index i, or "" when i falls outside
            # the sentence (guards the context-window features below)
            return analyzer[i].pos if 0 <= i < len(analyzer) else ""

        for idx, t in enumerate(analyzer):

            if check_en == 0 and (t.lexeme == "en1" or t.lexeme == "en2"):

                check_en = 1
                aux_lexeme.append(t.lexeme)
                t.lemma = small.replace("%", ".")
                t.lexeme = small.replace("%", ".")
                continue

            elif check_en == 1 and (t.lexeme == "en1" or t.lexeme == "en2"):

                if t.lexeme != aux_lexeme[0]:

                    t.lemma = big.replace("%", ".")
                    t.lexeme = big.replace("%", ".")
                    break

            elif check_en == 0:
                continue

            aux = t.synchunk if len(t.synchunk) <= 2 else t.synchunk[2:]

            pos += " " + t.pos
            vector = [t.lemma, t.pos, t.chunk[2:], aux]

            vector.append("1") if t.pos[0] == "v" else vector.append("0")
            vector.append("1") if t.pos == "adv" else vector.append("0")

            # POS, Lemma and Syntactic Tags (-2, -1, 1, 2)

            gap(analyzer, vector, idx, -2)
            gap(analyzer, vector, idx, -1)
            gap(analyzer, vector, idx, 1)
            gap(analyzer, vector, idx, 2)

            consecutive(analyzer, vector, idx, -2, -1)
            consecutive(analyzer, vector, idx, -1, 0)
            consecutive(analyzer, vector, idx, 0, 1)
            consecutive(analyzer, vector, idx, 1, 2)

            # Pattern based features

            vector.append("1" if pos_at(idx - 2).startswith("v") else "0")
            vector.append("1" if pos_at(idx - 1).startswith("v") else "0")
            vector.append("1" if pos_at(idx + 1).startswith("v") else "0")
            vector.append("1" if pos_at(idx + 2).startswith("v") else "0")

            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 1) == "art" and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 2) == "art"
                          and pos_at(idx + 1) == "prp"
                          and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos == "n" else "0")
            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos == "adv" else "0")
            vector.append("1" if pos_at(idx + 2) == "art"
                          and pos_at(idx + 1) == "prp"
                          and t.pos == "adv" else "0")

            # Syntactic features

            # Phrase head

            if t.lexeme not in ("o", "a", "os", "as") and t.pos in (
                    "n", "prop", "pron-det", "pron-pers",
                    "pron-indp") and "NP" in t.chunk:
                vector.append("1")
            else:
                vector.append("0")

            # Direct object

            if aux == "ACC": vector.append("1")
            else: vector.append("0")

            vector.insert(0, t.lexeme)

            aux = line.strip().split("\t")

            if aux[3] == "PER" or aux[6] == "PER": vector.append("O-PER")
            elif aux[3] == "PLC" or aux[6] == "PLC": vector.append("O-PLC")
            else: vector.append("O-O")

            aux_features.append(vector)

        for x in aux_features:

            x.append(pos[1:])
            x.append(str(len(pos[1:].split(" "))))

            words.append(x)

        en1 = en1.replace("%", ".")
        en2 = en2.replace("%", ".")
        sentences.append([words, frase, en1, en2])

    file.close()

    # Save the feature vectors

    print(sentences)

    with gzip.open('4_RelP/features_teste.txt.gz', 'wb') as f:
        pickle.dump(sentences, f)
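
Judging by the column indices used above, each tab-separated input line carries the sentence (column 0), the offsets and surface forms of the two entities (columns 1-2 and 4-5) and their types (columns 3 and 6). A hypothetical invocation, with the file name as an assumption:

if __name__ == '__main__':
    # expected layout per line:
    # sentence \t off1 \t entity1 \t type1 \t off2 \t entity2 \t type2
    main('relations_test.tsv')  # hypothetical file name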
Example #5
import csv
from cogroo_interface import Cogroo

cogroo = Cogroo.Instance()
classe = []
resposta = []
lema_list = []
analise = []
classes_pergs = []
i = 0

with open('corpus.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV)
    for row in readCSV:
        classe.append(row[3])
        resposta.append(row[2])
        pergunta_lema = str(cogroo.lemmatize(row[1]))
        pergunta_lema = pergunta_lema.lower()  # lowercase every word
        lema_list.append(str(pergunta_lema).split(' '))  # collect the lemmatized questions in a list
        #analise_perg = cogroo.analyze(pergunta_lema)
        #analise.append(analise_perg.sentences[0].tokens)  # keep each question's morphological analysis
        
        # drop stopwords and domain-specific tokens from the lemmatized question
        stopwords = {'o', 'de', ',', 'que', 'qual', 'a', 'um', '.', 'o(',
                     '?', '', 'ser', 'quem', 'em', 'por', 'algoritmo',
                     'ir', 'se', 'random_forest', 'random', 'forest', 'para'}
        lema_list[i] = [w for w in lema_list[i] if w not in stopwords]
        i += 1
Example #6
def setUp(self):
    self.cogroo = Cogroo.Instance()
Example #7
def __init__(self, plan):
    self.cogroo = Cogroo.Instance()
    self.plan = plan
Example #8
import gzip
import pickle

from cogroo_interface import Cogroo

# gap() and consecutive() are context-window feature helpers assumed to be
# defined elsewhere in the same project
def main(name):

    file = open(name, "r")

    cogroo = Cogroo.Instance()
    sentences = []

    for line in file:

        aux = line.strip().split("\t")
        '''print("Frase original -",aux[0])
        print("EN1 -", aux[2])
        print("EN2 -",aux[7])'''

        # PREPROCESSING

        frase = aux[0]
        sentence = aux[0].replace("'", '"').replace(":", "#").replace(
            ";", "$").replace(".", "%")
        en1 = aux[2].replace("=", "_").replace(".", "%")
        rel_num = aux[4].split(",")
        rel = aux[5].split(" ")
        en2 = aux[7].replace("=", "_").replace(".", "%")

        if rel_num[0] == 'None':
            rel_start = 0
        else:
            rel_start = int(rel_num[0])

        if int(aux[1].split(",")[0]) > int(aux[6].split(",")[0]):

            small = en2
            big = en1

        else:

            small = en1
            big = en2

        sentence = sentence.replace(en1, "en1").replace(en2, "en2")
        '''print("Frase modificada -",sentence, "\n")
        ajuda = "Sem relação" if rel == [''] else " ".join(rel)
        print(ajuda, "\n")'''

        aux_rel = []
        aux_features = []
        aux_lexeme = []
        pos = ""
        check_en = 0
        count_rel = 0

        analyzer = cogroo.analyze(sentence).sentences[0].tokens
        words = []

        def pos_at(i):
            # POS tag of the token at index i, or "" when i falls outside
            # the sentence (guards the context-window features below)
            return analyzer[i].pos if 0 <= i < len(analyzer) else ""

        for idx, t in enumerate(analyzer):

            if check_en == 0 and (t.lexeme == "en1" or t.lexeme == "en2"):

                check_en = 1
                aux_lexeme.append(t.lexeme)
                t.lemma = small.replace("%", ".")
                t.lexeme = small.replace("%", ".")
                continue

            elif check_en == 1 and (t.lexeme == "en1" or t.lexeme == "en2"):

                if t.lexeme != aux_lexeme[0]:

                    t.lemma = big.replace("%", ".")
                    t.lexeme = big.replace("%", ".")
                    break

            elif check_en == 0:
                continue

            # CLASS LABEL

            if count_rel == len(rel):
                classification = 0

            elif t.lexeme == rel[count_rel]:

                classification = 1
                count_rel += 1

            else:
                classification = 0

            # FEATURES

            aux = t.synchunk if len(t.synchunk) <= 2 else t.synchunk[2:]

            pos += " " + t.pos
            vector = [t.lemma, t.pos, t.chunk[2:], aux]

            vector.append("1") if t.pos[0] == "v" else vector.append("0")
            vector.append("1") if t.pos == "adv" else vector.append("0")

            # POS, Lemma and Syntactic Tags (-2, -1, 1, 2)

            gap(analyzer, vector, idx, -2)
            gap(analyzer, vector, idx, -1)
            gap(analyzer, vector, idx, 1)
            gap(analyzer, vector, idx, 2)

            consecutive(analyzer, vector, idx, -2, -1)
            consecutive(analyzer, vector, idx, -1, 0)
            consecutive(analyzer, vector, idx, 0, 1)
            consecutive(analyzer, vector, idx, 1, 2)

            # Pattern based features

            vector.append("1" if pos_at(idx - 2).startswith("v") else "0")
            vector.append("1" if pos_at(idx - 1).startswith("v") else "0")
            vector.append("1" if pos_at(idx + 1).startswith("v") else "0")
            vector.append("1" if pos_at(idx + 2).startswith("v") else "0")

            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 1) == "art" and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 2) == "art"
                          and pos_at(idx + 1) == "prp"
                          and t.pos[0] == "v" else "0")
            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos == "n" else "0")
            vector.append("1" if pos_at(idx + 1) == "prp" and t.pos == "adv" else "0")
            vector.append("1" if pos_at(idx + 2) == "art"
                          and pos_at(idx + 1) == "prp"
                          and t.pos == "adv" else "0")

            # Syntactic features

            # Phrase head

            if t.lexeme not in ("o", "a", "os", "as") and t.pos in (
                    "n", "prop", "pron-det", "pron-pers",
                    "pron-indp") and "NP" in t.chunk:
                vector.append("1")
            else:
                vector.append("0")

            # Direct object

            if aux == "ACC": vector.append("1")
            else: vector.append("0")

            vector.insert(0, t.lexeme)
            vector.insert(0, str(classification))

            aux = line.strip().split("\t")

            if aux[3] == "PER" or aux[8] == "PER": vector.append("O-PER")
            elif aux[3] == "PLC" or aux[8] == "PLC": vector.append("O-PLC")
            else: vector.append("O-O")

            aux_features.append(vector)
            # print(classification, t.lexeme)

        for x in aux_features:

            x.append(pos[1:])
            x.append(str(len(pos[1:].split(" "))))

            words.append(x)
        '''print("\n")'''

        en1 = en1.replace("%", ".")
        en2 = en2.replace("%", ".")
        sentences.append([words, frase, en1, en2])

    file.close()

    # Save the feature vectors

    with gzip.open('features_treino.txt.gz', 'wb') as f:
        pickle.dump(sentences, f)
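
This training variant additionally reads the relation descriptor: its offsets from column 4 and its surface tokens from column 5, with the second entity shifted to columns 6-8. A hypothetical invocation, with the file name as an assumption:

if __name__ == '__main__':
    # expected layout per line:
    # sentence \t off1 \t entity1 \t type1 \t rel_off \t relation \t off2 \t entity2 \t type2
    main('relations_train.tsv')  # hypothetical file name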
Example #9
def __init__(self):
    self.cogroo = Cogroo.Instance()