Пример #1
0
    def filter_pos_infinitive(self, s, category_list=None, allowed=False):
        '''
        Filter the words of a string by grammatical category (POS tag) and
        reduce the kept words to a base form: 'NNS' nouns go through
        singularize(), verbs through conjugate(word, INFINITIVE) and
        adjectives through predicative().

        If allowed is set to True it only keeps POS in category_list.
        If allowed is set to False it keeps all POS except those in
        category_list.
        If category_list is empty (or None) the text is returned as is,
        without tagging or converting anything.

        POS that can be in category list:
        nouns        = ['NN', 'NNS', 'NNP', 'NNPS']
        verbs        = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        adjectives   = ['JJ','JJR','JJS']
        determiners  = ['DT']
        conjunctions = ['IN', 'CC']
        adverbs      = ['RB','RBR', 'RBS']
        modals       = ['MD']
        utterances   = ['UH']

        In:
            (s:string, category_list:list of strings, allowed:boolean)
        Out:
            (string) the kept words joined by single spaces
        '''
        # `bytes` is `str` on Python 2, so this test works on both
        # interpreters. "replace" is used because "xmlcharrefreplace" is an
        # encode-only error handler and cannot recover from decode errors.
        if isinstance(s, bytes):
            s = s.decode("utf-8", "replace")
        # Nothing to filter: skip the (potentially expensive) tagging step.
        if not category_list:
            return s
        # Mirrors the historical `allowed == False` comparison so non-bool
        # values (e.g. None) keep selecting the same mode as before.
        keep_listed = not (allowed == False)
        verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
        adjective_tags = ('JJ', 'JJR', 'JJS')
        words = []
        for pos in self.pos_tagging(s):
            # Items look like 'word:TAG'; split on the LAST colon so a word
            # that itself contains ':' (e.g. '10:30') stays in one piece.
            word, sep, tag = pos.rpartition(':')
            if not sep:
                # No tag present: treat the whole item as an untagged word.
                word, tag = pos, ''
            if (tag in category_list) != keep_listed:
                continue
            if tag == 'NNS':
                word = singularize(word)
            elif tag in verb_tags:
                word = conjugate(word, INFINITIVE)
            elif tag in adjective_tags:
                word = predicative(word)
            words.append(word)
        return u' '.join(words)
    def pos_tagging_infinitive(self, s):
        '''
        Grammatical category of each word a.k.a. Part-of-Speech (pos) tagging,
        but transforming adjectives to predicative form, singularizing plural
        nouns and conjugating verbs to infinitive form. The emitted tag is
        normalised as well: 'NNS' -> 'NN', any verb tag -> 'VB', any
        adjective tag -> 'JJ'.

        e.g. ella:PRP maneja:VBD carros:NNS rojos:JJ
              PRP: left untouched -----------> ella
              VBD: Verb in past tense -------> manejar (infinitive)
              NNS: Noun in plural -----------> carro (singularized)
              JJ: adjective -----------------> rojo (predicative)
        In:
              (s:string) string text
        Out:
              (list) list with grammatical categories in the form 'word:category'
        '''
        # Decode BEFORE parsing: the conversion used to run after parse(s)
        # had already consumed the text, so it had no effect at all.
        # `bytes` is `str` on Python 2; "replace" is used because
        # "xmlcharrefreplace" is an encode-only error handler.
        if isinstance(s, bytes):
            s = s.decode("utf-8", "replace")
        verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
        adjective_tags = ('JJ', 'JJR', 'JJS')
        tagged = []
        # parse(s).split() is iterated as groups of tokens (presumably
        # sentences), each token exposing the word at [0] and its tag at [1].
        for group in parse(s).split():
            for token in group:
                word, pos = token[0], token[1]
                if pos == 'NNS':
                    tagged.append(singularize(word) + ":NN")
                elif pos in verb_tags:
                    tagged.append(conjugate(word, INFINITIVE) + ":VB")
                elif pos in adjective_tags:
                    tagged.append(predicative(word) + ":JJ")
                else:
                    tagged.append(word + ':' + pos)
        return tagged
Пример #3
0
def convertir(cambiar):
    """Return a dict with the singular words pluralised and vice versa.

    cambiar is a dict of the form
    {'s': [singular words], 'p': [plural words]}. The result stores the
    pluralised 's' words under 'p' and the singularised 'p' words under 's'.
    """
    return {
        'p': [pluralize(palabra) for palabra in cambiar['s']],
        's': [singularize(palabra) for palabra in cambiar['p']],
    }
Пример #4
0
def parse_NP(words):
    """Extract (number, gender, noun) from a noun phrase.

    The phrase is matched against three patterns, from longest to shortest,
    and a match only counts when it spans every word of the phrase:

      * determiner determiner noun/adjective  (e.g. "todos los perros")
      * determiner noun/adjective             (e.g. "el perro")
      * noun/adjective                        (e.g. "verde")

    Returns a tuple (number, gender, noun) where number is 's' or 'p',
    gender is 'm' or 'f' and noun is the singularized head word. When a
    determiner is present, number and gender come from its membership in
    plural_words / female_words, and learn_gender() is called with the
    determiner and the noun.

    If no pattern matches, a message is printed and the process exits.
    """
    number = "s"
    noun = ""
    gender = 'm'

    t = Word_list_to_Text(words)

    # Example: todos los perros
    m = pattern_match("{DT} {DT} {JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        # The noun must be computed BEFORE learn_gender(); it used to be
        # passed while still being the empty string.
        noun = singularize(m.group(3)[0].string)
        determiner = m.group(2)[0].string
        learn_gender(determiner, noun)
        if determiner in plural_words:
            number = 'p'
        if determiner in female_words:
            gender = 'f'
        return number, gender, noun

    # Example: el perro
    m = pattern_match("{DT} {JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        noun = singularize(m.group(2)[0].string)
        determiner = m.group(1)[0].string
        learn_gender(determiner, noun)
        if determiner in plural_words:
            number = 'p'
        if determiner in female_words:
            gender = 'f'
        return number, gender, noun

    # Example: verde
    m = pattern_match("{JJ*|NN*}", t)
    if m and len(m) == len(t.words):
        noun = m.group(1)[0].string
        # A word that pluralize() leaves unchanged is taken to be plural.
        if noun == pluralize(noun):
            number = "p"
        noun = singularize(noun)

        # TODO: gender

        return number, gender, noun

    # Single-argument print() produces the same output on Python 2 and 3.
    print("parse_NP() : not found %s" % (t,))
    # NOTE(review): exits with status 0 even though parsing failed.
    sys.exit(0)
Пример #5
0
def pword(text):
    """Return a base form of *text*: its lemma for verbs, its singular otherwise.

    The decision uses the first character of the second '/'-separated field
    of parse(text) (presumably the POS tag of the first word): 'V' selects
    lemma(), anything else (noun or adjective) selects singularize().
    """
    tag_initial = parse(text).split('/')[1][0]
    return lemma(text) if tag_initial == 'V' else singularize(text)
Пример #6
0
def convertir(dicc):
    """Swap the grammatical number of the word lists in *dicc*.

    Words listed under 's' are pluralised and stored under 'p'; words listed
    under 'p' are singularised and stored under 's'. Any other key is
    ignored, and a key that is absent from the input is absent from the
    result too.
    """
    dicc_nuevo = {}
    for clave in dicc:
        if clave == 's':
            dicc_nuevo['p'] = [pluralize(palabra) for palabra in dicc[clave]]
        elif clave == 'p':
            dicc_nuevo['s'] = [singularize(palabra) for palabra in dicc[clave]]
    return dicc_nuevo
Пример #7
0
def convertir(dic):
    """Swap the grammatical number of the word lists in *dic*.

    The list under 's' is pluralised and stored under 'p'. The list under
    every other key is singularised and stored under 's' (so with several
    non-'s' keys the last one iterated wins).
    """
    diccionario = {}
    for clave, palabras in dic.items():
        if clave != 's':
            diccionario['s'] = [singularize(palabra) for palabra in palabras]
            continue
        diccionario['p'] = [pluralize(palabra) for palabra in palabras]
    return diccionario
Пример #8
0
 def test_singularize(self):
     # Check the accuracy of the singularization algorithm: for each noun
     # lemma in the word-form corpus, singularizing its longest recorded form
     # (presumably the plural) must give back the lemma in over 93% of cases.
     from pattern.db import Datasheet
     forms_by_lemma = {}
     rows = Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv"))
     for w, lemma, tag, f in rows:
         if tag != "n":
             continue
         forms_by_lemma.setdefault(lemma, []).append(w)
     hits = 0
     for sg, forms in forms_by_lemma.items():
         # First longest form, in corpus order.
         longest = max(forms, key=len)
         if es.singularize(longest) == sg:
             hits += 1
     total = len(forms_by_lemma)
     self.assertTrue(float(hits) / total > 0.93)
     print("pattern.es.singularize()")
Пример #9
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm: for each noun
     # lemma in the word-form corpus, singularizing its longest recorded form
     # (presumably the plural) must give back the lemma in over 93% of cases.
     from pattern.db import Datasheet
     test = {}
     for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
         # Only noun rows take part; group the word forms by lemma.
         if tag == "n":
             test.setdefault(lemma, []).append(w)
     i, n = 0, 0
     for sg, pl in test.items():
         pl = sorted(pl, key=len, reverse=True)[0]
         if es.singularize(pl) == sg:
             i += 1
         n += 1
     self.assertTrue(float(i) / n > 0.93)
     # print() with one argument behaves the same on Python 2 and Python 3;
     # the former print statement was a syntax error on Python 3.
     print("pattern.es.singularize()")
Пример #10
0
def modifica_linea(dialogo):
    """Return *dialogo* with verbs in the infinitive and nouns singularized.

    The text is tagged with parse() and split on single spaces; each piece
    is expected to look like 'word/TAG/...'. Words tagged exactly 'VB' go
    through conjugate(word, INFINITIVE), words tagged exactly 'NN' go
    through singularize(), and every other word is kept unchanged. The
    resulting words are joined with single spaces.
    """
    # NOTE(review): only the exact tags 'VB' and 'NN' are handled; other
    # verb/noun tags (e.g. 'VBD', 'NNS') fall through unchanged.
    salida = []
    for token in parse(dialogo).split(' '):
        campos = token.split('/')
        forma, etiqueta = campos[0], campos[1]
        if etiqueta == 'VB':
            forma = conjugate(forma, INFINITIVE)
        elif etiqueta == 'NN':
            forma = singularize(forma)
        salida.append(forma)
    return " ".join(salida)
Пример #11
0
	def tagLemma(self, word_old):
		"""Return a normalised form of word_old based on its POS tag.

		Plural nouns ('NNS') are singularized; verbs and modals are
		conjugated to the infinitive, falling back to the word itself when
		conjugate() gives no result; anything else is returned unchanged.

		If tag() yields several (word, pos) pairs, the result of the last
		one is returned. If it yields none, word_old is returned as is
		(previously this raised because the result was never assigned).
		"""
		x = word_old
		for word, pos in tag(word_old):
			if pos == "NNS":  # plurals
				x = singularize(word)
			elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:  # verbs to infinitive
				# conjugate() sometimes fails to produce a form; keep the
				# original word in that case.
				x = conjugate(word, INFINITIVE) or word
			else:
				x = word
		return x
Пример #12
0
 def unify_tokens(self):
     """
     Normalises self._tokens in place: nouns are singularized, verbs are
     conjugated to the infinitive and adjectives are put in predicative form.
     The category of token i is the first character of self._analysis[i][1]
     ('n', 'v' or 'a'); tokens of any other category are left untouched.
     :return: Tokens
     :raises Exception: if self._analysis is None
     """
     if self._analysis is None:
         raise Exception("It's necessary execute first analize")
     for idx in range(len(self._tokens)):
         category = self._analysis[idx][1][0]
         token = self._tokens[idx]
         if category == 'n':
             self._tokens[idx] = singularize(token)
         elif category == 'v':
             self._tokens[idx] = conjugate(token, INFINITIVE)
         elif category == 'a':
             self._tokens[idx] = predicative(token)
     return self._tokens
Пример #13
0
 def unify_tokens(self):
     """
     Rewrites each entry of self._tokens according to the first character of
     self._analysis[i][1]: 'n' -> singularize, 'v' -> conjugate to the
     infinitive, 'a' -> predicative form. Other tokens stay as they are.
     :return: Tokens
     :raises Exception: if self._analysis is None
     """
     if self._analysis is None:
         raise Exception("It's necessary execute first analize")
     for position, token in enumerate(self._tokens):
         kind = self._analysis[position][1][0]
         if kind == 'n':
             self._tokens[position] = singularize(token)
         elif kind == 'v':
             self._tokens[position] = conjugate(token, INFINITIVE)
         elif kind == 'a':
             self._tokens[position] = predicative(token)
     return self._tokens
	def stemming(self,tokens):
		"""Return the tokens reduced to a base form according to their POS tag.

		The tokens are joined with spaces and tagged with tag(). Words tagged
		'NN' or 'JJ' go through self.stemmer.stemming(), verbs through
		lemma(), 'NNS' words through singularize(); every other word is kept
		as returned by the tagger. One output word is produced per tagged
		word.
		"""
		stem_tags = ("NN", "JJ")
		verb_tags = ("VB", "VBG", "VBP", "VBZ", "VBN", "VBD")
		words = []
		for word, pos in tag(" ".join(tokens)):
			if pos in stem_tags:
				word = self.stemmer.stemming(word)
			elif pos in verb_tags:
				word = lemma(word)
			elif pos == "NNS":
				word = singularize(word)
			words.append(word)
		return words
Пример #15
0
def _getSingularize(word, language):
    """Singularize *word* with the pattern module matching *language*.

    Supported codes are "es", "en", "it", "fr" and "de"; any other value
    falls back to the English implementation.
    """
    import pattern.en as pattern_en  # @UnresolvedImport
    import pattern.es as pattern_es  # @UnresolvedImport
    import pattern.fr as pattern_fr  # @UnresolvedImport
    import pattern.de as pattern_de  # @UnresolvedImport
    import pattern.it as pattern_it  # @UnresolvedImport

    by_language = (
        ("es", pattern_es),
        ("en", pattern_en),
        ("it", pattern_it),
        ("fr", pattern_fr),
        ("de", pattern_de),
    )
    for code, module in by_language:
        if language == code:
            return module.singularize(word)
    return pattern_en.singularize(word)
Пример #16
0
def convertir_corto(cambiar):
    """Return {'p': pluralised cambiar['s'], 's': singularised cambiar['p']}."""
    plurales = [pluralize(palabra) for palabra in cambiar['s']]
    singulares = [singularize(palabra) for palabra in cambiar['p']]
    return {'p': plurales, 's': singulares}
Пример #17
0
def pluralize_singularize():
    """Print the result of pluralize('gato') and then of singularize('gatos')."""
    plural = pluralize('gato')
    print(plural)
    singular = singularize('gatos')
    print(singular)
Пример #18
0
def validacion(teclado, diccionario):
    """Validate one typed word against the Spanish Wiktionary and record it.

    Meant to be called inside a loop, once per word entered.

    The word is looked up; if its "Español" section has no "Etimología"
    child, the singularized word is looked up instead (and the recorded key
    is re-pluralised afterwards). A word is valid when the article has an
    "Español" section with an "Etimología" child AND one of the known type
    sections (adjective / verb / noun variants).

    diccionario is updated in place, in every case, with
        diccionario[word] = {"Definición": <text>, "Tipo": <type>}
    where the type is normalised to "Adjetivo", "Sustantivo" or "Verbo".
    Both values are empty strings when the word is not valid.

    Returns {"info_palabra": diccionario, "validez": <bool>}.
    """
    web = Wiktionary(language="es")
    articulo = web.search(teclado)  # article for the typed word
    cambio = False  # True when the word was switched from plural to singular
    # Defaults for the "not valid" outcome. Setting them up front guarantees
    # they are always bound, whichever path fails below.
    validado = False
    definicion = ""
    clasificacion = ""
    # (section title, normalised type), in lookup priority order.
    tipos = (
        ("Adjetivo", "Adjetivo"),
        ("Verbo", "Verbo"),
        ("Verbo intransitivo", "Verbo"),
        ("Forma verbal", "Verbo"),
        ("Verbo transitivo", "Verbo"),
        ("Forma adjetiva", "Adjetivo"),
        ("Sustantivo", "Sustantivo"),
        ("Sustantivo masculino", "Sustantivo"),
        ("Sustantivo femenino", "Sustantivo"),
    )
    try:
        espanol = [x for x in articulo.sections if x.title == "Español"]
        etimologia = [
            x for x in espanol[0].children if x.title == "Etimología"
        ]
        if not etimologia:
            # No etymology for the word as typed: retry with its singular.
            teclado = singularize(teclado)
            cambio = True
            articulo = web.search(teclado)
            espanol = [x for x in articulo.sections if x.title == "Español"]
            etimologia = [
                x for x in espanol[0].children if x.title == "Etimología"
            ]
        texto = etimologia[0].content
        titulos = [x.title for x in espanol[0].children]
        for titulo, tipo in tipos:
            if titulo in titulos:
                definicion = texto
                clasificacion = tipo
                validado = True
                break
    except (AttributeError, IndexError):
        # AttributeError: a search returned None (no article).
        # IndexError: the article lacks the "Español" or "Etimología" section.
        # Either way the word stays not valid, with the empty defaults.
        pass
    if cambio:
        teclado = pluralize(teclado)

    # Word together with its definition and type.
    diccionario[teclado] = {
        "Definición": definicion,
        "Tipo": clasificacion
    }
    # When the word is not valid only "validez" is meaningful to the caller.
    return {"info_palabra": diccionario, "validez": validado}
def buscar(palabra, dic):
    """Look *palabra* up and store its type and description in *dic*.

    The Spanish Wiktionary is queried first (with the singularized word).
    When the article exists and is a Spanish entry, its type and description
    are parsed from the article; if that parsing fails, or there is no
    usable article, the type comes from pattern (when the word is known to
    it) and/or from the user, and the description is always asked from the
    user.

    On success dic[palabra] = {'tipo': ..., 'descripcion': ...} is stored and
    dic is returned. Returns False when the user cancels one of the prompts
    or when the word is found neither in Wiktionary nor in pattern. Returns
    None when every request attempt raised.
    """
    engine = wik(language='es')
    articulo = None
    # Two attempts; a failed request pauses 0.1 s before the next one.
    for _intento in range(0, 2):
        sg.PopupAnimated('loading.gif', alpha_channel=0.5)
        try:
            articulo = engine.article(singularize(palabra))
        except Exception:
            time.sleep(0.1)
        else:
            # Found in Wiktionary AND a Spanish word (the lookup can also
            # match words from other languages).
            # NOTE(review): this second request is outside any try block.
            if articulo is not None and engine.article(palabra).sections[
                    1].title == 'Español':
                try:
                    seccion = articulo.sections[3].content
                    tipo = parsear_tipo(seccion)
                    descripcion = parsear_descripcion(seccion)
                    dic[palabra] = {'tipo': tipo, 'descripcion': descripcion}
                    reportar(palabra + ' está en wiktionary.')
                    sg.PopupAnimated(image_source=None)
                    return dic
                except Exception:
                    # In Wiktionary, but type/definition could not be parsed.
                    sg.PopupAnimated(image_source=None)
                    if onPattern(palabra):
                        # Take the type from pattern.
                        tipo = clasificar(singularize(palabra))
                        if not esValido(tipo):
                            tipo = agregarTipo()
                            if not tipo:
                                return False
                        descripcion = agregarDescripcion()
                        if not descripcion:
                            return False
                        dic[palabra] = {
                            'tipo': tipo,
                            'descripcion': descripcion
                        }
                        reportar(palabra + ' está en pattern.')
                        return dic
                    else:
                        tipo = agregarTipo()
                        if not tipo:
                            return False
                        descripcion = agregarDescripcion()
                        if not descripcion:
                            return False
                        dic[palabra] = {
                            'tipo': tipo,
                            'descripcion': descripcion
                        }
                        reportar(palabra +
                                 ' no está ni en wiktionary o pattern.')
                        return dic
            elif onPattern(palabra):
                # No usable Wiktionary article: try pattern instead.
                sg.PopupAnimated(image_source=None)
                tipo = clasificar(palabra)
                if not esValido(tipo):
                    tipo = agregarTipo()
                    if not tipo:
                        return False
                descripcion = agregarDescripcion()
                if not descripcion:
                    return False
                reportar(palabra +
                         ' con tipo y descripción generada por el usuario.')
                # Same 'descripcion' key as every other branch (this one
                # used to store it under 'descripción').
                dic[palabra] = {'tipo': tipo, 'descripcion': descripcion}
                return dic
            else:
                sg.PopupAnimated(image_source=None)
                reportar(palabra + ' no está en wiktionary ni en pattern.')
                return False
    # Every attempt raised: close the loading animation, as all other exits do.
    sg.PopupAnimated(image_source=None)
    return None
Пример #20
0
# Encoding = UTF-8
from pattern.es import singularize, pluralize

# Quick demo: print the result of singularizing 'caballos'.
print(singularize('caballos'))


def cambiapalabras(diccionario):
    """Return a dict with the words of *diccionario* switched in number.

    *diccionario* is expected to have two keys: 's', whose list holds
    singular words, and 'p', whose list holds plural words. In the result
    the 's' list holds those words pluralised and the 'p' list holds those
    words singularised (each converted list stays under its ORIGINAL key).
    Any other key is ignored.
    """
    cambiado = {}
    for key, palabras in diccionario.items():
        if key == 's':
            cambiado[key] = [pluralize(palabra) for palabra in palabras]
        elif key == 'p':
            cambiado[key] = [singularize(palabra) for palabra in palabras]

    return cambiado


# Sample input: singular words under 's', plural words under 'p'.
cambiar = {
    's': ['gato', 'caballo', 'silla'],
    'p': ['informaticas', 'psicologas', 'ingenieras']
}

print(cambiapalabras(cambiar))
# Output recorded by the author:
# {'p': ['informatica', 'psicologa', 'ingeniera'], 's': ['gatos', 'caballos', 'sillas']}