예제 #1
0
def only_stems(keywords):
    """Stem every keyword with four stemmers and return all results in one list.

    Each keyword is UTF-8 encoded before being handed to a stemmer.  The
    returned list is the concatenation, in this order, of the Porter stems,
    the RSLP stems, the Orengo stems and the Savoy stems of ``keywords``.
    """
    porter = PorterStemmer()
    orengo = OrengoStemmer()
    savoy = SavoyStemmer()

    rslp = RSLPStemmer()

    # The tuple order fixes the order of the groups in the returned list.
    stem_functions = (
        porter.getWordStem,
        rslp.stem,
        orengo.getWordStem,
        savoy.getWordStem,
    )

    all_stems = []
    for stem_word in stem_functions:
        all_stems.extend(
            [stem_word(keyword.encode('utf8')) for keyword in keywords]
        )
    return all_stems
예제 #2
0
def stem(caller, word):
	"""Return the stem of ``word`` for the language configured on ``caller``.

	The language is read from ``caller.lang`` and defaults to ``"en"`` when
	the attribute is missing.  ``"en"`` uses ``porter2.stem``; ``"pt"`` uses
	a module-level OrengoStemmer that is imported and created on first use;
	any other language returns ``word`` unchanged.
	"""
	global _orengostemmer

	language = getattr(caller, "lang", "en")

	if language == "en":
		return porter2.stem(word)

	if language == "pt":
		# Import and build the Portuguese stemmer only when first needed.
		if _orengostemmer is None:
			from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
			_orengostemmer = OrengoStemmer()
		return _orengostemmer.getWordStem(word)

	# No stemmer for this language: hand the word back untouched.
	return word
예제 #3
0
    def __setup(self):
        """Load the annotated data set and build the corpus and feature matrices.

        Reads ``Data_sets/<file_name>.csv`` line by line.  Each line is split
        on ``;``: the first field is the text, the second field is the class
        label (stored both as a string and as an ``int``).  The text is
        lower-cased, cleaned with the ``Utils`` helpers, stemmed word by word
        according to ``self.__lang`` (``'en'`` or ``'pt'``; any other value
        leaves the words unstemmed) and appended to ``self.__corpus``.

        Both vectorizers are then fitted on the corpus and the resulting
        matrices, labels and class set are stored on the instance.

        Raises:
            IndexError: if a line has no ``;`` separator.
            ValueError: if the label field is not an integer.
        """
        utl = Utils()
        stemmer = OrengoStemmer()
        stemmer.enableCaching(1000)

        lang = self.__lang
        annotation = []
        annotation_int = []

        # ``with`` releases the file handle even if a row fails to parse.
        with open("Data_sets/"+self.__file_name+".csv", "r") as input_file:
            for line in input_file:
                vec = line.split(';')
                label = vec[1].replace('\n', '')
                annotation.append(label)
                annotation_int.append(int(label))

                text = vec[0].lower()
                text = utl.remove_marks(text)
                text = utl.replace_mentions(text)
                text = utl.delete_links(text)

                words = []
                for elem in text.split(' '):
                    if lang == 'en':
                        elem = stem(elem)
                    elif lang == 'pt':
                        # The stemmer object is not callable; stemming goes
                        # through getWordStem().
                        elem = stemmer.getWordStem(elem)
                    words.append(elem)

                # Every word is prefixed by a single space, so the phrase
                # keeps the leading space the corpus entries have always had.
                phrase = ' ' + ' '.join(words)
                self.__corpus.append(phrase.replace('\n', ''))

        self.__number_of_examples = len(self.__corpus)

        # Fit both vectorizers; the matrices themselves are produced by the
        # transform() calls below, so the fit_transform() results are unused.
        self.__vectorizer.fit_transform(self.__corpus)
        feature_list = self.__vectorizer.get_feature_names()
        self.__vectorizer_binary.fit_transform(self.__corpus)

        self.__feature_vector_len = len(feature_list)
        self.__X = self.__vectorizer.transform(self.__corpus)
        self.__X_binary = self.__vectorizer_binary.transform(self.__corpus).toarray().tolist()
        self.__y = annotation
        self.__y_int = annotation_int
        self.__set_of_classes = set(annotation)
예제 #4
0
    def _getStemmerObject(self, language="pt-br", approach="orengo"):
        """Return the stemmer for ``approach``, creating it only when it changes.

        Args:
            language: accepted for interface compatibility; it is not used
                when selecting the stemmer.
            approach: ``"orengo"``, ``"porter"`` or ``"savoy"``.

        Returns:
            ``self.stemmer``.  When ``approach`` is already the active type,
            or is not one of the known names, the current stemmer is returned
            unchanged.
        """
        # Nothing to do when the requested algorithm is already active.
        if approach == self.stemmerType:
            return self.stemmer

        if approach == "orengo":
            self.stemmer = OrengoStemmer()
        elif approach == "porter":
            self.stemmer = PorterStemmer()
        elif approach == "savoy":
            self.stemmer = SavoyStemmer()
        else:
            # Unknown approach: keep the current stemmer AND its recorded
            # type, so stemmerType never names a stemmer that was not built.
            return self.stemmer

        # Record the new type only after the stemmer was created successfully.
        self.stemmerType = approach
        return self.stemmer
예제 #5
0
# -*- coding: LATIN-1 -*-
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

if __name__ == '__main__':
    # Small demo: stem two Portuguese words and print the results.
    # PorterStemmer or SavoyStemmer can be used in place of OrengoStemmer.
    stemmer = OrengoStemmer()
    stemmer.enableCaching(1000)

    # Words loaded by fileToSet() are handed to the stemmer's ignore list.
    ignored_words = PTStemmerUtilities.fileToSet("")
    stemmer.ignore(ignored_words)

    # The first stem is printed with its diacritics removed.
    science_stem = stemmer.getWordStem("ciências")
    print(PTStemmerUtilities.removeDiacritics(science_stem))

    extremely_stem = stemmer.getWordStem("extremamente")
    print(extremely_stem)
    print(s.getWordStem("extremamente"))
예제 #6
0
import os
import json
from flask import Flask, jsonify, request
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
# A single Orengo stemmer is created once, at import time, at module level.
stemmer = OrengoStemmer()
# Optional. NOTE(review): 1000 is presumably the stem-cache capacity - confirm
# against the PTStemmer documentation.
stemmer.enableCaching(1000)  # Optional


@app.route('/')
def main():
    return '''
            <html>
                <head><title>Stemming Words</title></head>
                <body>
                <p>
                    <h3>Saiba mais...</h3>
                    <ul>
                        <li><b>Para testar acesse a rota:</b> /steam?word=digite_a_palavra_desejada</li>
                        <li>
                            <a href='https://github.com/ednilsonmcs/apistemmer'>Repo no Git</a>
                        </li>
                        <li>
                            <a href='https://www.linkedin.com/in/ednilsonmcs/'>Meu linkedin</a>
                        </li>
                    </ul>
                </p>
                </body>
예제 #7
0
 def __init__(self):
     """Start with the Orengo stemmer and an empty stop-word configuration."""
     # Default stemming algorithm; stemmerType records which one is active.
     self.stemmerType = "orengo"
     self.stemmer = OrengoStemmer()

     # Stop-word settings: no stop words, ASCII-only flag switched off.
     self.stopWordsOnlyASCIICharacteres = False
     self.stopWords = list()
예제 #8
0
#!/usr/bin/env python
"""
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
"""
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer

if __name__ == "__main__":
    # Minimal PTStemmer demo: stem one Portuguese word and print it.
    # PorterStemmer() or SavoyStemmer() can be used instead of OrengoStemmer().
    s = OrengoStemmer()
    s.enableCaching(1000)
    # Words passed to ignore() are put on the stemmer's ignore list.
    s.ignore(["a", "e"])
    # Call form of print: valid on Python 3 and, for a single argument,
    # produces the same output on Python 2 as the old print statement.
    print(s.getWordStem("extremamente"))
예제 #9
0
        text = text.replace("Ò","o")
        text = text.replace("Õ","o")
        text = text.replace("Ô","o")
        text = text.replace("Ú","u")
        text = text.replace("Ù","u")
        text = text.replace("Û","u")
        text = text.replace("Ü","u")
        return text

if __name__ == "__main__":
    # Script entry point: reads Data_sets/<input_file_name>.csv and
    # preprocesses every line (lower-casing, Utils clean-up, stemming).

    utl = Utils()

    input_file_name = 'hcr-train'
    # NOTE(review): `lang` and `stemmer` are not used in the visible part of
    # this script; stem() is applied to every line regardless of `lang`.
    lang = 'en'
    stemmer = OrengoStemmer()
    stemmer.enableCaching(1000)

    # NOTE(review): the file is never closed in the visible code.
    input_file = open("Data_sets/"+input_file_name+".csv", "r")

    # NOTE(review): `corpus` is not filled in the visible code - the snippet
    # may be truncated here.
    corpus = []
    annotation = []

    for line in input_file:
        # Each line is split on ';': field 0 is the text, field 1 the label.
        vec = line.split(';')
        annotation.append(vec[1].replace('\n',''))
        vec[0] = vec[0].lower()
        vec[0] = utl.remove_marks(vec[0])
        vec[0] = utl.replace_mentions(vec[0])
        vec[0] = utl.delete_links(vec[0])
        # stem() receives the whole cleaned text of the line, not single words.
        vec[0] = stem(vec[0])
예제 #10
0
#!/usr/bin/env python
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

if __name__ == '__main__':
    # Small PTStemmer demo: stem two Portuguese words and print the results.
    # PorterStemmer or SavoyStemmer can be used in place of OrengoStemmer.
    s = OrengoStemmer()
    s.enableCaching(1000)
    # Words loaded by fileToSet() are handed to the stemmer's ignore list.
    s.ignore(PTStemmerUtilities.fileToSet(""))
    stem = s.getWordStem("ciências")
    # Call form of print: valid on Python 3 and, for a single argument,
    # produces the same output on Python 2 as the old print statement.
    print(PTStemmerUtilities.removeDiacritics(stem))
    print(s.getWordStem("extremamente"))