def only_stems(keywords):
    """Stem every keyword with four different stemmers.

    Each keyword is UTF-8 encoded before being handed to a stemmer.
    ``keywords`` is iterated once per stemmer.

    Returns:
        A single list holding the Porter stems, then the RSLP stems,
        then the Orengo stems and finally the Savoy stems.
    """
    porter = PorterStemmer()
    orengo = OrengoStemmer()
    savoy = SavoyStemmer()
    rslp = RSLPStemmer()

    # Output order differs from construction order: Porter, RSLP, Orengo, Savoy.
    stem_functions = (
        porter.getWordStem,
        rslp.stem,
        orengo.getWordStem,
        savoy.getWordStem,
    )

    all_stems = []
    for stem_word in stem_functions:
        all_stems.extend(
            stem_word(keyword.encode('utf8')) for keyword in keywords
        )
    return all_stems
def stem(caller, word):
    """Stem *word* according to the language declared on *caller*.

    The language is read from ``caller.lang`` and defaults to ``"en"``
    when the attribute is missing.

    * ``"en"``: delegates to ``porter2.stem``.
    * ``"pt"``: uses a module-level ``OrengoStemmer`` that is imported
      and created on first use.
    * anything else: *word* is returned unchanged.
    """
    global _orengostemmer

    lang = getattr(caller, "lang", "en")
    if lang == "en":
        return porter2.stem(word)
    if lang != "pt":
        # No stemmer for this language: hand the word back untouched.
        return word

    if _orengostemmer is None:
        # Imported lazily so ptstemmer is only needed for Portuguese input.
        from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
        _orengostemmer = OrengoStemmer()
    return _orengostemmer.getWordStem(word)
def __setup(self):
    """Load the annotated data set and build the feature matrices.

    Reads ``Data_sets/<file_name>.csv``. Every line is split on ``;``:
    field 0 is the text and field 1 is the class label, which must be
    convertible to ``int``. The text is lower-cased, cleaned with the
    ``Utils`` helpers (marks, mentions, links) and stemmed token by token
    when the configured language is ``'en'`` or ``'pt'``.

    Side effects: appends the processed phrases to the corpus, fits both
    vectorizers on it and stores the example count, feature-vector
    length, ``X``, ``X_binary``, ``y``, ``y_int`` and the set of classes
    on the instance.

    Raises:
        IndexError: if a line contains no ``;`` separator.
        ValueError: if a label is not an integer.
    """
    utl = Utils()
    stemmer = OrengoStemmer()
    stemmer.enableCaching(1000)

    # Choose the per-token stemming function once instead of re-testing
    # the language for every token.
    if self.__lang == 'en':
        stem_token = stem
    elif self.__lang == 'pt':
        # The stemmer object is not callable; stemming goes through
        # getWordStem().
        stem_token = stemmer.getWordStem
    else:
        stem_token = None

    annotation = []
    annotation_int = []
    # The context manager closes the file even if a line fails to parse.
    with open("Data_sets/" + self.__file_name + ".csv", "r") as input_file:
        for line in input_file:
            vec = line.split(';')
            label = vec[1].replace('\n', '')
            annotation.append(label)
            annotation_int.append(int(label))

            text = vec[0].lower()
            text = utl.remove_marks(text)
            text = utl.replace_mentions(text)
            text = utl.delete_links(text)

            tokens = text.split(' ')
            if stem_token is not None:
                tokens = [stem_token(token) for token in tokens]
            # Every token is preceded by a blank, so the phrase keeps a
            # leading space exactly as before.
            phrase = ' ' + ' '.join(tokens)
            self.__corpus.append(phrase.replace('\n', ''))

    self.__number_of_examples = len(self.__corpus)

    # fit_transform() is called for its fitting side effect; the matrices
    # that are stored come from the transform() calls below.
    self.__vectorizer.fit_transform(self.__corpus)
    # NOTE(review): looks like a scikit-learn vectorizer; newer releases
    # rename this to get_feature_names_out() - confirm the pinned version.
    feature_list = self.__vectorizer.get_feature_names()
    self.__vectorizer_binary.fit_transform(self.__corpus)
    self.__feature_vector_len = len(feature_list)

    self.__X = self.__vectorizer.transform(self.__corpus)
    self.__X_binary = (
        self.__vectorizer_binary.transform(self.__corpus).toarray().tolist()
    )
    self.__y = annotation
    self.__y_int = annotation_int
    self.__set_of_classes = set(annotation)
def _getStemmerObject(self, language="pt-br", approach="orengo"):
    """Return the stemmer matching *approach*, rebuilding it only on change.

    When *approach* equals the currently stored ``stemmerType`` the cached
    ``self.stemmer`` is returned as is. Otherwise ``stemmerType`` is
    updated and a new stemmer is created for ``"orengo"``, ``"porter"`` or
    ``"savoy"``. For any other value the previous stemmer object is kept
    (and returned) even though ``stemmerType`` has been overwritten.

    ``language`` is accepted but not used by this method.
    """
    if approach == self.stemmerType:
        return self.stemmer

    self.stemmerType = approach
    if approach == "orengo":
        self.stemmer = OrengoStemmer()
    elif approach == "porter":
        self.stemmer = PorterStemmer()
    elif approach == "savoy":
        self.stemmer = SavoyStemmer()
    return self.stemmer
# -*- coding: LATIN-1 -*-
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities


def main():
    """Run the PTStemmer demo: stem two sample words and print the results."""
    stemmer = OrengoStemmer()  # or PorterStemmer or SavoyStemmer
    stemmer.enableCaching(1000)

    # Words in the set returned by fileToSet() are passed to ignore().
    ignored_words = PTStemmerUtilities.fileToSet("")
    stemmer.ignore(ignored_words)

    # First sample: the stem is printed with its diacritics removed.
    stemmed = stemmer.getWordStem("ciências")
    print(PTStemmerUtilities.removeDiacritics(stemmed))

    # Second sample: the stem is printed as returned.
    print(stemmer.getWordStem("extremamente"))


if __name__ == '__main__':
    main()
import os import json from flask import Flask, jsonify, request from ptstemmer.implementations.OrengoStemmer import OrengoStemmer from ptstemmer.implementations.SavoyStemmer import SavoyStemmer from ptstemmer.implementations.PorterStemmer import PorterStemmer app = Flask(__name__) stemmer = OrengoStemmer() stemmer.enableCaching(1000) #Optional @app.route('/') def main(): return ''' <html> <head><title>Stemming Words</title></head> <body> <p> <h3>Saiba mais...</h3> <ul> <li><b>Para testar acesse a rota:</b> /steam?word=digite_a_palavra_desejada</li> <li> <a href='https://github.com/ednilsonmcs/apistemmer'>Repo no Git</a> </li> <li> <a href='https://www.linkedin.com/in/ednilsonmcs/'>Meu linkedin</a> </li> </ul> </p> </body>
def __init__(self):
    """Initialise with the Orengo stemmer selected and no stop words."""
    # Stemming defaults: the type tag names the stemmer created here.
    self.stemmerType = "orengo"
    self.stemmer = OrengoStemmer()

    # Stop-word defaults: empty list, ASCII-only flag switched off.
    self.stopWordsOnlyASCIICharacteres = False
    self.stopWords = []
#!/usr/bin/env python
"""
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
"""
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer

if __name__ == "__main__":
    # Demo: stem one sample word with the Orengo stemmer.
    # PorterStemmer() and SavoyStemmer() are imported above as alternatives.
    s = OrengoStemmer()
    s.enableCaching(1000)
    # "a" and "e" are handed to ignore() before stemming.
    s.ignore(["a", "e"])
    # Call form of print: valid on both Python 2 and Python 3, whereas the
    # Python 2 print statement is a SyntaxError on Python 3.
    print(s.getWordStem("extremamente"))
text = text.replace("Ò","o") text = text.replace("Õ","o") text = text.replace("Ô","o") text = text.replace("Ú","u") text = text.replace("Ù","u") text = text.replace("Û","u") text = text.replace("Ü","u") return text if __name__ == "__main__": utl = Utils() input_file_name = 'hcr-train' lang = 'en' stemmer = OrengoStemmer() stemmer.enableCaching(1000) input_file = open("Data_sets/"+input_file_name+".csv", "r") corpus = [] annotation = [] for line in input_file: vec = line.split(';') annotation.append(vec[1].replace('\n','')) vec[0] = vec[0].lower() vec[0] = utl.remove_marks(vec[0]) vec[0] = utl.replace_mentions(vec[0]) vec[0] = utl.delete_links(vec[0]) vec[0] = stem(vec[0])
#!/usr/bin/env python
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

if __name__ == '__main__':
    # Demo: stem two sample words with the Orengo stemmer.
    s = OrengoStemmer() #or PorterStemmer or SavoyStemmer
    s.enableCaching(1000)
    # NOTE(review): the empty path looks like a placeholder for a
    # stop-word file - confirm what fileToSet("") returns.
    s.ignore(PTStemmerUtilities.fileToSet(""))
    stem = s.getWordStem("ciências")
    # Call form of print: valid on both Python 2 and Python 3, whereas the
    # Python 2 print statement is a SyntaxError on Python 3.
    print(PTStemmerUtilities.removeDiacritics(stem))
    print(s.getWordStem("extremamente"))