Пример #1
0
def only_stems(keywords):
    """Stem every keyword with four different stemmers and pool the results.

    Each keyword is UTF-8 encoded and then handed to PorterStemmer,
    RSLPStemmer, OrengoStemmer and SavoyStemmer.

    Args:
        keywords: Iterable of text strings (each must provide ``.encode``).
            One-shot iterables such as generators are supported.

    Returns:
        A single list holding the Porter stems, then the RSLP stems, then
        the Orengo stems, then the Savoy stems; each group follows the
        order of ``keywords``.
    """
    porter = PorterStemmer()
    orengo = OrengoStemmer()  # not named `os`, to avoid shadowing the stdlib module
    savoy = SavoyStemmer()
    rslp = RSLPStemmer()

    # Encode once and materialize: avoids re-encoding each word per stemmer
    # and keeps a generator argument from being exhausted after the first pass.
    encoded = [word.encode('utf8') for word in keywords]

    stems = [porter.getWordStem(word) for word in encoded]
    stems.extend(rslp.stem(word) for word in encoded)
    stems.extend(orengo.getWordStem(word) for word in encoded)
    stems.extend(savoy.getWordStem(word) for word in encoded)
    return stems
Пример #2
0
def stem(caller, word):
	"""Return the stem of ``word`` for the language declared by ``caller``.

	The language is read from ``caller.lang`` and defaults to ``"en"`` when
	the attribute is absent. English goes through ``porter2.stem``;
	Portuguese goes through a module-wide ``OrengoStemmer`` that is imported
	and created on first use. For any other language ``word`` is returned
	unchanged.
	"""
	global _orengostemmer

	language = getattr(caller, "lang", "en")

	if language == "en":
		return porter2.stem(word)

	if language != "pt":
		# No stemmer available for this language: pass the word through.
		return word

	if _orengostemmer is None:
		# Deferred import: ptstemmer is only loaded when Portuguese is requested.
		from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
		_orengostemmer = OrengoStemmer()
	return _orengostemmer.getWordStem(word)
Пример #3
0
# -*- coding: LATIN-1 -*-
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

if __name__ == '__main__':
    # Usage demo: stem two sample words and print the results.
    # PorterStemmer or SavoyStemmer can be substituted for OrengoStemmer.
    stemmer = OrengoStemmer()
    stemmer.enableCaching(1000)

    # Words to ignore are loaded from a file; the path is left empty here.
    ignored_words = PTStemmerUtilities.fileToSet("")
    stemmer.ignore(ignored_words)

    # First word: print its stem with diacritics removed.
    accented_stem = stemmer.getWordStem("ciências")
    print(PTStemmerUtilities.removeDiacritics(accented_stem))

    # Second word: print the stem as returned.
    plain_stem = stemmer.getWordStem("extremamente")
    print(plain_stem)
Пример #4
0
#!/usr/bin/env python
"""
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
"""
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer

if __name__ == "__main__":
    # Usage demo: stem one sample word and print the result.
    # PorterStemmer() or SavoyStemmer() can be used in place of OrengoStemmer().
    s = OrengoStemmer()
    s.enableCaching(1000)
    s.ignore(["a", "e"])
    # Single-argument print(...) gives the same output on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    print(s.getWordStem("extremamente"))
Пример #5
0
#!/usr/bin/env python
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 * 
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

if __name__ == '__main__':
    # Usage demo: stem two sample words and print the results.
    s = OrengoStemmer()  # or PorterStemmer or SavoyStemmer
    s.enableCaching(1000)
    # Words to ignore are loaded from a file; the path is left empty here.
    s.ignore(PTStemmerUtilities.fileToSet(""))
    stem = s.getWordStem("ciências")
    # Single-argument print(...) gives the same output on Python 2 and is
    # valid syntax on Python 3, unlike the print statement.
    print(PTStemmerUtilities.removeDiacritics(stem))
    print(s.getWordStem("extremamente"))