Exemplo n.º 1
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer.

        The PyStemmer instance is created on first use for the language returned by self.language_code() and is
        kept on the object for subsequent calls.

        :param words: Words to stem.
        :return: Lowercased stems, as returned by PyStemmer's stemWords().
        :raises McLanguageException: If the language code or the list of words is None, or if PyStemmer can't be
            initialized for the language.
        """
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Validate the input before touching it: iterating over "words" first would raise a bare TypeError for None
        # and make the check below unreachable
        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                # Chain the original exception so that the root cause stays visible in the traceback
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (
                        language_code,
                        str(ex),
                    )) from ex

        stems = self.__pystemmer.stemWords(words)

        # A count mismatch is only reported, not treated as an error; the stems are returned regardless
        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (
                    str(words),
                    str(stems),
                ))

        # Perl's Snowball implementation used to return lowercase stems
        return [stem.lower() for stem in stems]
Exemplo n.º 2
0
Arquivo: en.py Projeto: th0/test2
 def __init__(self):
     """Create the stemmer used by this object, configured with the 'porter' algorithm."""
     algorithm = 'porter'
     self.stemmer = PyStemmer(algorithm)
Exemplo n.º 3
0
from __future__ import print_function
import xml.etree.ElementTree as etree
import re, os, heapq, math, operator, string, time, sys
from collections import *
from Stemmer import Stemmer as PyStemmer
import glob

# Python 2-only idiom: reload(sys) makes sys.setdefaultencoding() available again so that the process-wide
# default string encoding can be forced to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Shared stemmer instance configured with the 'porter' algorithm
ps = PyStemmer('porter')

# The index directory is the only required command-line argument
if len(sys.argv) < 2:
    print("Needs 1 argument, the index directory")
    # Exit with a non-zero status so that callers can tell the usage error apart from a successful run
    sys.exit(1)

indexDirPth = sys.argv[1]
# NOTE: query / output file handling is currently disabled:
# qryTxtFlPth = sys.argv[2]
# outTxtFlPth = sys.argv[3]

# if not os.path.exists(outTxtFlPth):
#     with open(outTxtFlPth, 'w+'): pass
# else:
#     open(outTxtFlPth, 'w').close()

# Directory that contains the script being run (derived from sys.argv[0])
absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0]))
###########################################################################

stopwords = {}
inverted_index_file, mapping, doc_offset = [], [], []
# The handle is stored in the list and intentionally left open here; nothing in this section closes it
inverted_index_file.append(
    open(os.path.join(indexDirPth, 'title/final.txt'), 'r'))
Exemplo n.º 4
0
 def __init__(self):
     # type: () -> None
     """Create the stemmer used by this object, configured with the 'porter' algorithm."""
     algorithm = 'porter'
     self.stemmer = PyStemmer(algorithm)