def preprocess_document(data):
    # Step 1: lowercase and strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$',
                   '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')
    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))
    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords
    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)
    # Step 5: remove words not in the NLTK English corpus
    # (use a comprehension: calling data.remove(w) while iterating
    # over data skips elements)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data

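# Hedged usage sketch (added for illustration; not part of the source).
# Assumes the same globals preprocess_document relies on: nltk,
# snowballstemmer, string, and the NLTK 'punkt', 'stopwords' and 'words'
# corpora already downloaded.
import string
import nltk
import snowballstemmer
from nltk.corpus import stopwords

print(preprocess_document("The quick brown foxes were running!"))
# roughly: ['quick', 'brown', 'fox', 'run']
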
def __init__(self, german):
    path = os.path.dirname(os.path.abspath(__file__))
    print(path)
    self.IX = open_dir(path + "/index")
    self.Writer = self.IX.writer()
    if german:
        self.Stemmer = snowballstemmer.stemmer('german')
    else:
        self.Stemmer = snowballstemmer.stemmer('french')

def cut2list(self, string):
    """Tokenize a string and return the tokens as a list.

    :param string: input text
    :return: list of tokens
    """
    tokens = []
    if self.replaceP:
        sens = split(string, '' if self.type == Analyzer.ANALYZERS.Jieba else ' ')
    else:
        sens = [string]  # [strB2Q(string)]
    for sen in sens:
        if self.type == Analyzer.ANALYZERS.Jieba:
            # tokenize with jieba
            words = self.analyzer.cut(sen, cut_all=False)
        elif self.type == Analyzer.ANALYZERS.nltk:
            # tokenize English text
            sen = sen.lower()
            words = self.analyzer.word_tokenize(sen)
            stemmer = snowballstemmer.stemmer('english')  # argument selects the language
            words = stemmer.stemWords(words)
        if self.useStopwords:
            for word in words:
                if word not in stopwords and len(word.strip()) > 0:
                    tokens.append(word)
        else:
            tokens += words
    return tokens

def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
    stemmer = snowballstemmer.stemmer("german")
    # goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
    for i in range(len(article)):
        for j in range(len(article[i])):
            article[i][j] = article[i][j].split(" ")
            for k in range(len(article[i][j])):
                # article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                article[i][j][k] = stemmer.stemWord(article[i][j][k])
    for i in range(len(variable_keywords)):
        # variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
        variable_keywords[i] = stemmer.stemWord(variable_keywords[i])
    highlight = []
    for i in range(len(article)):
        highlight_article = []
        for j in range(len(article[i])):
            highlight_variables = []
            for k in range(len(variable_keywords)):
                highlight_variables.append(random.random())
            highlight_article.append(highlight_variables)
        highlight.append(highlight_article)
    return highlight

def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text.

    Parameters
    ----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, optional
        Stemmer to use: [snowball, five, simple, none]

    Returns
    -------
    text : list of str
        Cleaned and prepared tokens.
    """
    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")
    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()
    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]
    text = [item for item in text if item not in STOP_WORDS]
    return text

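# Hedged usage sketch (added): with stemmer='snowball' only the module-level
# STOP_WORDS is needed, so a tiny hypothetical stand-in is defined here; the
# real source provides its own list plus the five_stemmer/simple_stem helpers.
import re
import string
import snowballstemmer

STOP_WORDS = {'the', 'a', 'an'}  # hypothetical stand-in
print(clean("The runners were running quickly", stemmer='snowball'))
# roughly: ['runner', 'were', 'run', 'quick']
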
def seeker_highlight(text, query, algorithm='english'):
    if not query:
        return mark_safe(seeker_format(text))
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except ImportError:
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords_q = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords_q))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return mark_safe(''.join(parts))

def cut(self, string):
    """Tokenize a string and join the tokens with spaces.

    :param string: input text
    :return: tokens as a single space-separated string
    """
    article_contents = ''
    if self.replaceP:
        sens = split(string)
    else:
        sens = [string]  # strB2Q(string)
    for sen in sens:
        if self.type == Analyzer.ANALYZERS.Jieba:
            # tokenize with jieba
            words = self.analyzer.cut(sen, cut_all=False)
        elif self.type == Analyzer.ANALYZERS.nltk:
            # tokenize English text
            sen = sen.lower()
            words = self.analyzer.word_tokenize(sen)
            stemmer = snowballstemmer.stemmer('english')  # argument selects the language
            words = stemmer.stemWords(words)
        if self.useStopwords:
            for word in words:
                if word not in stopwords and len(word.strip()) > 0:
                    article_contents += word + " "
        else:
            article_contents = ' '.join(words)
    return article_contents

def __init__(self, language="es"):
    """Init method.

    :param language: input language (currently only Spanish is wired up)
    """
    self.__stemmer = snowballstemmer.stemmer("spanish")
    Token.set_extension("stem", default="", force=True)

def do_work(*args):
    import snowballstemmer
    stemmer = snowballstemmer.stemmer('english')
    print(js.data.textdata)
    txt = js.data.textdata
    newval = stemmer.stemWords(txt.split())
    return newval

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII letters to lowercase, to match C behavior
        original = ''.join(
            (lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # list.append takes one argument; extend with all three parts
            result.extend([original, " -> ", stemmed])
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()

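# Hedged usage sketch (added; file names hypothetical, and the lower_ helper
# is assumed to exist in the same module): stems one word per line from
# voc.txt into out.txt, writing "original -> stem" lines when pretty == 1.
# stemming("english", "voc.txt", "out.txt", "utf-8", pretty=1)
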
def do_semantic_analysis(sentence):
    sentence_probability_of_negative = 1
    sentence_probability_of_positive = 1
    stem = stemmer('turkish')
    with open("cookit.pythonanywhere.com/comments/text_files/stopwords.txt", "r") as f:
        stopwords_list = f.read().split("\n")
    words_list = sentence.split(" ")
    for word in words_list:
        word = re.sub(r'[^\w\s]', '', word)
        word = word.lower()
        word = stem.stemWords([word])[0]
        if word in stopwords_list:
            continue
        try:
            word_probability_of_negative = ProbabilityOfWords.objects.get(word=word).probabilityOfNegative
        except ProbabilityOfWords.DoesNotExist:
            word_probability_of_negative = 1
        try:
            word_probability_of_positive = ProbabilityOfWords.objects.get(word=word).probabilityOfPositive
        except ProbabilityOfWords.DoesNotExist:
            word_probability_of_positive = 1
        sentence_probability_of_negative *= word_probability_of_negative
        sentence_probability_of_positive *= word_probability_of_positive
    if sentence_probability_of_positive > sentence_probability_of_negative:
        result = "positive"
    elif sentence_probability_of_positive < sentence_probability_of_negative:
        result = "negative"
    else:
        result = "notr"
    return result

def my_separate_samples(read_input_lines, stem_flag):
    input_splitted_list = []
    input_class_list = []
    if stem_flag == '1':
        print('stemmer')
        my_stemmer = sb.stemmer('turkish')
    for curr_line in read_input_lines:
        curr_line2 = curr_line.lower()
        exclude = string.punctuation
        curr_line3 = ''.join(ch for ch in curr_line2 if ch not in exclude)
        curr_line4 = curr_line3.split('\t')
        curr_sample = curr_line4[0].split()
        curr_sample = list(set(curr_sample))
        curr_class = curr_line4[1].replace('\n', '')
        if stem_flag == '1':
            stemmed_curr_sample = []
            for wt in curr_sample:
                if len(wt) > 5:
                    stemmed_curr_sample.append(my_stemmer.stemWord(wt))
                else:
                    stemmed_curr_sample.append(wt)
            curr_sample = stemmed_curr_sample
        input_splitted_list.append(curr_sample)
        input_class_list.append(curr_class)
    return input_splitted_list, input_class_list

def __init__(self) -> None:
    warnings.warn(
        f"{self.__class__.__name__} is deprecated, use "
        "snowballstemmer.stemmer('porter') instead.",
        RemovedInSphinx70Warning, stacklevel=2)
    self.stemmer = snowballstemmer.stemmer('porter')

def WordTabLemma(fin, fout):
    '''Convert one-word-per-line format to word-tab-lemma-per-line format.'''
    stemmer = snowballstemmer.stemmer('english')
    with open(fin, 'rt') as fi, open(fout, 'wt') as fo:
        for word in fi:
            word = word.strip()
            fo.write("{}\t{}\n".format(word, stemmer.stemWord(word)))

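# Hedged usage sketch (added; file names hypothetical): turns a file with one
# word per line into tab-separated "word<TAB>stem" lines, e.g. "running\trun".
# WordTabLemma("wordlist.txt", "wordlist_stemmed.tsv")
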
def __name_follows(self, token):
    """Split the token based on the letter it starts with.

    Args:
        token (str): a word

    Returns:
        list: the split word
    """
    follows = [
        '\u0628',  # ب
        '\u0643',  # ك
        '\u0644',  # ل
        '\u0648',  # و
        '\u062a',  # ت
        '\u0633',  # س
    ]
    stem = stemmer("arabic").stemWord(token)
    for follow in follows:
        if token.startswith(follow) and not stem.startswith(follow):
            token = re.sub(follow, r'\g<0><SPLIT>', token, flags=re.UNICODE)
    return token.split("<SPLIT>")

def stemmer(pList):
    stemmer = snowballstemmer.stemmer('spanish')
    stemmedWords = set([])
    for word in pList:
        stemmed = stemmer.stemWord(word)
        stemmedWords.add(stemmed)  # add the stem, not the original word
    return stemmedWords

def init(self, options: Dict) -> None:
    if JIEBA:
        dict_path = options.get('dict')
        if dict_path and os.path.isfile(dict_path):
            jieba.load_userdict(dict_path)
    self.stemmer = snowballstemmer.stemmer('english')

def __init__(self, language=None):
    """Create a new highlighter for the specified language."""
    if language:
        self.stem = snowballstemmer.stemmer(language)
    else:
        self.stem = NoStem()

def gen_words(text, stemming=stem.stemmer('english')):
    """Create a generator of stemmed words.

    :param text: some string
    :param stemming: variant of stemming algorithm
    :return: generator yielding stemmed words from text
    """
    for word in stemming.stemWords(re.findall(r"[\w']+", text.lower())):
        yield word

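# Hedged usage sketch (added): gen_words is lazy, so it can feed a Counter
# directly; `stem` is assumed to be snowballstemmer aliased at import time,
# matching the default argument above.
from collections import Counter

print(Counter(gen_words("Cats chase cats; a cat was chased.")))
# roughly: Counter({'cat': 3, 'chase': 2, 'a': 1, 'was': 1})
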
def turkish(sent):
    # No Turkish stemmer in NLTK
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px

def __init__(self, N=8):
    """Create the object.

    :param int N: max length of the suffix used in the rules
    """
    self.N = N
    self._rules = [None] * (N + 1)
    self._stemmer = sbs.stemmer('italian')

def stem2(in_vec):
    stemmer = snowballstemmer.stemmer('english')
    out_vec = []
    for x in in_vec:
        to_out = stemmer.stemWord(x)
        if len(to_out) > 2:
            out_vec.append(to_out)
    return out_vec

def get_feature_base(sentence):
    stemmer = snowballstemmer.stemmer('english')
    words = sentence.split()
    result = []
    for word in words:
        if is_stopword(word):
            continue
        result.append(stemmer.stemWord(word))
    return ' '.join(result)

def get_stemmer(language):
    stemmer_languages = [
        "danish", "dutch", "english", "finnish", "french", "german",
        "hungarian", "italian", "norwegian", "portuguese", "romanian",
        "russian", "spanish", "swedish", "turkish",
    ]
    if language.lower() in stemmer_languages:
        # build the stemmer once instead of on every call of the returned function
        stemmer = snowballstemmer.stemmer(language)
        return stemmer.stemWord
    return lambda word: word

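# Hedged usage sketch (added): the returned callable stems supported
# languages and passes anything else through unchanged.
import snowballstemmer

stem = get_stemmer("german")
print(stem("laufen"))          # a stemmed German form
noop = get_stemmer("klingon")  # unsupported -> identity function
print(noop("laufen"))          # unchanged: 'laufen'
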
def stem_and_lower(str_):
    """Return string with unique lowercase words stemmed."""
    stemmer = snowballstemmer.stemmer(config.LANGUAGE_FULL)
    str_no_punctuation = str_.translate(REMOVE_PUNCTUATION_MAP)
    str_stemmed = stemmer.stemWords(
        map(lambda x: x.lower(), set(str_no_punctuation.split())))
    return ' '.join(str_stemmed)

def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ## archivo.archivo.crearCSVDict(".\stemming.csv", dictRaices)
    return dictRaices

def rootsoftheliturgical(words):
    words = words.lower()
    rootfind = stemmer('turkish')
    trans = str.maketrans('', '', punctuation)
    words = words.translate(trans)
    words = StopWords(words)
    # letters = words.split()
    letters = rootfind.stemWords(words)
    string = ' '.join(letters)
    return string

def __init__(self, xml):
    self.dest = xml.get("dest")
    if self.dest is None:
        raise ValueError()
    # any value for "verbose", even a falsy one, enables verbose mode
    self.verbose = xml.get("verbose") is not None
    self.stemmer = snowballstemmer.stemmer('english')

def main():
    stemmer = snowballstemmer.stemmer('english')  # load the stemming module
    for i, line in enumerate(sentence_extraction()):
        if i == 10:
            break
        words = line.strip('\n').split(' ')
        for word in words:
            # stem each word with stemmer.stemWord(word)
            print('{}\t{}'.format(word, stemmer.stemWord(word)))
        print('\n')

def snowball_tokenfilter(token):
    """
    Snowball token filter uses the Snowball stemming library
    collection for Python: https://github.com/shibukawa/snowball_py
    """
    stemmer = snowballstemmer.stemmer("english")
    token["token"] = stemmer.stemWord(token["token"])
    return token

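# Hedged usage sketch (added): token filters receive and return a dict with a
# "token" key.
import snowballstemmer

print(snowball_tokenfilter({"token": "running"}))  # {'token': 'run'}
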
def __init__(self, **kwargs):
    self.basic_params = kwargs['basic']
    self.emb_params = kwargs['embeddings']
    self.sim_params = kwargs['similarity']
    self.subj_params = kwargs['subjectivity']
    self.sent_params = kwargs['sentiment']
    self.emo_params = kwargs['emotion']
    self.nlp = spacy.load(self.basic_params['model'])
    self.stemmer = stemmer('greek')
    self.lexicon_ = None

def stem2(word):
    stemmer = snowballstemmer.stemmer("turkish")
    stemmed = stemmer.stemWord(word)
    # manually restore a couple of Turkish stems that snowball over-truncates
    if stemmed == "fatur":
        stemmed = "fatura"
    elif stemmed == "hatt":
        stemmed = "hat"
    return stemmed

def clean_text_stemmed(t):
    """Accept a document and return lowercased, stemmed text."""
    t = t.lower()
    # Replace non-alphanumeric characters with spaces
    t = re.sub("[^A-Za-z0-9]", " ", t)
    # Replace all numbers with a single char
    t = re.sub("[0-9]+", "#", t)
    stemmer = snowballstemmer.stemmer('english')
    tfinal = " ".join(stemmer.stemWords(t.split()))
    return tfinal  # return the stemmed text, not the unstemmed input

def text_cleaner(text):
    stemmer = snowballstemmer.stemmer('russian')
    text = text.lower()                               # lowercase
    text = re.sub(r'https?://[\S]+', ' url ', text)   # replace web links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)
    text = re.sub(r'<[^>]*>', ' ', text)              # remove HTML tags
    text = re.sub(r'[\W\n]+', ' ', text)              # remove extra symbols
    text = re.sub(r'\w*\d\w*', '', text)              # drop tokens containing digits
    text = re.sub(r'\w*[.]\w*', '', text)             # drop tokens containing dots
    text = ' '.join(stemmer.stemWords(text.split()))  # reduce words to stems
    return text

def aplicarStemmerConsulta(pLista):
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz, i[1]])
    return lista

def create_search_terms(string_terms):
    '''Create search terms by stemming every word within the parameter
    passed. Return all search terms in one string separated by spaces.'''
    stemmer = snowballstemmer.stemmer('english')
    terms = stemmer.stemWords(string_terms.split())
    search_term = list()
    for term in terms:
        lower_term = term.lower()
        if lower_term not in _STOP_WORDS:
            search_term.append(lower_term)
    return " ".join(search_term)

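# Hedged usage sketch (added): _STOP_WORDS is module state in the original
# source, so a hypothetical stand-in is defined here.
import snowballstemmer

_STOP_WORDS = {"for", "the"}  # hypothetical stand-in
print(create_search_terms("Running shoes for the runners"))
# roughly: "run shoe runner"
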
def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
    """
    Create a vocabulary, which is a mapping from bucket names to lists of
    synonyms that fall into their bucket. Stopwords is a list of words that
    are ignored for the vocabulary and defaults to a built-in English
    stopword list.
    """
    self.stopwords = stopwords
    self.stemmer = snowballstemmer.stemmer("english")
    self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
    self.logging = logging
    if samples:
        self._generate_vocabulary(samples, limit)

def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)
    # NOTE: stemmed_words is computed but not yet passed to the template context
    return render(request, 'searchres/search_result.html', {})

def getPalabras():
    file = "dicc.txt"
    stemmer = snowballstemmer.stemmer('spanish')
    words = {}
    with open(file, 'r') as arc:
        for i in arc:
            i = i.rstrip()
            i = stemmer.stemWord(i)
            words[i] = "word"
    for i in words.items():
        print(i)
    print(len(words))

def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses()
               if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview"))
                       for x in results]
        MatrixEdxCoursesId.objects.all().delete()
        # use an explicit loop: a bare map() is never evaluated on Python 3
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))
        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]
        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()

def identify_language(self, text):
    self.lang = lang_mapping[langid.classify(text)[0]]
    if self.debug:
        print("LANG", self.lang)  # , "stemmer", self.stem
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        self.stem = stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    elif self.lang == "turkish":
        # unfortunately, the Turkish stemmer isn't included in NLTK
        import snowballstemmer
        from stemmers.turkish import stopwords
        self.stem = snowballstemmer.stemmer("turkish").stemWord
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        self.stem = SnowballStemmer(self.lang).stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))

def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before being marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0

def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))

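# Hedged usage sketch (added; script name hypothetical). From a shell:
#   python stem_cli.py english "sentence.with-words to-stem"
# prints one "original -> stem" line per whitespace/dot/hyphen separated token.
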
def preprocess_features(dataframe):
    # get the count of how many times each product appears; may correlate
    product_counts = pandas.DataFrame(pandas.Series(
        dataframe.groupby(["product_uid"]).size(), name="product_count"))
    dataframe = pandas.merge(dataframe, product_counts, left_on="product_uid",
                             right_index=True, how="left")
    dataframe = experiment_gensim(dataframe)
    dataframe["search_length"] = dataframe.search_term.str.len()
    dataframe["id_bins"] = pandas.cut(dataframe.id, 20, labels=False)

    # word distribution metrics
    dataframe["title_unigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1), axis=1)
    dataframe["title_bigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2), axis=1)
    dataframe["desc_unigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1), axis=1)
    dataframe["desc_bigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2), axis=1)
    dataframe["brand_unigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(1), axis=1)
    dataframe["brand_bigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(2), axis=1)

    # stemmed unigrams and bigrams
    stemmer = snowballstemmer.stemmer("english")
    dataframe["title_unigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_unigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["title_bigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_bigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)

    # edit distance metrics (slow)
    dataframe["title_word_edit_distance"] = dataframe[["search_term", "product_title"]].apply(word_edit_distance, axis=1)
    dataframe["title_char_edit_distance"] = dataframe[["search_term", "product_title"]].apply(char_edit_distance, axis=1)
    # dataframe["desc_word_edit_distance"] = dataframe[["search_term", "product_description"]].apply(word_edit_distance, axis=1)
    # dataframe["desc_char_edit_distance"] = dataframe[["search_term", "product_description"]].apply(char_edit_distance, axis=1)

    dataframe = dataframe.drop(["product_title", "search_term", "id",
                                "product_description", "brand_name"], axis=1)
    print(dataframe.describe())
    return dataframe

def textrank(text, hdr):
    sent_tokenizer = PunktSentenceTokenizer()
    sentences = sent_tokenizer.tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')
    # find the most likely language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word)
                 for word in word_tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code

def seeker_highlight(text, query, algorithm='english'):
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except ImportError:
        # fall back to no-op stemming when snowballstemmer is unavailable
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return ''.join(parts)

def score_words_in_sentence(l_sentence, best_stops):
    """
    Define a relevance score for each word in a tweet.
    We consider only non-stop words, stemmed, both for the tweet and the stop name.
    :param l_sentence: tweet split into a list of words
    :param best_stops: list of the most relevant stops
    :return: 1. list of kept/stemmed words found in the tweet
             2. list of scores for each of these words
    """
    sb_stemmer = stemmer('french')
    stemmed_sentence = [sb_stemmer.stemWords([x])[0] for x in l_sentence]
    scores_stops = []
    tag_words = [0 for _ in l_sentence]
    relevant_stops = map(lambda x: unicodedata.normalize('NFD', x[0][0]).encode('ascii', 'ignore'),
                         best_stops)
    rg_stop = 1
    for stop in relevant_stops:
        stop_w_index = []
        stop_lw = re.findall(r"\w+", stop, re.UNICODE)
        for w_stop in stop_lw:
            if w_stop not in fr_stop_words:
                stemmed_w = sb_stemmer.stemWords([w_stop])[0]
                if stemmed_w in stemmed_sentence:
                    stop_w_index.append(stemmed_sentence.index(stemmed_w))
        score_w = 0
        for i in range(len(stop_w_index)):
            if (i > 0) and (stop_w_index[i] <= stop_w_index[i - 1]):
                score_w = 0
                break
            else:
                score_w += 1
        scores_stops.append(score_w)
        if score_w != 0:
            for idx in stop_w_index:
                if tag_words[idx] == 0:
                    tag_words[idx] = rg_stop
        rg_stop += 1
    return scores_stops, tag_words

def get_frequencies(word_dict, date, max_date):
    stemmer = snowballstemmer.stemmer('dutch')
    freqs = defaultdict(lambda: defaultdict(int))
    freqs_per_day = defaultdict(lambda: defaultdict(int))
    oneday = datetime.timedelta(1)
    # print("min", date, "\nmax", max_date)
    current_year = date.year
    print("current year", current_year)
    while date <= max_date:
        # print("current date", date)
        has_file = True
        if current_year != date.year:
            current_year = date.year
            print("current year", current_year)
        try:
            f_in = open(input_path + r"\words" + str(date) + ".txt", "r")
        except IOError:
            # print("File not found\n", input_path + r"\words" + str(date) + ".txt")
            has_file = False
        if has_file:
            for line in f_in:
                line = line.replace("\n", "")
                line = line.split(";")
                word = line[0]
                freq = int(line[1])
                if use_stemmer:
                    word = stemmer.stemWord(word)
                if word_dict[word][0]:
                    if log_correlation:
                        freqs[word][date] = freq
                    freqs_per_day[date][word] = freq
            f_in.close()
        date += oneday
    return freqs, freqs_per_day

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # list.append takes one argument; extend with all three parts
            result.extend([original, " -> ", stemmed])
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()

def init(self, options):
    self.stemmer = snowballstemmer.stemmer('russian')

def lemmatizer(self, word):
    # note: snowball performs stemming, not true lemmatization
    stemmer = snowballstemmer.stemmer('spanish')
    return stemmer.stemWord(word)

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('italian')

#!/usr/bin/python
# -*- coding: iso-8859-9 -*-
import argparse, os, re, sys, operator, math
import snowballstemmer

FILE_ENCODING = "windows-1254"
SMOOTHING_CONST = 0.1
STEMMING = True

# Create stemmer
stemmer = snowballstemmer.stemmer("turkish")
# prior_prob is a dictionary containing the prior probabilities of authors
prior_prob = dict()
# word_prob is a dictionary containing, per author, a dictionary of word probabilities
word_prob = dict()
# total_words maps each author to the number of words in all of their training data
total_words = dict()
# total_docs is the number of documents in the training set
total_docs = 0
# authors is the list of all author names
authors = []
# unknown-word probabilities for each author
unknown_prob = dict()

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('danish')

def init(self, options):
    # type: (Any) -> None
    self.stemmer = snowballstemmer.stemmer('portuguese')

# coding=utf-8
import re
import codecs
import pickle  # was `import cPickle as pickle` on Python 2
import pymorphy2
from snowballstemmer import stemmer

__author__ = 'annie'

__morph = pymorphy2.MorphAnalyzer()
__stemmer = stemmer('russian')
__pattern = re.compile(u'(?u)[A-zА-я]{2,}')

open_read = lambda file: codecs.open(file, encoding='utf-8', mode='r')
open_write = lambda file: codecs.open(file, encoding='utf-8', mode='w')


def str_dict(dict_):
    """
    Correct conversion of a dict to a string, without the nesting level of values.
    :param dict_: some dict
    :return: str
    """
    ans = [u'{0}: {1}'.format(k, v) for k, v in sorted(dict_.items())]
    return u'\n'.join(ans)


def print_dict(dict_):
    # the suffix reports the number of keys ("ключей" means "keys")
    print(str_dict(dict_), u'\n\n{0} ключей'.format(len(dict_)))