Example #1
 def porterStemmer(self):
     ps = stemmer.Stemmer()
     self.dataMailStem = []
     for d in self.dataMailSet:
         try:
             self.dataMailStem.append(ps.stem(d))
         except BaseException as Argument:
             print("error", Argument)
Example #2
 def __init__(self, Stemming=False):
     self.stop_words = stopwords.words('english')
     extra_stop_words = ['i\'ll', 'i\'d', 'i\'m',
                         'i\'ve']  #expand the stopwords
     self.stop_words = self.stop_words + extra_stop_words
     self.Doc_ID = 0
     self.Stemmer = None
     if Stemming:
         self.Stemmer = stemmer.Stemmer()
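A hedged usage sketch for the initializer above: the enclosing class name Preprocessor is an assumption (the snippet shows only __init__), and NLTK's stopword corpus must be available.

import nltk
nltk.download('stopwords', quiet=True)  # the snippet relies on NLTK's stopword list

plain = Preprocessor()                  # self.Stemmer stays None
stemming = Preprocessor(Stemming=True)  # wraps the project's stemmer.Stemmer()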
Example #3
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.stop_words.extend([
         "rt", "n't", "'re", "gon", "na", "covid", "coronavirus", "covid-19"
     ])
     self.punctuation_to_remove = punctuation.replace('#', '').replace(
         '@', '').replace('%', '').replace('$', '')
     self.symbols = "<>:\"/\\|!?*~.'`-_()^,+=;"
     self.token_stemmer = stemmer.Stemmer()
Example #4
def virtualdictinit(words, vd):
    # open speller.db next to this file
    db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'speller.db')
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    st = stemmer.Stemmer()
    for word in words:
        cursor.execute("insert into {} values(?,?)".format(vd),
                       (st.stem(word), word))
    conn.commit()
    conn.close()
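The insert above assumes the target table already exists in speller.db. A minimal setup sketch; the table name vdict and the two-column schema are assumptions:

import os
import sqlite3

db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'speller.db')
conn = sqlite3.connect(db_path)
conn.execute("create table if not exists vdict (stem text, word text)")
conn.commit()
conn.close()

virtualdictinit(["running", "runner"], "vdict")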
Example #5
def perform_operation():
    stem_obj = stemmer.Stemmer()  # renamed from 'object', which shadows a builtin
    stem_obj.stem_init(e1.get())
    e2.delete(0, END)
    e3.delete(0, END)

    try:
        stem_obj.hstem()
        e2.insert(0, stem_obj.rem)
        e3.insert(0, stem_obj.output)
    except Exception:
        e2.insert(0, 'Not Found')
        e3.insert(0, 'Not Found')
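A hedged sketch of the Tkinter wiring the handler above assumes: e1 takes the input word, e2 and e3 display the results, and a button triggers perform_operation. The widget names come from the snippet; the layout and labels are assumptions.

from tkinter import Tk, Entry, Button, END
import stemmer

root = Tk()
e1 = Entry(root); e1.pack()   # input word
e2 = Entry(root); e2.pack()   # shows stem_obj.rem
e3 = Entry(root); e3.pack()   # shows stem_obj.output
Button(root, text="Stem", command=perform_operation).pack()
root.mainloop()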
Example #6
def stem_words(filteredText):
    stem = stemmer.Stemmer('english')
    keywords = []
    # keep alphabetic tokens longer than two characters, stemmed and lowercased
    for token in filteredText:
        if not token.isdigit() and len(token) > 2 and token.isalpha():
            keywords.append(stem.stemWord(token).lower())
    return keywords
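If stemmer here is the PyStemmer binding (its Stemmer('english') constructor and stemWord call match that API), a quick check might look like this; the sample tokens are arbitrary:

tokens = ["running", "cats", "42", "ab", "easily"]
print(stem_words(tokens))  # digits and tokens of length <= 2 are dropped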
Example #7
    def print_sorted_tfidf(self, sentence):
        porter = stemmer.Stemmer()
        stemmed_sentence = []
        for word in porter.remove_symbol(sentence.lower()).replace("\n",
                                                                   "").split():
            stemmed_sentence.append(porter.stem(word, 0, len(word) - 1))
        stemmed_sentence = " ".join(stemmed_sentence)

        sc_lst = self.calc_sent_tfidf(stemmed_sentence)
        sc_lst = sorted(sc_lst.items(), key=(lambda x: x[1]), reverse=True)

        print("=" * 50)
        print("input query: %s\n" % sentence)
        print("stemmed query: %s\n" % stemmed_sentence)
        print(" [doc_path | tf-idf]")
        for doc, score in sc_lst[:5]:
            print(" [%s | %f]" % (doc, score))
Example #8
File: languageMode.py Project: Luzzer/IR
    def __init__(self):

        self.porter = stemmer.Stemmer()
Example #9
def initiality_stem(wrd, initial):
    if not initial:
        st = stemmer.Stemmer()
        wrd = st.stem(wrd)
    return wrd
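A hedged usage sketch for initiality_stem, reusing only the calls shown above:

print(initiality_stem("running", initial=False))  # stemmed before returning
print(initiality_stem("Running", initial=True))   # returned unchanged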
Example #10
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import stemmer

# In[8]:

myStemmer = stemmer.Stemmer()
output = myStemmer.stemWord("ladkaa")
if (output == "ladka"):
    print("Function stemWord passed! ")

# In[9]:

output = myStemmer.stemListOfWords(["ladkii", "ladkaaaa", "firaaangii"])
if output[0] == 'ladki' and output[1] == 'ladka' and output[2] == 'firangi':
    print("Function stemListOfWords passed!")

# In[18]:

output = myStemmer.stem2dListOfWords([["merii", "merraa"],
                                      ["terii", "terraaa", "aaajjjaa"]])
if (output[0][0] == 'meri' and output[0][1] == 'mera'
        and output[1][0] == 'teri' and output[1][2] == 'aja'):
    print("Function stem2dListOfWords passed!")
Example #11
    unknown_processed_words = set()
    for token in text_tokens:
        # skip words already recognised as Bulgarian
        if is_bg_word(checked_word=token, bul_words=bg_words):
            continue
        synonym = check_and_get_match_for_synonyms(
            checked_word=token, synonyms=foreign_synonyms)
        if synonym:
            words_suggestions[token] = synonym
        else:
            unknown_processed_words.add(token)
    return words_suggestions, list(unknown_processed_words)


if __name__ == '__main__':
    stem = stemmer.Stemmer()

    tokens = tokenizer.tokenize_text("assets/input.txt")
    tokens = list(set(tokens))
    print(f'Our input text contains {len(tokens)} meaningful words')

    print('Loading the loanwords and their synonyms')
    foreign_synonyms = load_synonims("assets/synonyms.txt")
    print('Loading the corpus of Bulgarian words')
    bg_words = load_bulgarian_words("assets/bg_words.txt")
    print('Loading the corpus of English words')
    en_words = load_bulgarian_words("assets/en_words.txt")

    # iterate over a copy of the keys: popping while iterating a dict raises RuntimeError
    for word in list(foreign_synonyms.keys()):
        value = foreign_synonyms[word]
        foreign_synonyms.pop(word)
Example #12
import stemmer

stemming = stemmer.Stemmer()

def filter(s):
	s = s.lower()
	s = s.strip()
	s = rm_encoding(s)
	s = rm_punctuation(s)
	s = mv_tags(s)
	return s

def rm_encoding(s):
# Peregrine
# 	return s.decode('utf-8').encode('ascii', 'ignore')

# Hadoop
	# drop non-ASCII characters, then decode back to str for Python 3
	return s.encode('ascii', 'ignore').decode('ascii')


def rm_punctuation(s):
	# Python 3 replacement for the two-argument str.translate used under Python 2
	return s.translate(str.maketrans('', '', stemming.punctuation))

def mv_tags(tweet):
	import re
	_digits = re.compile(r'\d')
	words = tweet.split()
	for i, word in enumerate(words):
		# word = word.strip()
Example #13
class Translator:

    #text file containing all the tagalog-english translations
    __WORDS_DIR = os.path.dirname(
        os.path.realpath("translator")) + "\\trainingData\\tag-eng.txt"

    __tagalog_words = {}

    __stemmer = stemmer.Stemmer()

    def __init__(self):
        self.train()

    """
    method used to train the model
    """

    def train(self, tag_eng=__WORDS_DIR):
        freader = open(tag_eng, "r")
        contents = freader.readlines()
        freader.close()

        for line in contents:
            word_def = line.split(" : ")

            #the definition is always the second element of word_def;
            #strip any remaining ":" characters
            defn = word_def[1].replace(":", "").strip()

            defn = defn.replace(word_def[0], "").strip()

            #remove the line's other transformation,
            #ex: ... (word1, word2, word3) ...
            defn = re.sub("[(].+?[)]", "", defn).strip()

            #regular expression to detect the tag for each entry
            tags_re = "n\.|adv\.|adj\.|v\.|intrj\.|comp\.|gram\.|conj\.|expr\.|prep\.|pref\.|imp\.|coll\.|interrog\.|idiom."

            #some pos tag cannot be found
            try:
                pos_tag = re.findall(tags_re, defn)[0]

                #remove pos tag, numberings and special characters
                defn = re.sub("[A-Za-z0-9]{1,10}[.],?|^!|[?!@.,]", "",
                              defn).strip()
                defn = re.sub("([/][A-Za-z]+? )|([/][A-Za-z]+?$)", "",
                              defn).strip()

                #split the different definitions, clean each of unnecessary whitespace
                #lowercase for consistency
                defn = [
                    self.clean_string(i).strip().lower()
                    for i in defn.split(";")
                ]

                #if the dictionary has already registered the word
                if word_def[0] in self.__tagalog_words:

                    #if the word-dictionary has already registered a specific pos tag
                    if pos_tag in self.__tagalog_words[word_def[0]]:

                        #append it to the current
                        self.__tagalog_words[word_def[0]][pos_tag] += defn
                    else:

                        #initialize the list with defn
                        self.__tagalog_words[word_def[0]][pos_tag] = defn
                else:
                    self.__tagalog_words[word_def[0]] = {}
                    self.__tagalog_words[word_def[0]][pos_tag] = defn

            except IndexError:
                pass

    """
    *model should be trained first
    method used for tagalog translation, accepts a string word and a string pos_tag
    word is the word to be translated
    pos_tag is the pos tag of the word to be translated; by default it is ""
    returns a list of strings containing the english translations
    """

    def translate(self, word, pos_tag=""):

        #if the translation fails (dictionary lookup), stem it
        try:
            #if the pos tag is unspecified
            if pos_tag == "" or pos_tag == "AMB" or pos_tag == "UNK":

                #initialize the translations container
                translations = []

                #append all translations, regardless of pos tag
                for key in self.__tagalog_words[word].keys():
                    translations += self.__tagalog_words[word][key]

                return translations
            else:
                if pos_tag.lower() + "." in self.__tagalog_words[word].keys():
                    #return translation for a specific pos tag
                    return self.__tagalog_words[word][pos_tag.lower() + "."]
                elif len(self.__tagalog_words[word]) > 0:
                    return self.translate(word)

        #if the translation fails because the word is not in the dictionary
        except KeyError:
            try:
                if self.stem2x(word) == word:
                    return word
                return self.translate(self.stem2x(word)) + ["~"]
            except Exception:
                return []

    def stem2x(self, word):
        word = self.__stemmer.stem(word)
        return self.__stemmer.stem(word)

    #remove non-alphabet characters
    def clean_string(self, word):
        return re.sub("[^A-Za-z0-9 ]", "", word)
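A hedged usage sketch, assuming trainingData/tag-eng.txt sits next to the script with "word : definition" lines in the format the parser expects; "takbo" is a hypothetical dictionary entry:

translator = Translator()
print(translator.translate("takbo"))       # all translations, any POS tag
print(translator.translate("takbo", "V"))  # only translations tagged v.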
Example #14
class Translator:

    #text file containing all the tagalog-english translations
    __WORDS_DIR = os.path.dirname(
        os.path.realpath("translator")) + "\\trainingData\\tag-eng.txt"

    __tagalog_words = {}

    __stemmer = stemmer.Stemmer()

    def __init__(self):
        self.train()

    def train(self, tag_eng=__WORDS_DIR):
        freader = open(tag_eng, "r")
        contents = freader.readlines()
        freader.close()
        for line in contents:
            word_def = line.split(" : ")

            #the definition is always the second element of word_def;
            #strip any remaining ":" characters
            defn = word_def[1].replace(":", "").strip()
            defn = defn.replace(word_def[0], "").strip()

            #remove the line's other transformation,
            #ex: ... (word1, word2, word3) ...
            defn = re.sub("[(].+?[)]", "", defn).strip()

            #separate the different translations; synonyms
            defn = defn.split(";")

            #remove POS tags like n., v., inf., and numberings 1. 2. ...
            for index in range(len(defn)):

                defn[index] = re.sub("[A-Za-z0-9]{1,4}[.],?", "",
                                     defn[index]).strip()
                defn[index] = re.sub("([/][A-Za-z]+? )|([/][A-Za-z]+?$)", "",
                                     defn[index])

            #add the definitions/translations, creating the entry if needed
            self.__tagalog_words.setdefault(word_def[0], []).extend(defn)

    def translate(self, word, pos_tag=""):
        try:
            return self.__tagalog_words[word]
        except KeyError:
            try:
                #fall back to the stemmed form of the word
                return self.__tagalog_words[self.__stemmer.stem(word)]
            except KeyError:
                return []
Example #15
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.token_stemmer = stemmer.Stemmer()
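A hedged sketch of how this initializer might be used; the class name TextProcessor and the stem() method on stemmer.Stemmer are assumptions, since the snippet shows only __init__:

proc = TextProcessor()  # hypothetical class name
words = [w for w in "this is a running test".split()
         if w not in proc.stop_words]
stems = [proc.token_stemmer.stem(w) for w in words]  # stem() method is assumed
print(stems)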