def steamming(self):
    st = ISRIStemmer()
    lis = self.tokenizer()
    xx = ""
    for i in lis:
        xx = xx + ' ' + st.stem(i)
    return xx
def stemm(tweetstr):
    stemmer = ISRIStemmer()
    stemstr = []
    for s in tweetstr:
        st = stemmer.stem(s)
        stemstr.append(st)
    return stemstr
def finding_changeable_con(word, size):
    st = ISRIStemmer()
    stemmed_word = st.stem(word)
    if stemmed_word == word:
        for token in stem_dict:
            if token == word:
                print("Stemmed Word : " + token)
                for x in range(len(stem_dict[token])):
                    derived_word = stem_dict[token][x]
                    print("Derived Word : ")
                    print(derived_word)
                    print("Sentences : ")
                    occurrences_list = word_dict[derived_word]
                    concordances_output = get_changeable_con(occurrences_list, size)
                    print(*concordances_output, sep="\n")
    else:
        for token in word_dict:
            if token == word:
                print("Word : " + token)
                print("Stemmed Word : " + stemmed_word)
                print("Sentences : ")
                occurrences_list = word_dict[token]
                concordances_output = get_changeable_con(occurrences_list, size)
                print(*concordances_output, sep="\n")
    print("\n")
    print("\n")
def Stem_word(self, body):
    st = ISRIStemmer()
    word = body.split(u" ")
    word_stem = list()
    for w in word:
        word_stem.append(st.stem(w))
    body = " ".join(word_stem)
    return body
def steaming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
def stemming_ISR(self, text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_text = " ".join(stemmed_words)
    return stemmed_text
def stemTokenize(text):
    if locale == 'ar':
        stemmer = ISRIStemmer()
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'da':
        stemmer = SnowballStemmer('danish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'en':
        stemmer = SnowballStemmer('english')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'es':
        stemmer = SnowballStemmer('spanish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'hi':
        t = hindi_nlu.Processor(text)
        t.tokenize()
        return [t.generate_stem_words(w) for w in t.tokens]
    elif locale == 'mr':
        t = hindi_nlu.Processor(text)
        t.tokenize()
        return [t.generate_stem_words(w) for w in t.tokens]
    elif locale == 'nl':
        stemmer = SnowballStemmer('dutch')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    elif locale == 'sv':
        stemmer = SnowballStemmer('swedish')
        return [stemmer.stem(w) for w in word_tokenize(text)]
    else:
        stemmer = SnowballStemmer('english')
        return [stemmer.stem(w) for w in word_tokenize(text)]
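A minimal usage sketch for the locale-aware stemTokenize above, assuming `locale` is a module-level setting read by the function and that word_tokenize, SnowballStemmer, and ISRIStemmer have already been imported from NLTK:

# Hypothetical driver (not part of the original source); `locale` is assumed to be
# a module-level variable that stemTokenize reads.
locale = 'ar'
print(stemTokenize(u'الطلاب يقرؤون الكتب'))            # ISRI roots for Arabic input

locale = 'en'
print(stemTokenize('the students are reading books'))  # Snowball stems for English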
class Stemming:
    def __init__(self):
        self.st = ISRIStemmer()

    def stemWord(self, text):
        word_tokens = word_tokenize(text)
        filtered_sentence = [self.st.stem(w) + ' ' for w in word_tokens]
        return ''.join(filtered_sentence)
def stem(text):
    # [st.stem(word) for word in text if not word in set(stopwords.words('english'))]
    st = ISRIStemmer()
    temp_text = ""
    for word in text.split():
        # print(st.stem(word))
        temp_text += st.stem(word) + " "
    text = temp_text
    return text
class Books():
    def __init__(self, category_id):
        self.category_id = category_id
        print('Books Class instantiated for Category {}.'.format(category_id))
        # NLTK stemmer
        self.st = ISRIStemmer()
        # get all stop words, plus individual letters (typos & printing issues)
        sw1 = get_stop_words('arabic') + stopwords.words("arabic")
        sw2 = [
            'ا', 'أ', 'إ', 'ذ', 'ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ', 'ح',
            'ج', 'ش', 'س', 'ي', 'ب', 'ل', 'ا', 'ال', 'ت', 'ن', 'م', 'ك', 'ئ', 'ء',
            'ؤ', 'ر', 'لا', 'ى', 'ة', 'و', 'ز', 'ظ'
        ]
        self.sw = set(sw1 + sw2)

    def not_sw(self, text):
        # excludes stop words
        return (text not in self.sw) or self.st.stem(text) not in self.sw

    def not_small_big(self, text):
        # exclude single letters, combined words, and stop words
        return (len(text) >= 3) and (len(text) <= 9)

    def get_book_id(self, index_url):
        return re.findall(r'13\d\\(\d+)', str(index_url))[0]

    def strip_text(self, text):
        return araby.strip_tatweel(araby.strip_tashkeel(text))

    # This method is the main reason for having this class, since Doc2Vec can
    # consume an iterator and go through each file one at a time instead of
    # loading all the books into memory.
    def __iter__(self):
        for i, file_name in enumerate(
                glob('../../data/' + str(self.category_id) + '/*.json')):
            print('Started Book: {}.'.format(self.get_book_id(file_name)))
            try:
                with open(str(file_name)) as f:
                    book_text = json.load(f)['text']
                # start processing
                start_time = time.time()
                processed_book = araby.tokenize(
                    self.strip_text(book_text),
                    conditions=[self.not_sw, araby.is_arabicword])
                print('Cleaned Book: {} in {} seconds.'.format(
                    self.get_book_id(file_name), time.time() - start_time))
                yield TaggedDocument(processed_book, [i])
            except:
                print("Fix {}".format(file_name))
def one_string_steming(sentence):
    '''
    Argument: a string of words.
    Return: a list of stemmed words, each reduced to its root.
    '''
    sentence = one_string_tokenization(sentence)
    stemmer = ISRIStemmer()
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence
def build_stem_dictionary(preprocessed_text, stop_words):
    # This method builds the roots dictionary as follows:
    # {'stemmed_word1': ['derived_word1', 'derived_word2', ...],
    #  'stemmed_word2': ['derived_word1', 'derived_word2', 'derived_word3', ...], ...}
    st = ISRIStemmer()
    words_list = word_tokenize(preprocessed_text)
    for token in words_list:
        if token not in stop_words and token not in ['.']:
            stemmed_token = st.stem(token)
            if not stem_dict.get(stemmed_token):
                stem_dict[stemmed_token] = []
            if not token in stem_dict[stemmed_token]:
                stem_dict[stemmed_token].append(token)
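A small driver for build_stem_dictionary, assuming (as the snippet implies) that stem_dict is a module-level dict shared with the function and that NLTK's word_tokenize is imported; the exact roots produced depend on the ISRI stemmer and are not asserted here.

# Hypothetical usage (not in the original source).
stem_dict = {}
build_stem_dictionary(u'كتب الطالب الدرس . يكتب الطلاب الدروس .', stop_words=set())
for root, derived_words in stem_dict.items():
    # each ISRI root maps to the surface forms that were seen in the text
    print(root, '->', derived_words)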
def tokenize_documents(documents):
    stop_words = stopwords.words('english') + stopwords.words('spanish')  # common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()
    punctuation = {ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # Returns false for common words, links, and strange patterns
        if (token in filtered) or (token[0:4] == u'http') or \
           (token in string.punctuation):
            return False
        else:
            return True

    for doc in documents:
        row = doc[0]
        doc = doc[1]
        if doc is not None:
            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)
            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)
            cleaned_tokens = []
            for token in tokens:
                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi',
                                 u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token))
                    cleaned_tokens.append(token)
            yield row
            yield cleaned_tokens
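A toy call of tokenize_documents, assuming (as the loop implies) that each document is a (row_id, text) pair; the generator yields a row id followed by its cleaned token list.

# Hypothetical usage (not in the original source).
sample_docs = [(1, u'@user Kadhafi spoke today http://example.com'),
               (2, None)]        # None documents are skipped
for item in tokenize_documents(sample_docs):
    print(item)                  # row id, then the cleaned/stemmed tokens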
def stemLexicon(self, newLex):
    #newLex = prepareLexicon()
    stemmed_Lexicon_words = []
    polarity_Lex = []
    stLex = ISRIStemmer()
    for index, column in newLex.iloc[:].iterrows():
        word = newLex.at[index, 'ngram']
        polarity = newLex.at[index, 'polarity']
        stemmed_Lexicon_words.append(stLex.stem(word))
        polarity_Lex.append(polarity)
    stemmed_Lexicon_DF = pd.DataFrame({
        'ngram': stemmed_Lexicon_words,
        'polarity': polarity_Lex
    })
    return stemmed_Lexicon_DF  # returns a pandas DataFrame
def stem(string):
    # split given string into words
    words = string.split()
    stems_list = []
    isri_stemmer = ISRIStemmer()
    for word in words:
        # stem word
        stem_word = isri_stemmer.stem(word)
        # add new stem to the list
        stems_list.append(stem_word)
    return stems_list
def sentencePreprocessingDF(self, df, row, col):
    arabic_sw_file = open("arabic_stop_words.txt", 'r+')
    ar_sw_list = arabic_sw_file.read()
    ar_sw_list = word_tokenize(ar_sw_list)
    # Includes stop-word removal, elongated-word normalization, and stemming
    st = ISRIStemmer()
    tokenized_word_list = []
    tokenized_sentence = []
    words = word_tokenize(df.at[row, col])
    for word in words:
        if word not in ar_sw_list:
            word = self.replaceElongated(word)
            tokenized_word_list.append(st.stem(word))
    tokenized_sentence = " ".join(tokenized_word_list)
    return tokenized_sentence
def get_test_negative_array_stemmed_without_sw(self):
    stemmer = ISRIStemmer()
    test_negative_array_stemmed_without_sw = []
    review_words_stemmed_without_sw = []
    for review in self.get_test_negative_array(self):
        review_words = nltk.word_tokenize(review)
        review_words_without_sw = [
            i for i in review_words if not i in self.get_arabic_sw(self)
        ]
        review_words_stemmed_without_sw = []
        for word in review_words_without_sw:
            review_words_stemmed_without_sw.append(stemmer.stem(word))
        test_negative_array_stemmed_without_sw.append(" ".join(
            str(x) for x in review_words_stemmed_without_sw))
    return test_negative_array_stemmed_without_sw
def get_features(comment, lan):
    words = list(comment)
    if lan == 'ar':
        st = ISRIStemmer()
        features = [0] * len(word_features_ar2)
        for w in words:
            w = st.stem(w)
            if w in word_features_ar_dict:
                features[word_features_ar_dict[w]] = 1
    else:
        features = [0] * len(word_features_en2)
        for w in words:
            w = stem(w)
            if w in word_features_en_dict:
                features[word_features_en_dict[w]] = 1
    return features
def stem_tokens(token_list, src_lang):
    """
    Returns the stems of the given tokens depending on the source language.
    """
    stemmed = []
    if src_lang == 'en':
        ps = PorterStemmer()
        for token in token_list:
            stemmed.append(ps.stem(token))
    if src_lang == 'ar':
        isri = ISRIStemmer()
        for token in token_list:
            stemmed.append(isri.stem(token))
    return stemmed
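An illustrative call of stem_tokens, assuming PorterStemmer and ISRIStemmer are imported from nltk.stem; outputs depend on the NLTK stemmers and are not asserted.

# Hypothetical usage (not in the original source).
print(stem_tokens(['running', 'studies', 'cats'], 'en'))   # Porter stems
print(stem_tokens([u'المكتبات', u'يدرسون'], 'ar'))          # ISRI roots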
def WordsFiltires(tokenstem):
    """
    :param tokenstem:
    :return WordsFiltires:
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()
    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))
    WordsFiltires = ' '.join(stemmed_word)
    return WordsFiltires
def stem_words(self):
    """
    Stem all the words in each file using the ISRI Arabic stemmer, which is
    based on the algorithm "Arabic Stemming without a Root Dictionary".
    """
    st = ISRIStemmer()
    for folder in os.listdir(self.processed_corpus_path):
        dir_path = os.path.join(os.sep, self.processed_corpus_path, folder)
        for a_file in os.listdir(dir_path):
            file_path = os.path.join(os.sep, dir_path, a_file)
            to_write = []
            with open(file_path, 'r') as infile:
                words = infile.readlines()
                for word in words:
                    to_stem = word[:-1]
                    stemmed = st.stem(to_stem)
                    to_write.append(stemmed)
            # print(to_write)
            with open(file_path, 'w') as outfile:
                for word in to_write:
                    outfile.write(word + '\n')
        print(folder + " stemmed")
def WordsFiltires(tokenstem):
    """
    This function removes stop words and stems the remaining words.
    :param tokenstem:
    :return WordsFiltires:
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()
    # ----- stop-word removal with stemming -----
    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))
    WordsFiltires = ' '.join(stemmed_word)
    return WordsFiltires
def GetProductsForSalary(x):
    s = False
    ps = PorterStemmer()
    words = word_tokenize(x)
    st = ISRIStemmer()
    length = len(words)
    # print(x)
    i = 0
    while i < length:
        z = st.stem(words[i])
        if re.search(r'\d', z):
            k = 0
            while k < len(z):
                if z[k] == "ش":
                    s = True
                    print("vvvvvvvvvvvvvvvvvvvvvvvvv", z)
                k += 1  # advance the character index (missing in the flattened source)
        elif z == "شيكل" or z == "شيقل" or z == "ش" or z == "NIS" or z == "Nis":
            s = True
        i += 1
    print("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT====>", s)
    return s
def app_req_similarity(app_words, req_words):
    """
    :param app_words:
    :param req_words:
    :return: the similarity (a score) between the request and a given app.
             Two approaches are used (with and without stemming).
    """
    stemmer = ISRIStemmer()
    count_dict = {}
    stemmed_count_dict = {}
    # start calculating similarity
    for rw in req_words:
        if rw in app_words:
            if rw in count_dict:
                count_dict[rw] += 1
            else:
                count_dict[rw] = 1
        rw_stemmed = stemmer.stem(rw)
        if rw_stemmed in app_words:
            if rw_stemmed in stemmed_count_dict:
                stemmed_count_dict[rw_stemmed] += 1
            else:
                stemmed_count_dict[rw_stemmed] = 1
    # calculating score
    score = 0
    stemmed_score = 0
    for k in count_dict.keys():
        score = score + int(count_dict[k])
    score = score / len(req_words)
    for k in stemmed_count_dict.keys():
        stemmed_score = stemmed_score + int(stemmed_count_dict[k])
    stemmed_score = stemmed_score / len(req_words)
    print(score, stemmed_score)
    return score, stemmed_score
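An illustrative call of app_req_similarity with toy token lists; in the original pipeline these would presumably come from tokenized app descriptions and user requests.

# Hypothetical usage (not in the original source); toy token lists for illustration.
app_words = [u'كتب', u'قراءة', u'مكتبة']
req_words = [u'الكتب', u'مكتبة']
score, stemmed_score = app_req_similarity(app_words, req_words)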
def clean_up_sentence(sentence):
    normalizeArabic(sentence)
    # use the replacement dictionary to replace popular words that clients may use
    for old, new in replacments.items():
        sentence = sentence.replace(old, new)
    sentence = sentence.replace('؟', ' ')
    # tokenize the pattern
    tokens = word_tokenize(sentence)
    # remove punctuation from each word
    remove_pun = str.maketrans('', '', string.punctuation)
    words = [w.translate(remove_pun) for w in tokens]
    # remove non-alphabetic characters
    alphabetic_words = [word for word in words if word.isalpha()]
    # remove Arabic stop words
    arabic_stop_word = stopwords.words('arabic')
    stop_words = set(arabic_stop_word)
    alphabetic_words = [
        word for word in alphabetic_words if not word in stop_words
    ]
    # stem each word
    stemer = ISRIStemmer()
    stemmed_words = [stemer.stem(word) for word in alphabetic_words]
    stemmed_words = list(dict.fromkeys(stemmed_words))
    return stemmed_words
def main():
    # Define which corpora to work with via sys.argv[1].
    corpora = sys.argv[1]
    # Define input data.
    k50 = "../out/mallet/testdez/" + corpora + "-50.txt"
    k100 = "../out/mallet/testdez/" + corpora + "-100.txt"
    k200 = "../out/mallet/testdez/" + corpora + "-200.txt"
    # Load ISRIStemmer.
    st = ISRIStemmer()
    # Create lists: all_plots, all_means.
    all_plots = []
    all_means = []
    # Loop over the three files.
    for i in (k50, k100, k200):
        # Open file, read it into variable f, close file.
        f_in = open(i)
        f = f_in.readlines()
        f_in.close()
        # Create lists: words, stemlist.
        words = []
        stemlist = []
        # Loop over the lines in f. Tokenize words, delete the numbers at the
        # beginning of each line (0:4). Append line to words.
        for line in f:
            line = tokenizer(line)
            del line[0:4]
            words.append(line)
        # Loop over words. Stem each word and append to stemlist.
        for listitem in words:
            stems = []
            for w in listitem:
                r = st.stem(w)
                stems.append(r)
            stemlist.append(stems)
        # Create lists: score, plotdata.
        score = []
        plotdata = []
        # Loop over lists in stemlist. Create a dictionary d.
        # For each word in a topic:
        #   if the word is in d, add 1 to its value; else add it to d.
        for topic in stemlist:
            d = {}
            for item in topic:
                if item in d:
                    d[item] += 1
                else:
                    d[item] = 1
            # Get the highest count in d and append it to plotdata.
            maximum = max(d, key=d.get)
            plotdata.append(d[maximum])
            # Calculate the score: 1 / len(d). Append each d_score to score.
            d_score = 1 / len(d)
            score.append(d_score)
        # Calculate the mean of score. Append to all_means.
        mean = np.mean(score)
        all_means.append(mean)
        # Append plotdata to all_plots.
        all_plots.append(plotdata)
        print(plotdata)
    # Create figure: boxplot with data from all_plots.
    xtick50 = "k=50, mean score over \n all topics: " + str(
        round(all_means[0], 4))
    xtick100 = "k=100, mean score over \n all topics: " + str(
        round(all_means[1], 4))
    xtick200 = "k=200, mean score over \n all topics: " + str(
        round(all_means[2], 4))
    fig = plt.figure(1, figsize=(9, 6))
    ax = fig.add_subplot(111)
    ax.boxplot(all_plots)
    ax.set_xticklabels([xtick50, xtick100, xtick200])
    ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ax.set_ylabel("Highest value of root repetition per topic",
                  rotation='vertical')
    ax.set_xlabel("k = topics")
    ax.set_title("UN")
    fig.savefig('../out/mallet/figures/testdez/un.png', bbox_inches='tight')
class Preprocess:
    def __init__(self):
        self.st = ISRIStemmer()
        self.getStopwords()
        self.getNegationwords()
        self.getSymbol()

    def analysis(self, line):
        line = self.enLine(line)
        line = self.tokenize(line)
        line = self.remSW(line)
        line = self.getTerms(line)
        line = self.remNE(line)
        line = self.removeNA(line)
        line = self.asLine(line)
        return line

    def analysisList(self, line_list):
        newList = list()
        for line in line_list:
            line = self.enLine(line)
            line = self.tokenize(line)
            line = self.remSW(line)
            line = self.getTerms(line)
            line = self.remNE(line)
            line = self.removeNA(line)
            line = self.asLine(line)
            newList.append(line)
        return newList

    def getStopwords(self):
        '''get stopwords from the stopwords file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'stopword.txt')
        f = open(file_path, 'r')
        stopwords = [line.rstrip() for line in f]
        sw = dict.fromkeys(stopwords)
        f.close()
        self.sw = [z.decode('utf-8') for z in sw]

    def getNegationwords(self):
        '''get negation words from the negation file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'negation.txt')
        f = open(file_path, 'r')
        newords = [line.rstrip() for line in f]
        ne = dict.fromkeys(newords)
        f.close()
        self.ne = [n.decode('utf-8') for n in ne]

    def getSymbol(self):
        '''get symbols from the symbol file'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'symbol.txt')
        f = open(file_path, 'r')
        sy = [line.rstrip() for line in f]
        ne = dict.fromkeys(sy)
        f.close()
        self.sy = [s.decode('utf-8') for s in sy]

    def enLine(self, line):
        '''convert line to unicode'''
        try:
            line = line.decode('utf-8')
            self.log_msg = "string is not UTF-8, length %d bytes" % len(line)
        except UnicodeError:
            self.log_msg = "string is UTF-8"
        for s in self.sy:
            try:
                s = s.decode('utf-8')
            except UnicodeError:
                log_msg = "string is UTF-8"
            line = line.replace(s, u' ' + s + u' ')
        #line = line.replace(u'.', u' . ')
        return line

    def removeNA(self, token):
        '''remove non-Arabic'''
        #x = re.compile(ur'[\u064B-\u065F]+', re.UNICODE)
        #line = [x.sub('', word) for word in line]
        x = re.compile(ur'[^\u0621-\u064A|_]+[\u1F300-\u1F5FF\u1F600-\u1F64F\u1F680-\u1F6FF\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        x = re.compile(ur'[\u0023]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        token = [word for word in self.asLine(token).split()]
        return token

    def tokenize(self, line):
        if len(line) > 50000:
            n = len(line) / 50000
            l = list()
            for i in range(1, n):
                start = (i - 1) * 50000
                end = i * 50000
                l = l + word_tokenize(line[start:end])
            token = l
        else:
            token = word_tokenize(line)
        return token

    def remSW(self, token):
        token_clean = [x for x in token if x not in self.sw]
        return token_clean

    def remNE(self, token):
        for i in range(len(token)):
            if token[i] in self.ne:
                temp = token[i]
                for x in range(i + 1, len(token)):
                    if token[x] in self.sy:
                        break
                    else:
                        token[x] = temp + '_' + token[x]
        token_clean = [x for x in token if x not in self.ne]
        token_clean = [x for x in token_clean if x not in self.sy]
        return token_clean

    def norma(self, word):
        if word[:2] == u'ال':
            word = word[2:]
        # alef variants -> bare alef
        x = re.compile(ur'[\u0622|\u0623|\u0625]+', re.UNICODE)
        word = x.sub(ur'\u0627', word)
        # alef maksura -> yeh
        x = re.compile(ur'[\u0649]+', re.UNICODE)
        word = x.sub(ur'\u064A', word)
        # teh marbuta -> heh
        x = re.compile(ur'[\u0629]+', re.UNICODE)
        word = x.sub(ur'\u0647', word)
        # remove tatweel
        x = re.compile(ur'[\u0640]+', re.UNICODE)
        word = x.sub(ur'', word)
        return word

    def getTerms(self, token):
        line = list()
        for i in range(len(token)):
            a = self.norma(token[i])
            a = self.st.stem(a)
            line.append(a)
        return line

    def asLine(self, token):
        return ' '.join(token)
import json

from nltk.stem.isri import ISRIStemmer

# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Read JSON file
with open('golden_corpus/build/golden_corpus_arabic.json') as data_file:
    golden_corpus = json.load(data_file)

stemmer = ISRIStemmer()
i = cpt_roots = 0
stemmed = ''
while i < len(golden_corpus) - 2:
    r = stemmer.stem(golden_corpus[i]["word"])
    if r == golden_corpus[i]["root"]:
        cpt_roots = cpt_roots + 1
    i = i + 1

rootssSuccessPercent = (cpt_roots * 100) / float(len(golden_corpus))

print "======================================================"
print "================= Test ISRI-stemmer =================="
print "================= with Golden_Corpus ================"
print "======================================================"
print "success rate roots = {:0.2f} %".format(rootssSuccessPercent)
print cpt_roots, " root cases are passed from: ", len(golden_corpus)
print "======================================================"
print "================= End Test ================"
print "======================================================"
def stemer(self, word):
    stem = ISRIStemmer()
    root = stem.stem(word)
    return root
case7p = [
    "استبدلتموهم",
    "فلتستقبلوهم"
]

case7 = [
    "فلنبلونهم"
]

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')
    s = ISRIStemmer()
    nltk.data.path.append('/home/kariminf/Data/NLTK/')
    fout = open("isri_test.txt", "w")
    fout.write("it(\"Case of 7 chars\", function() {\n")
    for case in case7:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")
    fout.write("it(\"Case of plus than 7 chars\", function() {\n")
    for case in case7p:
        print(case)
        fout.write("    expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")
    fout.close()
import nltk
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer

st = ISRIStemmer()
w = " البحث العلمي أو البحث أو التجربة التنموية هو أسلوب منظم في جمع المعلومات الموثوقة وتدوين الملاحظات والتحليل الموضوعي لتلك المعلومات باتباع أساليب ومناهج علمية محددة بقصد التأكد من صحتها أو تعديلها أو إضافة الجديد لها، ومن ثم التوصل إلى بعض القوانين والنظريات والتنبؤ بحدوث مثل هذه الظواهر والتحكم في أسبابها"
for a in word_tokenize(w):
    print(st.stem(a))
def getRootAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        result.append(arstemmer.stem(word))
    return ' '.join(result)
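A quick check of getRootAr; the word list would normally come from a tokenizer such as nltk.word_tokenize, and the roots printed depend on the ISRI stemmer.

# Hypothetical usage (not in the original source).
print(getRootAr([u'المكتبات', u'الكاتبون', u'مكتوب']))   # space-joined ISRI roots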
def stemTokenize(text):
    stemmer = ISRIStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

# Created by pyminifier (https://github.com/liftoff/pyminifier)
import sys

from nltk.stem.isri import ISRIStemmer

arstemmer = ISRIStemmer()
token = sys.argv[1]
root = arstemmer.stem(token)
print(root)
def stem(w):
    isri_stemmer = ISRIStemmer()
    return isri_stemmer.stem(w)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# test_isri.py
#

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
word = u"بمكتباتنا"
stem = stemmer.stem(word)
print stem.encode('utf8')