def read_data_from_file(filename):
    """Read and prepare the training or testing dataset by extracting and
    separating tweets from labels."""
    tweets = []        # list of text samples
    labels = []        # list of label ids
    labels_index = {}  # dictionary mapping label name to numeric id
    istemmer = ISRIStemmer()
    read_file = open(filename, "r+")  # read and write mode
    index = 0
    for line in read_file:
        line = line.split('\t')  # separate the label from the tweet itself
        label = line[0]
        tweet = line[1].strip(" \"")
        tweet = clean_str(tweet)
        tweet = istemmer.norm(tweet)
        if label not in labels_index:
            labels_index[label] = index
            index += 1
        tweets.append(tweet)
        labels.append(labels_index[label])
    read_file.close()
    return [tweets, labels]
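# A minimal usage sketch for the reader above (hedged: clean_str is defined
# elsewhere in the original project, so a trivial stub stands in for it here;
# the input is assumed to be one "label<TAB>tweet" pair per line).
import tempfile
from nltk.stem.isri import ISRIStemmer


def clean_str(s):  # placeholder for the project's real cleaning helper
    return s.strip()


with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False,
                                 encoding="utf-8") as _tmp:
    _tmp.write("pos\tتجربة جميلة جدا\n")
    _tmp.write("neg\tتجربة سيئة\n")

tweets, labels = read_data_from_file(_tmp.name)
print(tweets)   # the normalized tweet texts
print(labels)   # [0, 1]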
def steamming(self):
    st = ISRIStemmer()
    lis = self.tokenizer()
    xx = ""
    for i in lis:
        xx = xx + ' ' + st.stem(i)
    return xx
def stemm(tweetstr):
    stemmer = ISRIStemmer()
    stemstr = []
    for s in tweetstr:
        st = stemmer.stem(s)
        stemstr.append(st)
    return stemstr
def read_data_from_file(filename, number_of_classes):
    """Read and prepare the training or testing dataset by extracting and
    separating tweets from labels."""
    tweets = []  # list of text samples
    labels = []  # list of label ids
    istemmer = ISRIStemmer()
    read_file = open(filename, "r+")  # read and write mode
    for line in read_file:
        tweet = ""
        filtered_line = line.split()  # to get the tweet itself
        label = list(map(int, filtered_line[-11:]))
        for word in filtered_line[1:-11]:
            tweet += word + " "
        tweet = tweet[:-1]
        tweet = clean_str(tweet)
        tweet = istemmer.norm(tweet)
        tweets.append(tweet)
        labels.append(label)
    read_file.close()
    return [tweets, labels]
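# Hedged note on the multi-label variant above: the slicing implies each line is
#   <id> <word_1> ... <word_n> <label_1> ... <label_11>
# i.e. an id column, the tweet words, then eleven 0/1 label columns
# (number_of_classes is accepted but not used). A quick check of the slicing:
_line = "123 مثال بسيط للتجربة 1 0 0 0 1 0 0 0 0 0 1".split()
print(_line[1:-11])                  # the tweet words
print(list(map(int, _line[-11:])))   # the 11 label flags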
def finding_changeable_con(word, size):
    st = ISRIStemmer()
    stemmed_word = st.stem(word)
    if stemmed_word == word:
        for token in stem_dict:
            if token == word:
                print("Stemmed Word : " + token)
                for x in range(len(stem_dict[token])):
                    derived_word = stem_dict[token][x]
                    print("Derived Word : ")
                    print(derived_word)
                    print("Sentences : ")
                    occurrences_list = word_dict[derived_word]
                    concordances_output = get_changeable_con(
                        occurrences_list, size)
                    print(*concordances_output, sep="\n")
    else:
        for token in word_dict:
            if token == word:
                print("Word : " + token)
                print("Stemmed Word : " + stemmed_word)
                print("Sentences : ")
                occurrences_list = word_dict[token]
                concordances_output = get_changeable_con(
                    occurrences_list, size)
                print(*concordances_output, sep="\n")
    print("\n")
    print("\n")
def data_preprocessing(article):
    article = re.sub('\n', ' ', article)        # remove line breaks
    article = re.sub('الـ', '', article)        # remove this prefix
    article = re.sub('لـ', '', article)         # remove this prefix
    article = re.sub('بـ', '', article)         # remove this prefix
    article = re.sub('ال', '', article)         # remove the definite article
    article = re.sub('عربية نت ', '', article)  # remove this source name
    # Tokenize the article
    tokens = word_tokenize(str(article))
    # Build a translation table that drops punctuation
    remove_pun = str.maketrans('', '', string.punctuation)
    # Remove punctuation from each word
    words = [w.translate(remove_pun) for w in tokens]
    # Remove non-alphabetic tokens
    alphabetic_words = [word for word in words if word.isalpha()]
    # Remove Arabic stopwords
    alphabetic_words = [
        word for word in alphabetic_words if word not in stop_words
    ]
    # Initialize the Arabic stemmer
    stemer = ISRIStemmer()
    # Strip suffixes from each word
    stemmed_words = [stemer.suf32(word) for word in alphabetic_words]
    # Join and return the stemmed words
    return " ".join(stemmed_words)
def stemming_ISR(self, text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_text = " ".join(stemmed_words)
    return stemmed_text
def Stem_word(self, body):
    st = ISRIStemmer()
    word = body.split(u" ")
    word_stem = list()
    for w in word:
        word_stem.append(st.stem(w))
    body = " ".join(word_stem)
    return body
def steaming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
def stem(text):
    # [st.stem(word) for word in text if word not in set(stopwords.words('english'))]
    st = ISRIStemmer()
    temp_text = ""
    for word in text.split():
        # print(st.stem(word))
        temp_text += st.stem(word) + " "
    text = temp_text
    return text
def one_string_steming(sentence):
    '''
    Argument: a string of words.
    Returns: a list of stemmed words, i.e. the root of each word.
    '''
    sentence = one_string_tokenization(sentence)
    stemmer = ISRIStemmer()
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence
def build_stem_dictionary(preprocessed_text, stop_words):
    # This method builds the roots dictionary as follows:
    # {'stemmed_word1': ['derived_word1', 'derived_word2', ...],
    #  'stemmed_word2': ['derived_word1', 'derived_word2', 'derived_word3', ...], ...}
    st = ISRIStemmer()
    words_list = word_tokenize(preprocessed_text)
    for token in words_list:
        if token not in stop_words and token not in ['.']:
            stemmed_token = st.stem(token)
            if not stem_dict.get(stemmed_token):
                stem_dict[stemmed_token] = []
            if token not in stem_dict[stemmed_token]:
                stem_dict[stemmed_token].append(token)
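# A small sketch of how the roots dictionary above fills up (assumes the
# module-level stem_dict that build_stem_dictionary mutates, plus the NLTK
# 'punkt' data for word_tokenize; the sample sentence is illustrative only).
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

stem_dict = {}  # the global dictionary the function writes into
build_stem_dictionary("الطلاب يكتبون الدروس والمعلم يكتب الشرح .", stop_words=set())
for root, derived_words in stem_dict.items():
    print(root, derived_words)   # each ISRI root -> the surface forms seen for it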
def arabic_social_media_text_filter(txt, debug=0):
    """
    Filter Arabic text coming from social media.

    :param txt: utf-8 text, unicode
    :param debug: any value greater than 0 prints messages about normalized vs. original text
    :return: the filtered text with diacritics removed
    """
    txt = social_media_text_filter(txt, debug=debug)
    # Remove diacritics
    st = ISRIStemmer()
    txt = st.norm(txt)
    return txt
def stemLexicon(self, newLex):
    # newLex = prepareLexicon()
    stemmed_Lexicon_words = []
    polarity_Lex = []
    stLex = ISRIStemmer()
    for index, column in newLex.iloc[:].iterrows():
        word = newLex.at[index, 'ngram']
        polarity = newLex.at[index, 'polarity']
        stemmed_Lexicon_words.append(stLex.stem(word))
        polarity_Lex.append(polarity)
    stemmed_Lexicon_DF = pd.DataFrame({
        'ngram': stemmed_Lexicon_words,
        'polarity': polarity_Lex
    })
    return stemmed_Lexicon_DF  # a DataFrame of stemmed n-grams with their polarities
def tokenize_documents(documents):
    # Common words to be filtered out
    stop_words = stopwords.words('english') + stopwords.words('spanish')
    english = EnglishStemmer()
    arabic = ISRIStemmer()
    punctuation = {ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # Returns False for common words, links, and strange patterns
        if (token in filtered) or (token[0:4] == u'http') or \
           (token in string.punctuation):
            return False
        else:
            return True

    for doc in documents:
        row = doc[0]
        doc = doc[1]
        if doc is not None:
            # Remove trailing whitespace
            doc = doc.strip()
            # Remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # Lowercase letters
            doc = doc.lower()
            # Remove punctuation
            doc = doc.translate(punctuation)
            # Tokenization: handles documents with Arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)
            cleaned_tokens = []
            for token in tokens:
                # For valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi',
                                 u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token))
                    cleaned_tokens.append(token)
            yield row
            yield cleaned_tokens
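# Usage sketch for the generator above: it yields the row id and the cleaned
# token list alternately, so it is easiest to consume in pairs. Assumes the
# NLTK 'stopwords' corpus is available and that nltk, re, string,
# EnglishStemmer, ISRIStemmer and stopwords are imported as in the original module.
docs = [(1, "Muammar Gadafi spoke in Tripoli http://example.com @user")]
gen = tokenize_documents(docs)
for row, tokens in zip(gen, gen):   # pair consecutive yields as (row, tokens)
    print(row, tokens)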
def get_test_negative_array_stemmed_without_sw(self):
    stemmer = ISRIStemmer()
    test_negative_array_stemmed_without_sw = []
    review_words_stemmed_without_sw = []
    for review in self.get_test_negative_array(self):
        review_words = nltk.word_tokenize(review)
        review_words_without_sw = [
            i for i in review_words if i not in self.get_arabic_sw(self)
        ]
        review_words_stemmed_without_sw = []
        for word in review_words_without_sw:
            review_words_stemmed_without_sw.append(stemmer.stem(word))
        test_negative_array_stemmed_without_sw.append(
            " ".join(str(x) for x in review_words_stemmed_without_sw))
    return test_negative_array_stemmed_without_sw
def stem(string):
    # Split the given string into words
    words = string.split()
    stems_list = []
    isri_stemmer = ISRIStemmer()
    for word in words:
        # Stem the word
        stem_word = isri_stemmer.stem(word)
        # Add the new stem to the list
        stems_list.append(stem_word)
    return stems_list
def sentencePreprocessingDF(self, df, row, col):
    # Includes stopword removal, elongated-word normalization, and stemming
    arabic_sw_file = open("arabic_stop_words.txt", 'r+')
    ar_sw_list = arabic_sw_file.read()
    ar_sw_list = word_tokenize(ar_sw_list)
    st = ISRIStemmer()
    tokenized_word_list = []
    tokenized_sentence = []
    words = word_tokenize(df.at[row, col])
    for word in words:
        if word not in ar_sw_list:
            word = self.replaceElongated(word)
            tokenized_word_list.append(st.stem(word))
    tokenized_sentence = " ".join(tokenized_word_list)
    return tokenized_sentence
def stem(string, stemmer="porter", **kwargs): if stemmer == "porter": impl = PorterStemmer() elif stemmer == "lancaster": impl = LancasterStemmer() elif stemmer == "regex": regexp = kwargs['regexp'] if 'min' in kwargs: min = kwargs['min'] else: mins = 0 impl = RegexpStemmer(regexp=regexp, min=min) elif stemmer == "isri": impl = ISRIStemmer() elif stemmer == "snowball": if 'language' in kwargs: language = kwargs['language'] else: language = 'english' impl = SnowballStemmer(language=language) elif stemmer == "rslp": impl = RSLPStemmer() elif stemmer == "cistem": if 'case_insensitive' in kwargs: case_insensitive = kwargs['case_insensitive'] else: case_insensitive = False impl = Cistem(case_insensitive=case_insensitive) else: return string return impl.stem(string)
def get_features(comment, lan):
    words = list(comment)
    if lan == 'ar':
        st = ISRIStemmer()
        features = [0] * len(word_features_ar2)
        for w in words:
            w = st.stem(w)
            if w in word_features_ar_dict:
                features[word_features_ar_dict[w]] = 1
    else:
        features = [0] * len(word_features_en2)
        for w in words:
            w = stem(w)
            if w in word_features_en_dict:
                features[word_features_en_dict[w]] = 1
    return features
def text_stemming(self):
    """Stem the text."""
    if self.language == "french":
        stemmer = FrenchStemmer()
    elif self.language == "english":
        stemmer = PorterStemmer()
    elif self.language in ("italian", "german", "spanish", "dutch",
                           "portuguese", "danish"):
        stemmer = SnowballStemmer(self.language)
    elif self.language == "greek":
        stemmer = GreekStemmer()
    elif self.language == "arabic":
        stemmer = ISRIStemmer()
    else:
        print("Language needs to be one of: french, english, italian, german, "
              "spanish, dutch, portuguese, danish, greek or arabic")
        return  # avoid using an undefined stemmer below
    self.text = ' '.join(
        [stemmer.stem(word) for word in word_tokenize(self.text)])
def __init__(self, file_name=None, lang=_SPANISH, stemming=False):
    """Initialize the parameters for a specific language."""
    self._text = os.getenv('TEXT', default='text')
    self.languages = [_SPANISH, _ENGLISH, _ARABIC]
    self.lang = lang
    if self.lang not in self.languages:
        raise ValueError("Language not supported: " + lang)
    self.text_model = TextPreprocessing(lang=self.lang)
    self.stem = stemming
    if self.lang == _ENGLISH:
        self.stemmer = PorterStemmer()
    elif self.lang == _ARABIC:
        from nltk.stem.isri import ISRIStemmer
        self.stemmer = ISRIStemmer()
    else:
        self.stemmer = SnowballStemmer(self.lang)
    self.emotions = {}
    self.stem_emotions = {}
    if file_name is not None:
        emo_file = file_name
    else:
        if self.lang in [_ENGLISH, _ITALIAN, _GERMAN, _ARABIC]:
            emo_file = self.lang[:2] + "." + _AFFECTIVE_FILE
        elif self.lang == _SPANISH:
            emo_file = "es." + _AFFECTIVE_FILE
        emo_file = os.path.join(PATH, 'data', emo_file)
    self.load_emotions(emo_file)
class BasicStemmer(Stemmer):

    def __init__(self):
        self.stemmer = ISRIStemmer()
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):
        rootList = []
        for token in tokens:
            # token = stemmer.norm(token)
            root = self.stemmer.pre32(token)
            rootList.append(root)
            print(token, " : ", root)
        return rootList

    def stem(self, word):
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)
        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        lines = open(filePath, "r", encoding="windows-1256").readlines()
        dictionary = nltk.defaultdict(list)
        for line in lines:
            if not re.match("^;.*", line):
                parts = line.split('\t')
                if len(parts) != 4:
                    break
                else:
                    [rootStem, stem, tag, enGloss] = parts
                    dictionary[rootStem].append(
                        [stem, tag, ' '.join(enGloss.split(';'))])
        return dictionary

    def verify(self, word):
        if self.stopWordsIndex.access(word):
            return True

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
def get_training_array_stemmed_without_sw(self):
    stemmer = ISRIStemmer()
    training_array_stemmed_without_sw = []
    for review in self.get_positive_reviews_stemmed_without_sw(self):
        training_array_stemmed_without_sw.append((review, 'pos'))
    for review in self.get_negative_reviews_stemmed_without_sw(self):
        training_array_stemmed_without_sw.append((review, 'neg'))
    return training_array_stemmed_without_sw
class Stemming:

    def __init__(self):
        self.st = ISRIStemmer()

    def stemWord(self, text):
        word_tokens = word_tokenize(text)
        filtered_sentence = [self.st.stem(w) + ' ' for w in word_tokens]
        return ''.join(filtered_sentence)
def stem_tokens(token_list, src_lang):
    """Return the stems of the given tokens, depending on the source language."""
    stemmed = []
    if src_lang == 'en':
        ps = PorterStemmer()
        for token in token_list:
            stemmed.append(ps.stem(token))
    if src_lang == 'ar':
        isri = ISRIStemmer()
        for token in token_list:
            stemmed.append(isri.stem(token))
    return stemmed
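# Quick check of the language switch above (a sketch; PorterStemmer and
# ISRIStemmer are assumed importable from NLTK as in the other snippets).
from nltk.stem import PorterStemmer
from nltk.stem.isri import ISRIStemmer

print(stem_tokens(["running", "cats", "easily"], "en"))
print(stem_tokens(["المعلمون", "يدرسون", "الكتاب"], "ar"))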
def WordsFiltires(tokenstem):
    """
    Remove Arabic stopwords from the given text and stem the remaining words.

    :param tokenstem: the input text
    :return WordsFiltires: the filtered, stemmed text as a single string
    """
    stopWords = set(stopwords.words('arabic'))
    stemmed_word = []
    WordsFiltires = []
    words = word_tokenize(tokenstem)
    st = ISRIStemmer()
    for word in words:
        if word in stopWords:
            continue
        stemmed_word.append(st.stem(word))
    WordsFiltires = ' '.join(stemmed_word)
    return WordsFiltires
def __init__(self, query, model, processed_corpus_path):
    self.model = model
    self.processed_corpus_path = processed_corpus_path
    self.query = query
    self.query_tokens = []
    self.query_term_freq = {}
    self.term_weights = {}
    self.stemmer = ISRIStemmer()
    self.threshold = 0.005
    self.top_res = 5
    self.ar_stop_words = []
    with open("/home/tex/Documents/IR/Wikipedia-Search-Engine/project/rankretrievalmodel/Arabic/stop_words", 'r') as infile:
        self.ar_stop_words = [word[:-1] for word in infile.readlines()]
    self.tokenize()
    self.remove_stop_words()
    self.stem_tokens()
    self.term_freq()
    self.tfidf()
class Books():

    def __init__(self, category_id):
        self.category_id = category_id
        print('Books Class instantiated for Category {}.'.format(category_id))
        # NLTK stemmer
        self.st = ISRIStemmer()
        # Get all stop words, plus individual letters (typos & printing issues)
        sw1 = get_stop_words('arabic') + stopwords.words("arabic")
        sw2 = [
            'ا', 'أ', 'إ', 'ذ', 'ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ',
            'ح', 'ج', 'ش', 'س', 'ي', 'ب', 'ل', 'ا', 'ال', 'ت', 'ن', 'م', 'ك',
            'ئ', 'ء', 'ؤ', 'ر', 'لا', 'ى', 'ة', 'و', 'ز', 'ظ'
        ]
        self.sw = set(sw1 + sw2)

    def not_sw(self, text):
        # Excludes stop words
        return (text not in self.sw) or self.st.stem(text) not in self.sw

    def not_small_big(self, text):
        # Excludes single letters, combined words, and stop words
        return (len(text) >= 3) and (len(text) <= 9)

    def get_book_id(self, index_url):
        return re.findall(r'13\d\\(\d+)', str(index_url))[0]

    def strip_text(self, text):
        return araby.strip_tatweel(araby.strip_tashkeel(text))

    # This method is the main reason for having this class, since Doc2Vec can
    # take an iterator to go through each file one at a time instead of
    # loading all the books into memory.
    def __iter__(self):
        for i, file_name in enumerate(
                glob('../../data/' + str(self.category_id) + '/*.json')):
            print('Started Book: {}.'.format(self.get_book_id(file_name)))
            try:
                with open(str(file_name)) as f:
                    book_text = json.load(f)['text']
                # Start processing
                start_time = time.time()
                processed_book = araby.tokenize(
                    self.strip_text(book_text),
                    conditions=[self.not_sw, araby.is_arabicword])
                print('Cleaned Book: {} in {} seconds.'.format(
                    self.get_book_id(file_name), time.time() - start_time))
                yield TaggedDocument(processed_book, [i])
            except:
                print("Fix {}".format(file_name))
def basic_init(self, lang=_SPANISH, sentence_delim=False, **kwargs):
    if sentence_delim is False:
        self._BEGIN_TAG = ""
        self._END_TAG = ""
    self.lang = lang
    self.sentence_delim = sentence_delim
    logger.info("sws for {}".format(lang))
    self.stopWords = self.get_stopwords(lang)
    self.tokenizer = TweetTokenizer()
    self.stemmer = None
    if self.lang in [_SPANISH, _ITALIAN, _PORTUGUESE]:
        self.stemmer = SnowballStemmer(_SPANISH, ignore_stopwords=False)
    elif self.lang == _ENGLISH:
        from nltk.stem.porter import PorterStemmer
        self.stemmer = PorterStemmer()
    elif self.lang == _ARABIC:
        from nltk.stem.isri import ISRIStemmer
        self.stemmer = ISRIStemmer()
def __init__(self, configFileName, stopWordsFileName,
             languageModelSerializationFileName, linksDBFileName, dataset):
    '''Constructor'''
    # The dataset to work on to extract the model
    self.dataset = []
    # Term/frequency language model
    self.languageModel = {}
    self.languageModelFreqInfo = {}
    # Dict of stop words
    self.stopWords = {}
    # Store the dataset
    self.dataset = dataset
    # Initialize number of terms per label
    self.numTermsPerLabel = {}
    # Initialize the links DB
    self.linksDB = {}
    self.linksDBFileName = linksDBFileName
    # Parse the configurations file
    self.ParseConfigFile(configFileName)
    # Instantiate the stemmer if stemming is enabled
    if self.enableStemming == "true":
        self.stemmer = ISRIStemmer()
    # Store the stop words
    self.UpdateStopWords(stopWordsFileName)
    # Store the serialization file
    self.languageModelSerializationFileName = languageModelSerializationFileName
    # Initialize total docs
    self.totalNumberOfDocs = len(self.dataset)
def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)    # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:  # exclude stop words from being processed
            word = arstemmer.pre32(word)      # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)      # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)        # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
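# Walk-through of the light stemmer above next to the full ISRI stem (a sketch;
# exact outputs vary with the NLTK version, so results are printed, not asserted).
from nltk.stem.isri import ISRIStemmer

_words = ["والمدرسة", "بالكتاب", "المعلمون"]
print(lightStemAr(_words))                               # affixes stripped only
print(" ".join(ISRIStemmer().stem(w) for w in _words))   # full root extraction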
class Preprocess:

    def __init__(self):
        self.st = ISRIStemmer()
        self.getStopwords()
        self.getNegationwords()
        self.getSymbol()

    def analysis(self, line):
        line = self.enLine(line)
        line = self.tokenize(line)
        line = self.remSW(line)
        line = self.getTerms(line)
        line = self.remNE(line)
        line = self.removeNA(line)
        line = self.asLine(line)
        return line

    def analysisList(self, line_list):
        newList = list()
        for line in line_list:
            line = self.enLine(line)
            line = self.tokenize(line)
            line = self.remSW(line)
            line = self.getTerms(line)
            line = self.remNE(line)
            line = self.removeNA(line)
            line = self.asLine(line)
            newList.append(line)
        return newList

    def getStopwords(self):
        '''Get stopwords from the stopwords file.'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'stopword.txt')
        f = open(file_path, 'r')
        stopwords = [line.rstrip() for line in f]
        sw = dict.fromkeys(stopwords)
        f.close()
        self.sw = [z.decode('utf-8') for z in sw]

    def getNegationwords(self):
        '''Get negation words from the negation file.'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'negation.txt')
        f = open(file_path, 'r')
        newords = [line.rstrip() for line in f]
        ne = dict.fromkeys(newords)
        f.close()
        self.ne = [n.decode('utf-8') for n in ne]

    def getSymbol(self):
        '''Get symbols from the symbol file.'''
        module_dir = os.path.dirname(__file__)  # get current directory
        file_path = os.path.join(module_dir, 'symbol.txt')
        f = open(file_path, 'r')
        sy = [line.rstrip() for line in f]
        ne = dict.fromkeys(sy)
        f.close()
        self.sy = [s.decode('utf-8') for s in sy]

    def enLine(self, line):
        '''Convert a line to unicode and pad symbols with spaces.'''
        try:
            line = line.decode('utf-8')
            self.log_msg = "string is not UTF-8, length %d bytes" % len(line)
        except UnicodeError:
            self.log_msg = "string is UTF-8"
        for s in self.sy:
            try:
                s = s.decode('utf-8')
            except UnicodeError:
                log_msg = "string is UTF-8"
            line = line.replace(s, u' ' + s + u' ')
        #line = line.replace(u'.', u' . ')
        return line

    def removeNA(self, token):
        '''Remove non-Arabic tokens.'''
        #x = re.compile(ur'[\u064B-\u065F]+', re.UNICODE)
        #line = [x.sub('', word) for word in line]
        x = re.compile(ur'[^\u0621-\u064A|_]+[\u1F300-\u1F5FF\u1F600-\u1F64F\u1F680-\u1F6FF\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        x = re.compile(ur'[\u0023]+', re.UNICODE)
        token = [x.sub('', word) for word in token]
        token = [word for word in self.asLine(token).split()]
        return token

    def tokenize(self, line):
        if len(line) > 50000:
            n = len(line) / 50000
            l = list()
            for i in range(1, n):
                start = (i - 1) * 50000
                end = i * 50000
                l = l + word_tokenize(line[start:end])
            token = l
        else:
            token = word_tokenize(line)
        return token

    def remSW(self, token):
        token_clean = [x for x in token if x not in self.sw]
        return token_clean

    def remNE(self, token):
        for i in range(len(token)):
            if token[i] in self.ne:
                temp = token[i]
                for x in range(i + 1, len(token)):
                    if token[x] in self.sy:
                        break
                    else:
                        token[x] = temp + '_' + token[x]
        token_clean = [x for x in token if x not in self.ne]
        token_clean = [x for x in token_clean if x not in self.sy]
        return token_clean

    def norma(self, word):
        if word[:2] == u'ال':
            word = word[2:]
        # normalize alif variants to bare alif
        x = re.compile(ur'[\u0622|\u0623|\u0625]+', re.UNICODE)
        word = x.sub(ur'\u0627', word)
        # alif maqsura -> yaa
        x = re.compile(ur'[\u0649]+', re.UNICODE)
        word = x.sub(ur'\u064A', word)
        # taa marbuta -> haa
        x = re.compile(ur'[\u0629]+', re.UNICODE)
        word = x.sub(ur'\u0647', word)
        # remove tatweel
        x = re.compile(ur'[\u0640]+', re.UNICODE)
        word = x.sub(ur'', word)
        return word

    def getTerms(self, token):
        line = list()
        for i in range(len(token)):
            a = self.norma(token[i])
            a = self.st.stem(a)
            line.append(a)
        return line

    def asLine(self, token):
        return ' '.join(token)
import os
import sys
import json
import io

from nltk.stem.isri import ISRIStemmer

# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Read the JSON file
with open('golden_corpus/build/golden_corpus_arabic.json') as data_file:
    golden_corpus = json.load(data_file)

stemmer = ISRIStemmer()
i = cpt_roots = 0
stemmed = ''
while i < len(golden_corpus) - 2:
    r = stemmer.stem(golden_corpus[i]["word"])
    if r == golden_corpus[i]["root"]:
        cpt_roots = cpt_roots + 1
    i = i + 1

rootssSuccessPercent = (cpt_roots * 100) / float(len(golden_corpus))

print("======================================================")
print("================= Test ISRI-stemmer ==================")
print("================= with Golden_Corpus ================")
print("======================================================")
print("success rate roots = {:0.2f} %".format(rootssSuccessPercent))
import sys

import nltk
from nltk.stem.isri import ISRIStemmer

case7p = [
    "استبدلتموهم",
    "فلتستقبلوهم"
]

case7 = [
    "فلنبلونهم"
]

if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')

    s = ISRIStemmer()
    nltk.data.path.append('/home/kariminf/Data/NLTK/')

    fout = open("isri_test.txt", "w")

    fout.write("it(\"Case of 7 chars\", function() {\n")
    for case in case7:
        print(case)
        fout.write(" expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")

    fout.write("it(\"Case of plus than 7 chars\", function() {\n")
    for case in case7p:
        print(case)
        fout.write(" expect(morpho.stem(\"" + case + "\")).to.eql(\"" + s.stem(case) + "\"));\n")
    fout.write("});\n")
def remove_diacritics(text):
    arstemmer = ISRIStemmer()
    # norm with num=1 removes the diacritics representing Arabic short vowels
    result = arstemmer.norm(text, num=1)
    return result
def getRootAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        result.append(arstemmer.stem(word))
    return ' '.join(result)
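# Small demo of the two helpers above on a diacritized phrase (a sketch; ISRI
# roots can be more aggressive than a light stem, so outputs are just printed).
from nltk.stem.isri import ISRIStemmer

_phrase = "يَكْتُبُونَ الدُّرُوسَ الجَدِيدَةَ"
print(remove_diacritics(_phrase))      # same words with the short vowels stripped
print(getRootAr(_phrase.split()))      # the ISRI root of each word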