def stemming_and_lemmatization(token):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    stemmed = stemmer.stem(token)
    lemmatized = lemmatizer.lemmatize(stemmed)
    return lemmatized
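# Usage sketch (illustrative, not part of the original source): assumes hazm
# is installed and Stemmer/Lemmatizer are imported as above. Note the
# lemmatizer runs on the stem, not the raw token, mirroring the function's
# order of operations; the sample word is made up.
from hazm import Stemmer, Lemmatizer

print(stemming_and_lemmatization('کتاب‌ها'))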
def score(self, sentences):
    # Predict
    pos, neg, neu = 0, 0, 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        for word in words:
            word = stemmer.stem(word)  # keep the stemmed form (the original discarded it)
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg += 1
            if class_result == 'pos':
                pos += 1
            if class_result == 'neu':
                neu += 1
    # NOTE: as in the original, the ratios divide by the word count of the
    # *last* sentence only, even though the counts accumulate over all sentences.
    positive_sentiment = float(pos) / len(words)
    # print('Positive: ' + str(positive_sentiment))
    neutral_sentiment = float(neu) / len(words)
    # print('Neutral: ' + str(neutral_sentiment))
    negative_sentiment = -float(neg) / len(words)
    # print('Negative: ' + str(negative_sentiment))
    total_sentiment = (positive_sentiment + negative_sentiment) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment
def get_stemmer(self, document):
    ''' Stemmer '''
    content = self.clear_document(document)
    result = self.split_document(content)
    stemmer = Stemmer()
    word_stems = [(item, stemmer.stem(item)) for item in result]
    return word_stems
def __init__(self, inFile, outFile):
    self.inFile = inFile
    self.outFile = outFile
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.lemmatizer = Lemmatizer()
    self.stemmer = Stemmer()
def stem(target_string):
    stemmed_string = ""
    stemmer = Stemmer()
    for single_word in target_string.split():
        stemmed_string += stemmer.stem(single_word) + " "
    return stemmed_string
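# Usage sketch (illustrative; assumes hazm is installed). The result keeps a
# trailing space, so strip it if exact output matters; the sentence is made up.
from hazm import Stemmer

print(stem('کتاب‌ها خوانده شدند').strip())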
def stem(self):
    """Stem every token in self.words.

    :return: list of lists of stemmed words, one inner list per sentence
    """
    stemmer = Stemmer()
    for words in self.words:
        temp = [stemmer.stem(str(word)) for word in words]
        self.stem_words.append(temp)
    return self.stem_words
def __init__(self, component_config: Dict[Text, Any] = None) -> None:
    super().__init__(component_config)
    # component_config is a plain dict, so use item access rather than
    # attribute access (the original `self.component_config.stemmer` would
    # raise AttributeError)
    if self.component_config.get('stemmer'):
        self._stemmer = Stemmer()
    if self.component_config.get('lemmatizer'):
        self._lemmatizer = Lemmatizer()
    if self.component_config.get('pos'):
        self._pos_tagger = POSTagger(model='resources/postagger.model')
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    # iterate over a copy so in-place removal does not skip elements
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
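# Usage sketch (illustrative; assumes hazm and re are imported as above).
# `preprocess` reads a module-level `stopwords` collection, so define one
# first — this tiny set and the sample sentence are stand-ins only.
stopwords = {'و', 'از'}
print(preprocess('کتاب‌های خوب و خواندنی از نویسندگان بزرگ'))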
def read_stops():
    # read stopwords file
    with codecs.open('stopwords.txt', encoding='utf-8') as reader:
        stops = set(reader.read().split(os.linesep))
    stemmer = Stemmer()
    return stops, stemmer
def prepare():
    normalizer = Normalizer()
    stemmer = Stemmer()
    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''
    tokenizer = WordTokenizer(join_verb_parts=True, separate_emoji=True,
                              replace_links=True, replace_IDs=True,
                              replace_emails=True, replace_numbers=True,
                              replace_hashtags=True)
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}
    normal_string = normalizer.normalize(string)
    for label in labels.keys():
        print(normal_string.find(label))
    exit(0)
    # NOTE: everything below is unreachable dead code kept from the original
    # source (`sent` is an int, so `label in sent` would raise TypeError).
    for i, sent in enumerate([1, 2, 3, 4]):
        entities = []
        (10, 15, 'PrdName')
        for label in labels.keys():
            print(f'{label} in {i}', label in sent)
        record = (sent, {'entities': entities})
        print()
def doc_stemmer(doc):
    # doc is a 3-level nested list: documents -> sentences -> words;
    # the original pre-built the empty nesting in separate loops, which a
    # single nested comprehension does equivalently
    stemmer = Stemmer()
    stem_doc_list = [[[stemmer.stem(word) for word in sentence]
                      for sentence in document]
                     for document in doc]
    return stem_doc_list
class PersianTextPreProcessor:
    def __init__(self):
        self.stemmer = Stemmer()
        self.normalizer = Normalizer()
        self.punctuations = string.punctuation

    def process_single_word(self, word):
        word = word.lower()
        word = re.sub(r'\d+', '', word)
        word = word.translate(
            str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        word = ' '.join(
            re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', word).split())
        word = word.strip()
        word = self.normalizer.normalize(word)
        word = self.stemmer.stem(word)
        return word

    def pre_stopword_process(self, text):
        # text = self.persian_text_cleaner.get_sentences(text)
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(
            str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        text = ' '.join(
            re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', text).split())
        text = text.strip()
        normalized_text = self.normalizer.normalize(text)
        words = word_tokenize(normalized_text)
        words = [w for w in words if w != '.']
        return words

    def clean_text(self, text, stopwords, remove_stopwords=True, stem=True):
        words = self.pre_stopword_process(text)
        if remove_stopwords:
            words = [w for w in words if w not in stopwords]
        if stem:
            words = [self.stemmer.stem(w) for w in words]
        return words

    def stem(self, words):
        return [self.stemmer.stem(w) for w in words]
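# Usage sketch (illustrative; assumes hazm, re, and string are imported as
# above — the stopword set and input text are stand-in examples only).
pre = PersianTextPreProcessor()
tokens = pre.clean_text('متن ۱۲۳ آزمایشی!', stopwords={'و', 'در'})
print(tokens)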
def TextCleaner(self):
    self.stopwordsList = ''
    Data = self.imported_data
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    dataList = Data
    table = str.maketrans('', '', punctuation)
    for i in range(0, len(dataList)):
        for j in range(0, len(dataList[i][0])):
            dataList[i][0][j] = stemmer.stem(dataList[i][0][j])
            dataList[i][0][j] = lemmatizer.lemmatize(dataList[i][0][j])
        dataList[i][0] = [word for word in dataList[i][0] if word.isalpha()]
        dataList[i][0] = [w.translate(table) for w in dataList[i][0]]
        dataList[i][0] = [word for word in dataList[i][0] if len(word) > 3]
    self.imported_data = dataList
    return self.imported_data
def perform_word_stemming(data_dict):
    from hazm import Stemmer
    stemmer = Stemmer()
    return_value = {}
    for folder_name in data_dict.keys():
        return_value[folder_name] = {}
        for file_name in data_dict[folder_name].keys():
            this_files_words = []
            for sent_text in data_dict[folder_name][file_name]:
                this_sentences_words = []
                for word in sent_text:
                    # this is a stem, not a lemma (the original variable was
                    # misleadingly named lemma_word)
                    stemmed_word = stemmer.stem(word)
                    this_sentences_words.append(stemmed_word)
                this_files_words.append(this_sentences_words)
            return_value[folder_name][file_name] = this_files_words
    return return_value
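# Usage sketch (illustrative; the folder/file names and tokens below are
# made up — the function expects folder -> file -> sentences -> words).
sample = {'news': {'doc1.txt': [['کتاب‌ها', 'و', 'قلم‌ها']]}}
print(perform_word_stemming(sample))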
def prepare_text(text, should_stem=True):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)

    # strip punctuation marks
    def fix_word(w):
        # for c in Text_cleaner.punct_list:
        #     w = w.replace(c, '')
        w = re.sub(Text_cleaner.punct_regex, '', w).replace('،', '')
        return "$" if w == "" else w

    punc_free = list(filter(lambda x: x != '$', map(fix_word, tokenized)))
    stemmer = Stemmer()
    if should_stem:
        stemmed_list = list(
            filter(lambda x: x != '', map(stemmer.stem, punc_free)))
    else:
        stemmed_list = punc_free
    return stemmed_list
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, 'pos': False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        # component_config is a plain dict, so use item access rather than
        # attribute access (the original `.stemmer` would raise AttributeError)
        if self.component_config.get('stemmer'):
            self._stemmer = Stemmer()
        if self.component_config.get('lemmatizer'):
            self._lemmatizer = Lemmatizer()
        if self.component_config.get('pos'):
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.get('pos'):
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.get('stemmer'):
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.get('lemmatizer'):
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(
                        token_str)
                if self.component_config.get('pos'):
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)
def stemmer(tweets):
    stem = Stemmer().stem  # build the stemmer once instead of once per tweet
    return [stem(tweet) for tweet in tweets]
from hazm import Normalizer, word_tokenize, Stemmer, WordTokenizer, stopwords_list
import re

# text preparation
from Phase1 import wiki_dump_parser_

stemmer = Stemmer()
normalizer = Normalizer()
tokenizer = WordTokenizer(separate_emoji=True, replace_links=True,
                          replace_IDs=True, replace_emails=True,
                          replace_hashtags=True, replace_numbers=True)
tokenizer.number_int_repl = '.'
tokenizer.number_float_repl = '.'
tokenizer.email_repl = '.'
tokenizer.hashtag_repl = '.'
tokenizer.id_repl = '.'
tokenizer.emoji_repl = '.'
tokenizer.link_repl = '.'
punctuations = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`؟،{|}~"""


# 10 points
def prepare_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    # the original snippet was cut off mid-expression here; the closing of the
    # join/sub call is completed following the identical pattern used in
    # PersianTextPreProcessor above
    text = ' '.join(
        re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ',
               text).split())
def get_literal(x):
    lit = literal_eval(x)
    subs = []
    for key in lit.keys():
        subs.append((key, list(lit[key]['sub_category'].keys())))
    return subs


def try_join(x):
    try:
        l = [stemmer.stem(w) for w in literal_eval(x) if w not in stopwords]
        return ' '.join(l)
    except:
        return np.nan


# farsi stemmer and stopwords from hazm
stemmer = Stemmer()
stopwords = stopwords_list()

print("Processing words...")
# remove stopwords and stem words in wikipedia corpus
with open('datasets/wiki.txt', 'r') as f:
    wiki = f.readlines()
words = [w.split(' ') for w in wiki]
words = [item for sublist in words for item in sublist]
words = np.unique(words)
words = np.fromiter((stemmer.stem(xi) for xi in words if xi not in stopwords),
                    words.dtype)

# fit count vectorizer on wikipedia corpus
count_vect = CountVectorizer(ngram_range=(1, 2))
def __init__(self):
    self.stemmer = Stemmer()
    self.normalizer = Normalizer()
    self.punctuations = string.punctuation
}


class EnglishPunctuationSet:
    def __contains__(self, item):
        return not item.isalpha()


LANGUAGE_PUNCTUATION = {
    FA: r'*\.:!،؛؟»\]\)\}«\[\(\{',
    EN: EnglishPunctuationSet(),
}

LANGUAGE_STEMMER = {
    FA: Stemmer().stem,
    EN: PorterStemmer().stem,
}

LANGUAGE_NORMALIZER = {
    EN: lambda x: x.lower(),
    FA: Normalizer().normalize,
}

LANGUAGE_TOKENIZER = {
    FA: WordTokenizer().tokenize,
    EN: word_tokenize,
}


def remove_stop_words(tokens, language):
def stemming(self, tokens):
    stemmer = Stemmer()
    return [stemmer.stem(token) for token in tokens]
    row_sums = confusion_matrix.sum(1)
    # recall = (
    #     confusion_matrix.diagonal()[row_sums.nonzero()] /
    #     row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])
    # print(labels)
    # print(confusion_matrix)
    return precision


if __name__ == '__main__':
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter.update([doc['cat']])
        docs.append(doc)

    # Python 2 print statements converted to Python 3 calls
    print(counter)
    all_words = []
    for doc in docs:
        all_words.extend(doc['words'])

    dist = nltk.FreqDist(word for word in all_words)
    word_features = dimension_reduction(all_words, dist)
    print(len(word_features) / float(len(all_words)) * 100.0)
# -*- coding: utf8 -*-
import numpy as np
from hazm import Stemmer, stopwords_list
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

persian_stemmer = Stemmer()


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (persian_stemmer.stem(w) for w in analyzer(doc))


# scikit-learn has no built-in 'persian' stop word list (the original
# stop_words='persian' raises ValueError at fit time), so hazm's
# stopwords_list() is passed explicitly instead
count_vect = StemmedTfidfVectorizer(stop_words=stopwords_list())

train_data = [
    'پایتون زبان برنامه نویسی خوبی است',
    'لینوکس یک سیستم عامل است',
    'گیاهان دارویی را باید استفاده کرد',
    'لینوکس یک سیستم عامل متن باز است',
    'پایتون زبان مناسبی برای یادگیری ماشینی است'
]
target = np.array([1, 2, 3, 2, 1])
train_counts = count_vect.fit_transform(train_data)
clf = MultinomialNB().fit(train_counts, target)
test_data = [
    'با پایتون میتوان در لینوکس برنامه نویسی کرد',
    'من لینوکس را دوست دارم'
]
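# Usage sketch (illustrative): classify the held-out sentences with the
# pipeline above using standard scikit-learn calls; the predicted labels
# shown in the comment are what one would expect, not a verified result.
test_counts = count_vect.transform(test_data)
print(clf.predict(test_counts))  # e.g. [1 2] for the python/linux topics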
class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List[str]) -> List[str]:
        # annotation fixed: this returns a list of tokens, not a str
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)
        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List[str]:
        # annotation fixed: hazm's tokenizer returns a list of tokens
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
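# Usage sketch (illustrative; assumes hazm, emoji, re, and typing.List are
# imported as the class above requires — the input sentence is made up).
print(Preprocessor.preprocess('کتاب‌ها را می‌خوانم! 😊'))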
class POS():
    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        # map hazm POS tags to Persian display names
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
<meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
</body>
</html>"""

    def divHTML(self, color, text):
        # the original snippet was truncated before the closing triple quote;
        # it is completed here after the final </div>
        return """
<div style="background-color:""" + color + """">
""" + """<h4>""" + text + """</h4>
""" + """</div>
"""