Example No. 1
def stem_data(dat):
    # Assumes module-level `tagger` (hazm.POSTagger) and `stop_words` objects.
    normalizer = hazm.Normalizer()
    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()

    dat = normalizer.normalize(dat)
    sentences = hazm.sent_tokenize(dat)

    words = []
    for s in sentences:
        tagged = tagger.tag(hazm.word_tokenize(s))

        # Drop stop words before lemmatizing.
        filtered = [token for token in tagged if token[0] not in stop_words]

        for token in filtered:
            # Lemmatize using the POS tag, then stem the lemma.
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmed = stemmer.stem(stemmed)
            # Skip empty results and verb lemmas of the form past#present.
            if stemmed and '#' not in stemmed:
                words.append(stemmed)

    return words
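stem_data relies on a POS tagger and a stop-word list created elsewhere; a minimal wiring sketch, with the model path taken as an assumption (it mirrors the later examples):

import hazm

# Assumed module-level objects used by stem_data (not part of the original snippet).
tagger = hazm.POSTagger(model='resources/postagger.model')
stop_words = set(hazm.stopwords_list())

# tokens = stem_data('ما بسیار کتاب می‌خوانیم')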
Example No. 2
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return words
Example No. 3
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
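A quick usage sketch for the two helpers above; since stemming does not change the number of tokens, countTextWords is effectively a token count. Exact stems depend on the installed hazm version:

# Hypothetical usage of the helpers defined above.
words = prepareText('کتاب‌ها را خواندیم')      # list of stems, e.g. 'کتاب‌ها' -> 'کتاب'
count = countTextWords('کتاب‌ها را خواندیم')   # equals len(hazm.word_tokenize(...))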
Example No. 4
 def __init__(self):
     self.preprocessed_docs = []
     self.normalizer = hazm.Normalizer()
     self.word_tokenizer = hazm.WordTokenizer()
     self.stemmer = hazm.Stemmer()
     self.stop_words = hazm.stopwords_list()
     # Map Arabic-script variants to their Persian equivalents and drop diacritics and stray symbols.
     self.persian_garbage = {
         u'÷': u'',
         u'ٰ': u'',
         u'،': ' ',
         u'؟': ' ',
         u'؛': '',
         u'َ': '',
         u'ُ': '',
         u'ِ': '',
         u'ّ': '',
         u'ٌ': '',
         u'ٍ': '',
         u'ئ': u'ی',
         u'ي': u'ی',
         u'ة': u'ه',
         u'ء': u'',
         u'ك': u'ک',
         u'ْ': u'',
         u'أ': u'ا',
         u'إ': u'ا',
         u'ؤ': u'و',
         u'×': u'',
         u'٪': u'',
         u'٬': u'',
         u'آ': u'ا',
         u'●': u''
     }
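The constructor above only builds the replacement table; the cleaning step that applies it is not part of the snippet. A minimal sketch, assuming a hypothetical clean_text method on the same class:

 # Hypothetical method (not in the original snippet): normalize, apply the
 # character map, tokenize, drop stop words, then stem.
 def clean_text(self, text):
     text = self.normalizer.normalize(text)
     for bad, good in self.persian_garbage.items():
         text = text.replace(bad, good)
     tokens = self.word_tokenizer.tokenize(text)
     return [self.stemmer.stem(t) for t in tokens if t not in self.stop_words]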
Example No. 5
def textNormalizer(lousyCollection):
    # Assumes `hz` is hazm (imported as hz) and `delete_Punc` is defined elsewhere.
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()

    docs = [delete_Punc(normalizer.normalize(doc)) for doc in lousyCollection]

    # Replace each document with its list of stemmed-then-lemmatized tokens.
    # enumerate keeps the indexing correct even when documents or tokens repeat.
    for i, doc in enumerate(docs):
        tokens = hz.word_tokenize(doc)
        docs[i] = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]
    return docs
Example No. 6
    def clean_fa(self, data):
        # Assumes `data` is a pandas DataFrame with a `text` column and that
        # fa_normalize, tokenizer, and stemLemmaStopWord are defined on the class.
        data.text = self.fa_normalize(data.text)
        data.text = self.tokenizer(data.text)

        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        stopwords = hazm.stopwords_list()
        alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))

        data.text = data.apply(
            lambda row: self.stemLemmaStopWord(
                stemmer, lemmatizer, stopwords, alphabet, row.text
            ),
            axis=1,
        )
        return data
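stemLemmaStopWord is referenced but not included in the snippet; a minimal sketch of what it might do, assuming the tokenizer step leaves row.text as a list of tokens:

    # Hypothetical helper (not in the original snippet): keep Persian-letter tokens,
    # drop stop words, then stem and lemmatize what remains.
    def stemLemmaStopWord(self, stemmer, lemmatizer, stopwords, alphabet, tokens):
        cleaned = []
        for token in tokens:
            if token in stopwords or not set(token) <= alphabet:
                continue
            cleaned.append(lemmatizer.lemmatize(stemmer.stem(token)))
        return cleaned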
Example No. 7
def similar(s1, s2):
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)

    stop_words = set(hazm.stopwords_list())
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]

    # Stem both sides so different inflections of the same word still match.
    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]
    stem_s2 = [stemmer.stem(word) for word in list_s2]

    same_words = set(stem_s1) & set(stem_s2)
    return len(same_words)
Example No. 8
    def stemming(self):
        if self.persian:
            stemmer = hazm.Stemmer()
            lemmatizer = hazm.Lemmatizer()
            for i in range(len(self.tokens)):
                self.tokens[i] = lemmatizer.lemmatize(
                    stemmer.stem(self.tokens[i]))
        else:
            porter = nltk.PorterStemmer()
            self.tokens = [porter.stem(word) for word in self.tokens]

            lemma = nltk.WordNetLemmatizer()
            self.tokens = [
                lemma.lemmatize(word, pos="v") for word in self.tokens
            ]
            self.tokens = [
                lemma.lemmatize(word, pos="n") for word in self.tokens
            ]
Example No. 9
 def __init__(self,
              mask=None,
              size=900,
              stop_words_addr=default_stop_words_path,
              mask_addr=None):
     self.hazm_normalizer = hazm.Normalizer()
     self.parsivar_normalizer = parsivar.Normalizer()
     self.stemmer = hazm.Stemmer()
     self.lemmatizer = hazm.Lemmatizer()
     self.stop_words = set(hazm.stopwords_list(stop_words_addr))
     mask = np.array(
         Image.open(mask_addr)) if mask_addr is not None else None
     # `WordCloud` here presumably refers to the Persian-aware WordCloudFa class
     # (the stock wordcloud.WordCloud has no `persian_normalize` argument).
     self.generator = WordCloud(width=size,
                                height=size,
                                include_numbers=False,
                                persian_normalize=False,
                                collocations=True,
                                mask=mask,
                                background_color='white')
Example No. 10
 def word_counter(self, text: str) -> tuple[float, dict]:
     text = text.lower()
     text = text.translate(str.maketrans(
         {'#': ' ', '$': ' ', '/': ' ', '+': ' ', '=': ' ', ':': ' ', ',': ' ', ';': ' ', '؛': ' ', '،': ' ',
          '.': ' ', '!': ' ', '؟': ' ', '?': ' ', '«': ' ', '»': ' ', '(': ' ', ')': ' ', '_': ' ', '-': ' ',
          '@': ' '}))
     text = hazm.Normalizer().normalize(text)
     text = hazm.word_tokenize(text)
     stemmer = hazm.Stemmer()
     keywords_dic = {word: 0 for word in self.keywords.keys()}
     value = 0.0
     for token in text:
         stemmed_word = stemmer.stem(token)
         if stemmed_word in keywords_dic:
             keywords_dic[stemmed_word] += 1
             if keywords_dic[stemmed_word] == 1:  # count each word only once
                 value += self.keywords[stemmed_word]
         if stemmed_word in self.filter_words:
             return 0, {}
     return value, keywords_dic
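word_counter expects its owner object to provide keywords (a stem-to-weight map) and filter_words (stems that disqualify the whole text); a hypothetical wiring sketch:

# Hypothetical owner class (not in the original snippet).
class KeywordScorer:
    def __init__(self, keywords, filter_words):
        self.keywords = keywords          # stem -> weight, e.g. {'کتاب': 1.0}
        self.filter_words = filter_words  # stems that zero out the whole text

    # word_counter from Example No. 10 would be defined here

# scorer = KeywordScorer({'کتاب': 1.0}, set())
# value, counts = scorer.word_counter('این کتاب‌ها را خواندم')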
Example No. 11
def preprocess_farsi(text):
    # Strip common wiki-markup tokens before normalization.
    prohibitedWords = ['[[', ']]', '{{', '}}', '{|', '|', '*', '==', '=', "'''", '_']
    big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
    new_text = big_regex.sub(" ", text)
    # print(new_text)
    ### Remove English characters
    new_text = re.sub(r'[a-zA-Z]','', new_text)
    ### Remove punctuation
    new_text = re.sub(r'[^\w\s]', ' ', new_text)
    normalizer = hazm.Normalizer(remove_extra_spaces=True, persian_style=True, persian_numbers=True, remove_diacritics=True, affix_spacing=True, token_based=False, punctuation_spacing=True)
    new_text = normalizer.normalize(new_text)
    ### Remove numbers
    new_text = re.sub(r'[۱۲۳۴۵۶۷۸۹۰]', ' ', new_text)
    ### Not handled by hazm's normalizer (left disabled)
    # new_text = new_text.replace('گی','ه')
    tokens = hazm.word_tokenize(new_text)
    stemmer = hazm.Stemmer()

    tokens = [word.replace('\u200c', '‌') for word in tokens ]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [word for word in tokens if word != '' ]
    return tokens
Example No. 12
def get_corrected_word(word):
    # Assumes a module-level `positional_indexer` exposing `inverted_index` and `get_all_words()`.
    if word[0] in string.ascii_letters:
        stemmer = nltk.PorterStemmer()
    else:
        stemmer = hazm.Stemmer()
    word = stemmer.stem(word)

    if word in positional_indexer.inverted_index:
        return word

    good_words = []
    for w in positional_indexer.get_all_words():
        if nltk.jaccard_distance(set(nltk.ngrams(w, n=2)),
                                 set(nltk.ngrams(word, n=2))) < 0.3:
            good_words.append(w)

    best_word = '###########################'
    for w in good_words:
        if nltk.edit_distance(w, word) < nltk.edit_distance(best_word, word):
            best_word = w

    return best_word
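positional_indexer is assumed to exist at module level; a minimal stub of the interface the function uses (hypothetical names), handy for trying the snippet in isolation:

# Hypothetical stand-in for the real positional index (not part of the original snippet).
class _StubIndexer:
    def __init__(self, vocabulary):
        self.inverted_index = {w: [] for w in vocabulary}

    def get_all_words(self):
        return list(self.inverted_index)

positional_indexer = _StubIndexer(['کتاب', 'کتابخانه', 'دانشگاه'])
# get_corrected_word then checks the stemmed word against inverted_index and, if absent,
# falls back to bigram-Jaccard candidates ranked by edit distance.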
Example No. 13
    def __init__(self, rouge_types, use_stemmer=False, lang='en'):
        """Initializes a new RougeScorer.

        Valid rouge types that can be computed are:
          rougeN (e.g. rouge1, rouge2): n-gram based scoring.
          rougeL: Longest common subsequence based scoring.

        Args:
          rouge_types: A list of rouge types to calculate.
          use_stemmer: Bool indicating whether a stemmer should be used to
            strip word suffixes to improve matching.
          lang: Language code; 'fa' selects hazm's Stemmer, any other value
            falls back to the Porter stemmer.
        """

        self.rouge_types = rouge_types
        self.lang = lang

        if lang == 'fa':
            self._stemmer = hazm.Stemmer() if use_stemmer else None
        else:
            self._stemmer = porter.PorterStemmer() if use_stemmer else None
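Only stemmer selection is shown above; a minimal sketch of how such a scorer might apply it per token, using a hypothetical helper name:

    # Hypothetical helper (not in the original snippet): stem tokens with
    # whichever stemmer __init__ selected, or pass them through unchanged.
    def _stem_tokens(self, tokens):
        if self._stemmer is None:
            return tokens
        return [self._stemmer.stem(token) for token in tokens]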
Example No. 14
normalizer = hazm.Normalizer()
# normalizer.normalize('اصلاح نويسه ها و استفاده از نیم‌فاصله پردازش را آسان مي كند')
# 'اصلاح نویسه‌ها و استفاده از نیم‌فاصله پردازش را آسان می‌کند'


# sentence tokenizer
# hazm.sent_tokenize('ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟')
# ['ما هم برای وصل کردن آمدیم!', 'ولی برای پردازش، جدا بهتر نیست؟']

# word tokenizer
# hazm.word_tokenize('ولی برای پردازش، جدا بهتر نیست؟')
# ['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']


# Stemmer
stemmer = hazm.Stemmer()
# stemmer.stem('کتاب‌ها')
# 'کتاب'


# Lemmatizer
lemmatizer = hazm.Lemmatizer()
# lemmatizer.lemmatize('می‌روم')
# 'رفت#رو'


# Tagger
# tagger = hazm.POSTagger(model='resources/postagger.model')
# tagger.tag(hazm.word_tokenize('ما بسیار کتاب می‌خوانیم'))
# [('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')]
Example No. 15
class TextNormalizer:
    to_remove_fa = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + "،؛,|:»«َُِّ<>؟÷×" + "٬ًٌٍؘَؙُؚِّْٰٕٖٜٟؐؑؒؓؔؕؖؗٓٔٗ٘ٙٚٛٝٞ"
    regex_fa_space = re.compile('[%s]' % re.escape(string.punctuation))
    regex_fa_none = re.compile('[%s]' % re.escape(to_remove_fa))
    to_replace_fa = [
        ("\u200c", " "), ("ـ", ""), ("آ", "ا"), ("ۀ", "ه"), ("ة", "ه"),
        ("ي", "ی"), ("ئ", "ی"), ("ء", ""), ("أ", "ا"), ("إ", "ا"), ("ؤ", "و"),
        ("ك", "ک"), ("۰", "0"), ("۱", "1"), ("۲", "2"), ("۳", "3"), ("۴", "4"),
        ("۵", "5"), ("۶", "6"), ("۷", "7"), ("۸", "8"), ("۹", "9")
    ]

    stemmer = hazm.Stemmer()

    regex_en_space = re.compile('[%s]' % re.escape(string.punctuation))
    regex_en_unwanted = re.compile(r'[^A-Za-z0-9\s]+')
    porter_stemmer = nltk.stem.PorterStemmer()

    @staticmethod
    def prepare_text(text, lang="fa", tokenize=True):
        if lang == "fa":
            return TextNormalizer.prepare_persian_text(text, tokenize)
        return TextNormalizer.prepare_english_text(text, tokenize)

    @staticmethod
    def get_word_language(text):
        farsi = False
        i = 0
        for ch in text:
            i += 1
            name = unicodedata.name(ch).lower()
            if 'arabic' in name or 'farsi' in name or 'persian' in name:
                farsi = True
                break
            if i == 10:
                break
        return "fa" if farsi else "en"

    @staticmethod
    def prepare_english_text(text, tokenize):
        t = text.casefold()
        t = TextNormalizer.regex_en_space.sub(' ', t)
        t = TextNormalizer.regex_en_unwanted.sub(' ', t)
        tokens = nltk.tokenize.word_tokenize(t)
        stemmed_tokens = []
        for x in tokens:
            word = TextNormalizer.porter_stemmer.stem(x)
            if word != "":
                stemmed_tokens.append(word)
        if tokenize:
            return stemmed_tokens
        return " ".join(stemmed_tokens)

    @staticmethod
    def prepare_persian_text(text, tokenize):
        t = text
        for tup in TextNormalizer.to_replace_fa:
            for ch in tup[0]:
                t = t.replace(ch, tup[1])
        t = TextNormalizer.regex_fa_space.sub(' ', t)
        t = TextNormalizer.regex_fa_none.sub('', t)
        tokens2 = hazm.word_tokenize(t)
        tokens = []
        for x in tokens2:
            # hazm can join multi-part tokens with "_", so split those back apart.
            tokens.extend(x.split("_"))
        stemmed_tokens = []
        for x in tokens:
            word = TextNormalizer.stemmer.stem(x)
            if word != "":
                stemmed_tokens.append(word)
        if tokenize:
            return stemmed_tokens
        return " ".join(stemmed_tokens)
Example No. 16
# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

lexicon_file_name = 'final_lexi'
data_path = './data/'

lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
Example No. 17
import hazm as Hazm
import sys
from StopWords import stop_words
import re
import json
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])

normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()
remove_non_persian_regex = re.compile('[^آ-ی]')
raw_text = remove_non_persian_regex.sub(
    ' ', raw_text)  # replace all non-Persian characters with spaces
normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)

result_tokens = list()
less_accurate_tokens = list()


def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
            exists = True
Example No. 18
zoomitComments.columns
zoomitComments.head()
zoomitComments=zoomitComments.drop(["ParentCommentid","UpdateDate2","CreateDate2","UpdatedByUserId","Name","Email"], axis=1)

zoomitComments['Message'] = zoomitComments['Message'].astype(str)
# Remove <br /> tags (a character class like '[<br />]' would also strip stray b/r/</> characters),
# add simple word/char counts, then collapse whitespace.
zoomitComments['Message'] = zoomitComments['Message'].agg(lambda x: reg.sub(r'<br\s*/?>', ' ', x))
zoomitComments['wordCount'] = zoomitComments['Message'].agg(lambda x: len(x.split(' ')))
zoomitComments['charCount'] = zoomitComments['Message'].agg(lambda x: len(x))
zoomitComments['Message'] = zoomitComments['Message'].agg(lambda x: reg.sub(r'\s+', ' ', x))

#zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join(reg.sub('.','',[w for w in x.split() if reg.match('([\w]+\.)+[\w]+(?=[\s]|$)',w)]))

stopWords = hm.stopwords_list()
zoomitComments['#_of_StopWords'] = zoomitComments['Message'].agg(lambda x: len([w for w in x.split() if w in stopWords]))

stemmer = hm.Stemmer()
zoomitComments['Message'] = zoomitComments['Message'].agg(lambda x: ' '.join(stemmer.stem(w) for w in x.split()))

pubComment=zoomitComments.loc[zoomitComments['Status']==1,:].loc[:,['Message']]
unpubComment=zoomitComments.loc[zoomitComments['Status']==0,:].loc[:,['Message']]


len(unpubComment)
zoomitComments['Status'].unique()


import matplotlib.pyplot as pPlot
from PIL import Image


commentsWord=""
Example No. 19
def persian_stemmer(tokens):
    # Stem each token with hazm's Stemmer (parameter renamed so it no longer shadows `list`).
    hazm_stemmer = hazm.Stemmer()
    return [hazm_stemmer.stem(token) for token in tokens]
Example No. 20
 def stem(self, word):
     return hazm.Stemmer().stem(word)
Example No. 21
 def stem(self, token_list):
     stemmer = hazm.Stemmer()
     return [stemmer.stem(token) for token in token_list]
Example No. 22
 def __init__(self):
     self.Normalizer = hazm.Normalizer()
     self.stopwords_list = hazm.stopwords_list()
     self.Stemmer = hazm.Stemmer()