Example #1
 def __init__(self):
     self.preprocessed_docs = []
     self.normalizer = hazm.Normalizer()
     self.word_tokenizer = hazm.WordTokenizer()
     self.stemmer = hazm.Stemmer()
     self.stop_words = hazm.stopwords_list()
     self.persian_garbage = {
         u'÷': u'',
         u'ٰ': u'',
         u'،': ' ',
         u'؟': ' ',
         u'؛': '',
         u'َ': '',
         u'ُ': '',
         u'ِ': '',
         u'ّ': '',
         u'ٌ': '',
         u'ٍ': '',
         u'ئ': u'ی',
         u'ي': u'ی',
         u'ة': u'ه',
         u'ء': u'',
         u'ك': u'ک',
         u'ْ': u'',
         u'أ': u'ا',
         u'إ': u'ا',
         u'ؤ': u'و',
         u'×': u'',
         u'٪': u'',
         u'٬': u'',
         u'آ': u'ا',
         u'●': u''
     }
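
A minimal sketch of how the mapping above could be applied, assuming a hypothetical clean() method on the same class (not part of the original example):

 def clean(self, text):
     # replace or strip each noisy character listed in persian_garbage,
     # then run hazm's normalizer over the result
     for src, dst in self.persian_garbage.items():
         text = text.replace(src, dst)
     return self.normalizer.normalize(text)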
Example #2
import hazm


def similar(s1, s2):
    """Count the stemmed, non-stopword tokens shared by two Persian sentences."""
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)

    stop_words = set(hazm.stopwords_list())
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]

    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]
    stem_s2 = [stemmer.stem(word) for word in list_s2]

    same_words = set.intersection(set(stem_s1), set(stem_s2))
    return len(same_words)
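
A quick way to exercise similar() once hazm is installed; the two sample sentences below are arbitrary:

a = 'کتاب‌های جدید را خریدم'
b = 'کتاب جدیدی خریده بودم'
print(similar(a, b))  # number of stems shared by the two sentences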
Example #3
import re

from hazm import Normalizer, Stemmer, stopwords_list, word_tokenize

normalizer = Normalizer()
stemmer = Stemmer()
# `punctuations` is a string of punctuation characters defined elsewhere
# in the original project.


def prepare_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    text = ' '.join(
        re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ',
               text).split())
    text = text.strip()
    normalized_text = normalizer.normalize(text)
    words = word_tokenize(normalized_text)
    words = [w for w in words if w != '.']
    words = [w for w in words if w not in stopwords_list()]
    words = [stemmer.stem(w) for w in words]
    return words
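
A minimal way to try prepare_text(), assuming punctuations is set to Python's string.punctuation plus a few Persian marks (an assumption made for this demo, not taken from the original project):

import string

punctuations = string.punctuation + '،؛؟«»'  # assumed definition for the demo

sample = 'این یک متن آزمایشی است! 123'
print(prepare_text(sample))  # stemmed tokens with digits, punctuation and stopwords removed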
Example #4
    def __init__(self):
        self.punctuations = [
            '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
            '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
            '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
            '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
            '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
            '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
            '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
            '•', '●'
        ]
        self.diacritics_pattern = re.compile(
            "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
        self.emojis_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            "]+",
            flags=re.UNICODE)
        self.latin_characters_pattern = re.compile("["
                                                   "\u0041-\u007a"
                                                   "\u00c0-\u036f"
                                                   "\u0400-\u050f"
                                                   "\u0342-\u03ff"
                                                   "]")
        self.numbers_pattern = re.compile("[0-9]")
        self.space_patterns = [
            (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
            (re.compile("[\f\r\t\n]"), ' '),
            (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
             '\u200c'),
            (re.compile(
                "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
                "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
        ]
        self.stopwords = hazm.stopwords_list()[:200] + [
            'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
            'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
            'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
            'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
            'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
            'نیستند'
        ]

        self.normalizer = parsivar.Normalizer()
        self.stemmer = parsivar.FindStems()
        self.lemmatizer = hazm.Lemmatizer()
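
A sketch of how the compiled patterns above might be chained; the clean() method below is hypothetical and not part of the original class:

    def clean(self, text):
        # hypothetical pipeline chaining the attributes defined in __init__
        text = self.normalizer.normalize(text)
        text = ''.join(' ' if ch in self.punctuations else ch for ch in text)
        text = self.emojis_pattern.sub(' ', text)
        text = self.latin_characters_pattern.sub(' ', text)
        text = self.numbers_pattern.sub(' ', text)
        text = self.diacritics_pattern.sub('', text)
        for pattern, replacement in self.space_patterns:
            text = pattern.sub(replacement, text)
        tokens = [t for t in hazm.word_tokenize(text) if t not in self.stopwords]
        return [self.lemmatizer.lemmatize(t) for t in tokens]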
Example #5
    def __init__(self, feature_set, orientations=None, language='english'):
        self.language = language
        # the normalizer / stopwords / regex_words / hash_dictionary dicts are
        # assumed to be initialized elsewhere in the class
        self.normalizer[language] = hazm.Normalizer()
        if language == 'persian':
            self.stopwords[language] = hazm.stopwords_list()
            self.regex_words[language] = r"[\w']+|[.,!?;،؟؛]"
        else:
            self.stopwords[language] = set(stopwords.words('english'))
            self.regex_words[language] = r"[\w']+|[.,!?;]"

        if orientations:
            self.orientations = orientations

        self.feature_set = feature_set
        self.weights = {}
        self.hash_dictionary[self.language] = {}

    def clean_fa(self, data):
        data.text = self.fa_normalize(data.text)
        data.text = self.tokenizer(data.text)

        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        stopwords = hazm.stopwords_list()
        alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))

        data.text = data.apply(
            lambda row: self.stemLemmaStopWord(
                stemmer, lemmatizer, stopwords, alphabet, row.text
            ),
            axis=1,
        )
        return data
 def __init__(self,
              mask=None,
              size=900,
              stop_words_addr=default_stop_words_path,
              mask_addr=None):
     self.hazm_normalizer = hazm.Normalizer()
     self.parsivar_normalizer = parsivar.Normalizer()
     self.stemmer = hazm.Stemmer()
     self.lemmatizer = hazm.Lemmatizer()
     self.stop_words = set(hazm.stopwords_list(stop_words_addr))
     mask = np.array(
         Image.open(mask_addr)) if mask_addr is not None else None
     self.generator = WordCloud(width=size,
                                height=size,
                                include_numbers=False,
                                persian_normalize=False,
                                collocations=True,
                                mask=mask,
                                background_color='white')
Example #8
    def search_wikipedia(cls, word):
        page = requests.get("https://fa.wikipedia.org/wiki/" + word)
        soup = BeautifulSoup(page.content, features="html.parser")

        word_tokenized = []
        try:
            paragraphs = soup.find_all('p')
            for paragraph in paragraphs:
                word_tokenized.append(word_tokenize(paragraph.get_text()))

            # collect alphabetic, non-stopword tokens
            stop_words = set(stopwords_list())
            filtered_words = []
            for tokens in word_tokenized:
                for word in tokens:
                    if word.isalpha() and word not in stop_words:
                        filtered_words.append(word)

            most_common_words = (
                collections.Counter(filtered_words).most_common(10))
            return most_common_words

        except Exception:
            return "error"
Example #9
from hazm import stopwords_list


def get_stopwords():
    with open("stopwords.txt", 'r') as f:
        results = f.readline().split()
    return results


stopwords = stopwords_list()
punctuation = get_stopwords()

all_stopwords = punctuation + stopwords + ["NUM"] + [
    'آقا', 'آور', 'افزا', 'باش', 'بردار', 'بست', 'بند', 'توان', 'توانست',
    'دارا', 'دان', 'ده', 'رس', 'ریخت', 'ریز', 'سال', 'سو', 'شخص', 'شو', 'هست',
    'وقت', 'کس', 'کن', 'گذار', 'گذاشت', 'گرد', 'گشت', 'گو', 'گیر', 'یاب'
] + ['بس']
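
A small sketch of how all_stopwords would typically be used to filter a tokenized Persian sentence (the sample sentence is arbitrary):

from hazm import word_tokenize

tokens = word_tokenize('این کتاب بسیار خواندنی است')
filtered = [t for t in tokens if t not in all_stopwords]
print(filtered)  # only the non-stopword tokens remain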
Example #10
import pickle

from hazm import Normalizer, Stemmer, Lemmatizer, sent_tokenize, word_tokenize, stopwords_list

stops = set(stopwords_list())


def load_dataset(file_name, column_name='question'):
    data = pickle.load(open(file_name, "rb"))
    statements = []
    for i in range(len(data)):
        statements.append(data[i][column_name])
    return statements


def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word) for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement


def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        ...  # the loop body is truncated in the original example
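
A brief usage sketch for statement_pre_processing() above, assuming hazm's data files are available:

cleaned = statement_pre_processing('کتاب‌ها را به کتابخانه برگرداندم')
print(cleaned)  # lemmatized tokens with stopwords removed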
Example #11
# assumed context: `import re as reg`, `import hazm as hm`; zoomitComments is a pandas DataFrame
zoomitComments.dtypes
zoomitComments.describe()
zoomitComments.columns
zoomitComments.head()
zoomitComments=zoomitComments.drop(["ParentCommentid","UpdateDate2","CreateDate2","UpdatedByUserId","Name","Email"], axis=1)

zoomitComments['Message']=zoomitComments['Message'].astype(str)
zoomitComments['Message'] = zoomitComments['Message'].agg(lambda x: reg.sub(r'<br\s*/?>', ' ', x))
zoomitComments['wordCount'] = zoomitComments["Message"].agg(lambda x: len(x.split(" ")))
zoomitComments['charCount'] = zoomitComments["Message"].agg(lambda x: len(x))
zoomitComments['Message'] = zoomitComments['Message'].agg(lambda x: reg.sub(r'\s+', ' ', x))

#zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join(reg.sub('.','',[w for w in x.split() if reg.match('([\w]+\.)+[\w]+(?=[\s]|$)',w)]))

stopWords=hm.stopwords_list()
zoomitComments['#_of_StopWords']=zoomitComments['Message'].agg(lambda x: len([w for w in x.split() if w in stopWords]))

stemWords=hm.Stemmer()
zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join([stemWords.stem(w) for w in x.split()]))

pubComment=zoomitComments.loc[zoomitComments['Status']==1,:].loc[:,['Message']]
unpubComment=zoomitComments.loc[zoomitComments['Status']==0,:].loc[:,['Message']]


len(unpubComment)
zoomitComments['Status'].unique()


import matplotlib.pyplot as pPlot
from PIL import Image
Example #12
import hazm as hz
from PersianStemmer import PersianStemmer
import search_engine.words_lists as wl
from itertools import combinations
import re
import string
import search_engine.configurations as config

ps = PersianStemmer()
stop_words = hz.stopwords_list()
normalizer = hz.Normalizer()
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()


def process_single_document(doc_content):
    # 1: remove html tags and irrelevant contents
    cleaned_content_from_tag = remove_tags(doc_content)
    # 2: normalize text
    normalize_text = normalizer.normalize(cleaned_content_from_tag)
    # 3: tokenize
    words_token = hz.word_tokenize(normalize_text)
    config.pure_number_tokens += len(words_token)
    return words_token


def preprocess_single_word_in_query(word):
    # 1: normalized
    word = normalizer.normalize(word)
    # 2: lemmatized and stemmer
    word_lemmatized = lemmatizer.lemmatize(word)
Example #13
 def __init__(self):
     self.Normalizer = hazm.Normalizer()
     self.stopwords_list = hazm.stopwords_list()
     self.Stemmer = hazm.Stemmer()
Example #14
import re
from typing import List

import emoji
from hazm import Lemmatizer, Normalizer, Stemmer, WordTokenizer, stopwords_list


class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List) -> List:
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)

        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
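
A minimal usage sketch for the Preprocessor facade above (note that emoji.get_emoji_regexp() requires an emoji package version older than 2.0):

sample = 'سلام! این یک جملهٔ آزمایشی است 😊'
print(Preprocessor.preprocess(sample))  # normalized, lemmatized text without stopwords or emojis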
Example #15
# Create lookup dictionaries
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Label']].values)

# Plot the frequency of each emoji
fig = plt.figure(figsize=(8,6))
df2.groupby('Label').comment.count().sort_values().plot.bar(
    ylim=0, title= 'Term Frequency of each Emoji \n')
plt.xlabel('\n Number of occurrences', fontsize = 10);
plt.show()

# Clean the comments
normalizer = hazm.Normalizer()
tokenizer = hazm.SentenceTokenizer()
tokens = hazm.word_tokenize
S_Words = list(hazm.stopwords_list())

# Text representation
tfidf = TfidfVectorizer(lowercase=False, 
                        preprocessor=normalizer.normalize, 
                        tokenizer=tokens,
                        ngram_range=(1, 2),
                        stop_words=S_Words)
comments = df2.comment
features = tfidf.fit_transform(comments).toarray()
labels = df2.category_id

# Compare all models
models = [
    MultinomialNB(),
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0),
    # ... (the rest of the models list is truncated in the original example)
]
Example #16
        punctuations_list = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

        def remove_punctuations(text):
            translator = str.maketrans('', '', punctuations_list)
            return text.translate(translator)

        words = remove_punctuations(words)
        words = re.sub(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))',
            '',
            words,
            flags=re.MULTILINE)
        words = re.sub(r"@(\w+)", ' ', words, flags=re.MULTILINE)
        wordcloud = WordCloudFa(persian_normalize=True,
                                stopwords=list(STOPWORDS) +
                                hazm.stopwords_list(),
                                include_numbers=False,
                                background_color='white',
                                width=700,
                                height=500)
        frequencies = wordcloud.process_text(words)
        wc = wordcloud.generate_from_frequencies(frequencies)
        image = wc.to_image()
        st.image(image)

        # Dataframe
        st.subheader('**Data**')
        st.write(data)
        # Random Tweet
        col1, col2 = st.beta_columns(2)
        with col1:
Example #17
import hazm

stopwords = ['ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم', 'تان', 'ید', 'اید', 'شان', 'ند', 'اند',
             'است', 'هست', 'بود', 'شد', 'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش',
             'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست', 'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم',
             'نیستید', 'نیستند'] + hazm.stopwords_list()[:200]
punctuations = ['.', '‫‪،‬‬', '!', '؟', '?', ':', '؛', '(', ')', '{', '}', '[', ']', '«', '»', '-', '/', '٪', '%', '"',
                "'", '،', '_', '=', '<', '>', '+', '@', '*', ',', ';', '&', '#', '٬', '`', '|', ',']
diacritics = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654',
              '\u0655']
character_mapping = {
    'ا': ['ا', 'إ', 'أ', 'آ', 'ٱ'],
    'و': ['و', 'ؤ'],
    'ی': ['ی', 'ي', 'ئ'],
    'ک': ['ک', 'ك'],
    'ه': ['ه', 'ة', 'ۀ'],
}
english_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
half_spaces = ['\u200C', '\u200f', '\xa0']
multi_words_token = ['‫چنان چه‬', 'بنا بر این', 'مع ذلک', 'فی مابین', 'فوق العاده', 'بی شک', 'در خصوص', 'این که',
                     'به دلیل', 'به خاطر', 'بر اساس', 'از جمله', 'با توجه به', 'اشاره به', 'بین الملل', 'در راستا',
                     'در اختیار', 'خاطر نشان', 'ما فوق', 'بدین شکل']
digit_characters = '1234567890۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩'
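
A sketch of how these constants could drive a simple normalization pass; the normalize_token() helper below is hypothetical and not part of the original module:

def normalize_token(token):
    # unify visually similar Arabic/Persian characters via character_mapping
    for target, variants in character_mapping.items():
        for variant in variants:
            token = token.replace(variant, target)
    # drop Arabic diacritics
    for mark in diacritics:
        token = token.replace(mark, '')
    return token


normalized = [normalize_token(t) for t in hazm.word_tokenize('كتاب‌هاي خوب')]
normalized = [t for t in normalized if t not in stopwords + punctuations]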

# Heaps Law: y = 0.49 * x + 1.60
# Zipf Law: y = -1.73 * x + 8.36 ?????
    for j in dd['title'][i]:
        temp = j.split('\u200c')
        for q in temp:
            temp2.append(q)
    dd['title'][i] = copy.deepcopy(temp2)

# ## Remove stopwords and Special Chars
# Because they are useless to the meaning of sentences and as a result, to prices.

# In[9]:

special_chars = [
    '!', '"', '#', '(', ')', '*', ',', '-', '.', '/', '\'', '«', '»', '،', '؛',
    '؟', '.', '…', '$'
]
stopwords = hazm.stopwords_list()
for i in range(len(dd2)):
    for j in dd2['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['desc'][i]):
                dd2['desc'][i].remove(j)

    for j in dd2['title'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['title'][i]):
                dd2['title'][i].remove(j)

for i in range(len(dd)):
    for j in dd['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd['desc'][i]):