def normalize(self, word=u""):
    """
    Normalize a word. Convert some letter forms into a unified form.
    @param word: the input word; if word is empty, the word member of the class is normalized.
    @type word: unicode.
    @return: normalized word.
    @rtype: unicode.
    """
    if word == u'' and self.word == u"":
        return u""
    elif word != u'':
        self.word = word
    else:
        word = self.word
    self.normalized = normalize.normalize_searchtext(word)
    return self.normalized
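# A minimal usage sketch (not from the original source): calling
# tashaphyne.normalize.normalize_searchtext directly, which is the function
# the normalize() method above delegates to. The sample string and variable
# names are illustrative assumptions.
import tashaphyne.normalize as normalize

raw = u"الْحَمْدُ لِلَّهِ"                      # diacritized example text
clean = normalize.normalize_searchtext(raw)   # strip diacritics, unify letter forms
print(clean)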
import codecs
import re

from pyquery import PyQuery as pq
import tashaphyne.normalize as norm
import Levenshtein

almizan = codecs.open('data/output-convert.html', encoding='utf-8').read()
quran = codecs.open('data/quran.txt', encoding='utf-8').readlines()
errors = open('data/errors.txt', 'w', encoding='utf-8')

d = pq(almizan)
for i, aya in enumerate(d('blockquote p')):
    aya = pq(aya)
    # if aya.parent().prev()[0].tag != 'h1': continue
    s, a, best_ratio, c = 0, 0, 0, 0
    for ayah in quran:
        ayah = norm.normalize_searchtext(ayah)
        aya1 = norm.normalize_searchtext(aya[0].text)
        ratio = Levenshtein.ratio(ayah, aya1)
        if best_ratio < ratio:
            best_ratio = ratio
        if ratio > 0.75:
            # quran.txt lines start with "sura|aya", so the first number is the
            # sura and the number after '|' is the aya
            match = re.search(r'\d+', ayah)
            s = int(match.group(0))
            match = re.search(r'\|\d+', ayah)
            a = int(match.group(0)[1:])
            print('L: %s-%s' % (s, a))
            # log the lines skipped before the match, then drop them
            for err in quran[:c]:
                errors.write(err)
            del quran[:c + 1]
            break
        c += 1
    if best_ratio <= 0.75:
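# A small, self-contained sketch of the matching idea used above: normalize two
# strings with tashaphyne and compare them with Levenshtein.ratio. The 0.75
# threshold mirrors the script; the helper name is_same_aya and the sample
# strings are illustrative assumptions.
import tashaphyne.normalize as norm
import Levenshtein

def is_same_aya(candidate, reference, threshold=0.75):
    """Return True if the two (possibly diacritized) strings are similar enough."""
    a = norm.normalize_searchtext(candidate)
    b = norm.normalize_searchtext(reference)
    return Levenshtein.ratio(a, b) > threshold

print(is_same_aya(u"الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
                  u"الحمد لله رب العالمين"))   # expected: True for near-identical verses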
# Collect the quran lines for verses sura|i .. sura|aya_end into sec_ayas.
while i <= aya_end:
    aya_num = str(sura) + '|' + str(i)
    for c in range(len(quran)):
        q_aya = quran[c]
        if q_aya.startswith(aya_num):
            sec_ayas.append(q_aya)
            quran = quran[c + 1:]
            break
    i += 1

counter = 1
for part in sec.find("em"):
    success = 0
    part = pq(part)
    try:
        # normalize then stem the emphasized fragment
        part_text = norm.normalize_searchtext(part[0].text)
        part_text = isri.stem(part_text)
    except Exception:
        part_text = part[0].text
        try:
            errors.write(part.outerHtml())
        except Exception:
            pass
    print(counter)
    counter += 1
    if part is None or part[0].text is None:
        continue
    if len(part[0].text) <= 2:
        continue
    for aya in sec_ayas:
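# A stand-alone sketch of the normalize-then-stem step used above, assuming
# `isri` is NLTK's ISRIStemmer (the snippet does not show how it was
# constructed). The sample token is illustrative.
import tashaphyne.normalize as norm
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()
word = u"الْمُؤْمِنُونَ"                               # example token
stemmed = isri.stem(norm.normalize_searchtext(word))  # root-like stem from ISRI
print(stemmed)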
import re

import numpy as np
import pandas as pd
from nltk.stem import arlstem
from nltk.corpus import stopwords
from tashaphyne import normalize

data = pd.read_excel('../feature datasets/ar/labels.xlsx', encoding='utf-8', index_col=0)

# Tokenize / Filter / Normalize
for index, row in data.iterrows():
    # Tokenization
    comment_tokens = re.split(r"[;., \-!?:\*]+", row['text'])
    #stemmer = arlstem.ARLSTem()
    #comment_tokens = list(map(stemmer.stem, comment_tokens))

    # Filtering
    filtered_words = [word for word in comment_tokens if word not in stopwords.words('arabic')]
    filtered_words = [re.sub(r'[^\u0621-\u0652]+', '', i) for i in filtered_words]
    filtered_words = list(filter(None, filtered_words))

    # Stemming
    #stemmer = arlstem.ARLSTem()
    #stemmed_words = list(map(stemmer.stem, filtered_words))

    # Normalization
    normalized = normalize.normalize_searchtext(' '.join(filtered_words))
    data.loc[index, 'text'] = normalized

data['text'].replace([''], np.nan, inplace=True)
clean_data = data.dropna()
clean_data.to_excel('../feature datasets/ar/cleaned_data.xlsx', index_label="index")
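# A minimal sketch of the same tokenize -> filter -> normalize pipeline applied
# to a single string, without the Excel I/O. Only the cleaning steps mirror the
# script above; the helper name clean_comment and the sample sentence are
# illustrative assumptions. Requires the NLTK stopwords corpus
# (nltk.download('stopwords')).
import re
from nltk.corpus import stopwords
from tashaphyne import normalize

def clean_comment(text):
    tokens = re.split(r"[;., \-!?:\*]+", text)
    arabic_stopwords = set(stopwords.words('arabic'))
    words = [w for w in tokens if w not in arabic_stopwords]
    words = [re.sub(r'[^\u0621-\u0652]+', '', w) for w in words]  # keep Arabic letters only
    words = [w for w in words if w]
    return normalize.normalize_searchtext(' '.join(words))

print(clean_comment(u"هذا المنتج رائع جداً!"))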