Example #1
    def normalize(self, word=u""):
        """
        Normalize a word: convert some letter forms into a unified form.
        If word is empty, the word member of the class is normalized instead.
        @param word: the input word.
        @type word: unicode.
        @return: normalized word.
        @rtype: unicode.
        """
        # Relies on a module-level "from tashaphyne import normalize" import.
        if word == u'' and self.word == u'':
            return u''
        elif word != u'':
            self.word = word
        else:
            word = self.word
        self.normalized = normalize.normalize_searchtext(word)
        return self.normalized
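A quick usage sketch (obj stands for an instance of the enclosing class, which the snippet does not show; both call modes follow from the docstring):

obj.word = u"العربية"
print(obj.normalize())           # normalizes the stored self.word
print(obj.normalize(u"الكتاب"))  # an explicit argument replaces self.word first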
Example #2
import codecs
import re

from pyquery import PyQuery as pq
import tashaphyne.normalize as norm
import Levenshtein

almizan = codecs.open('data/output-convert.html', encoding='utf-8').read()
quran = codecs.open('data/quran.txt', encoding='utf-8').readlines()
errors = codecs.open('data/errors.txt', 'w', encoding='utf-8')

d = pq(almizan)
for i, aya in enumerate(d('blockquote p')):
    aya = pq(aya)
    # if aya.parent().prev()[0].tag != 'h1':
    #     continue
    s, a, best, c = 0, 0, 0, 0
    aya1 = norm.normalize_searchtext(aya[0].text)
    for ayah in quran:
        ayah = norm.normalize_searchtext(ayah)
        ratio = Levenshtein.ratio(ayah, aya1)
        best = max(best, ratio)
        if ratio > 0.75:
            match = re.search(r'\d+', ayah)
            s = int(match.group(0))
            match = re.search(r'\|\d+', ayah)
            a = int(match.group(0)[1:])
            print('L: %s-%s' % (s, a))
            # flush the verses skipped before this match, then drop them
            for err in quran[:c]:
                errors.write(err)
            del quran[:c + 1]
            break
        c += 1
    if best <= 0.75:
        pass  # handling of unmatched paragraphs is truncated in the original listing
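The core trick in this example is comparing normalized text with a fuzzy ratio rather than exact equality. A minimal self-contained sketch (the sample strings are illustrative, not from the source data):

import Levenshtein
import tashaphyne.normalize as norm

# The vocalized and unvocalized forms differ as raw strings...
plain = norm.normalize_searchtext(u"بسم الله الرحمن الرحيم")
vocalized = norm.normalize_searchtext(u"بِسْمِ اللهِ الرَّحْمنِ الرَّحِيمِ")
# ...but after normalization strips the diacritics, the ratio
# should be close to 1.0.
print(Levenshtein.ratio(plain, vocalized))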
Example #3
        # Fragment: assumes sura, aya_end, i, quran, sec_ayas, sec, isri,
        # errors, pq, and norm are defined in the enclosing scope.
        while i <= aya_end:
            aya_num = str(sura) + '|' + str(i)
            for c in range(len(quran)):
                q_aya = quran[c]
                if q_aya.startswith(aya_num):
                    sec_ayas.append(q_aya)
                    quran = quran[c + 1:]
                    break
            i += 1

        counter = 1
        for part in sec.find("em"):
            success = 0
            part = pq(part)
            try:
                part_text = norm.normalize_searchtext(part[0].text)
                part_text = isri.stem(part_text)
            except Exception:
                part_text = part[0].text
                try:
                    errors.write(part.outerHtml())
                except Exception:
                    pass
            print(counter)
            counter += 1
            if part is None or part[0].text is None:
                continue
            if len(part[0].text) <= 2:
                continue

            for aya in sec_ayas:
                pass  # matching logic truncated in the original listing
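The fragment normalizes each highlighted part and then stems it with an isri object defined outside the fragment; that it is NLTK's ISRI stemmer is an assumption. A standalone sketch of that preprocessing step:

from nltk.stem.isri import ISRIStemmer
import tashaphyne.normalize as norm

isri = ISRIStemmer()
word = u"يستعملون"  # illustrative input
print(isri.stem(norm.normalize_searchtext(word)))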
Example #4
import re

import numpy as np
import pandas as pd
from nltk.stem import arlstem
from nltk.corpus import stopwords
from tashaphyne import normalize

data = pd.read_excel('../feature datasets/ar/labels.xlsx', index_col=0)

arabic_stopwords = set(stopwords.words('arabic'))

# Tokenize / Filter / Normalize
for index, row in data.iterrows():
    # Tokenization
    comment_tokens = re.split(r"[;., \-!?:\*]+", row['text'])

    # Filtering: drop stopwords, keep Arabic letters only, drop empties
    filtered_words = [word for word in comment_tokens if word not in arabic_stopwords]
    filtered_words = [re.sub(r'[^\u0621-\u0652]+', '', w) for w in filtered_words]
    filtered_words = list(filter(None, filtered_words))

    # Stemming (disabled in the original)
    # stemmer = arlstem.ARLSTem()
    # stemmed_words = list(map(stemmer.stem, filtered_words))

    # Normalization
    normalized = normalize.normalize_searchtext(' '.join(filtered_words))
    data.loc[index, 'text'] = normalized

data['text'].replace([''], np.nan, inplace=True)
clean_data = data.dropna()
clean_data.to_excel('../feature datasets/ar/cleaned_data.xlsx', index_label="index")
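The same cleaning steps, applied to a single string instead of a DataFrame, look like this (the sample sentence is illustrative; NLTK's Arabic stopword list must be fetched once with nltk.download('stopwords')):

import re
from nltk.corpus import stopwords
from tashaphyne import normalize

text = u"هذا مثال بسيط للتجربة"
# tokenize on punctuation and whitespace
tokens = re.split(r"[;., \-!?:\*]+", text)
# drop stopwords
arabic_stops = set(stopwords.words('arabic'))
words = [w for w in tokens if w not in arabic_stops]
# keep Arabic letters only, then drop empty strings
words = [re.sub(r'[^\u0621-\u0652]+', '', w) for w in words]
words = [w for w in words if w]
# normalize the cleaned text
print(normalize.normalize_searchtext(' '.join(words)))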