def clean_persianText(txt):
    """Clean a Persian text string with hazm's normalization steps.

    Applies character refinement, affix spacing and punctuation spacing,
    strips literal '.' characters, then runs the full normalize pass.
    Returns the cleaned string.
    """
    hazm_normalizer = Normalizer()
    cleaned = normalizer_step = txt
    cleaned = hazm_normalizer.character_refinement(cleaned)
    cleaned = hazm_normalizer.affix_spacing(cleaned)
    cleaned = hazm_normalizer.punctuation_spacing(cleaned)
    # Drop full stops before the final normalization pass.
    cleaned = cleaned.replace('.', '')
    return hazm_normalizer.normalize(cleaned)
# Build a vocabulary of unique stemmed tokens from the (currently hard-coded)
# sample data and write it to vocab.txt, one stem per line.
data = []
# Real input was read from these files; kept for reference:
# f = open("Question_Body.txt", "r", encoding='utf-8-sig')
# q = f.readlines()
# f = open("answer_clean.txt", "r", encoding='utf-8-sig')
# a = f.readlines()
# for _q in q:
data.append("سلام سلام خوبی سلام")

vocab = []   # unique stems, in first-seen order
seen = set() # O(1) membership check alongside the ordered list
for line in data:
    # Replace the zero-width non-joiner with a plain space before cleaning.
    txt = line.replace("\u200c", " ")
    txt = removeIrritate(txt)
    # Reuse the shared cleaning pipeline instead of repeating its steps here
    # (the module-level `normalizer` the original referenced was never bound).
    txt = clean_persianText(txt)
    for token in txt.split():
        stem = Stem(token)
        if stem not in seen:
            seen.add(stem)
            vocab.append(stem)

# `with` guarantees the file is closed; the original wrote the undefined
# name `vocab1` (NameError) — the built list is `vocab`.
with open('vocab.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocab))