Пример #1
0
def clean_persianText(txt):
    """Return *txt* after a fixed sequence of ``Normalizer`` clean-up passes.

    A fresh ``Normalizer`` is created on every call. The text goes through
    ``character_refinement``, ``affix_spacing`` and ``punctuation_spacing``
    (in that order), then every ``'.'`` is removed, and finally the result
    of ``normalize`` is returned.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text as produced by ``Normalizer.normalize``.
    """
    cleaner = Normalizer()
    refinement_passes = (
        cleaner.character_refinement,
        cleaner.affix_spacing,
        cleaner.punctuation_spacing,
    )
    for refine in refinement_passes:
        txt = refine(txt)
    # Full stops are stripped before the final normalize pass, not after.
    return cleaner.normalize(txt.replace('.', ''))
Пример #2
0
# Input lines to build the vocabulary from.
data = []

# Loading the real corpus is currently disabled; a single hard-coded sample
# line is used instead.
# f = open("Question_Body.txt", "r", encoding='utf-8-sig')
# q = f.readlines()
# f = open("answer_clean.txt", "r", encoding='utf-8-sig')
# a = f.readlines()
# for _q in q:
data.append("سلام سلام خوبی سلام")

# Unique stems, kept in first-seen order.
vocab = []
# Companion set so each membership test is O(1) instead of scanning the list.
# Assumes Stem() returns hashable values (e.g. str) -- TODO confirm.
seen = set(vocab)

for line in data:
    # Turn zero-width non-joiners (U+200C) into plain spaces so that the
    # whitespace split below separates the joined parts.
    txt = line.replace("\u200c", " ")
    txt = removeIrritate(txt)
    txt = normalizer.character_refinement(txt)
    txt = normalizer.affix_spacing(txt)
    txt = normalizer.punctuation_spacing(txt)
    # Full stops are stripped before the final normalize pass.
    txt = txt.replace('.', '')
    txt = normalizer.normalize(txt)

    # Disabled alternative normalizer:
    # txt = my_normalizer.normalize(txt)
    for tk in txt.split():
        # A dedicated name is used for the stem so the outer loop variable
        # is no longer overwritten inside the inner loop.
        stem = Stem(tk)
        if stem not in seen:
            seen.add(stem)
            vocab.append(stem)
# Open the vocabulary output file for writing as UTF-8; mode 'w' truncates
# an existing file.
# NOTE(review): no close()/`with` is visible in this excerpt -- confirm the
# handle is closed further down.
f = open('vocab.txt', 'w', encoding='utf-8')
# Build one newline-separated string from the vocabulary entries.
# NOTE(review): `vocab1` is not defined anywhere in the visible code (the list
# built by the loop above is named `vocab`) -- confirm this is the intended
# name, otherwise this line raises NameError.
s1 = '\n'.join(vocab1)