# NOTE(review): this line is a whitespace-collapsed fragment; the statement
# boundaries described here are read from syntax, not from indentation.
#   * `return features` is the tail of a definition that starts outside this
#     view.
#   * `_extract_features(doc)` returns a list with one `_doc2features(doc, i)`
#     result per index `i` of `doc`.
#   * `crf` is a `sklearn_crfsuite.CRF` configured with algorithm='pa',
#     max_iterations=450, all_possible_transitions=True and
#     model_filename=templates_file (the c1/c2 settings and the value 500
#     are commented out).
#   * `check` and `checking` are module-level state: a `NorvigSpellChecker`
#     instance and a string marker (initially "") that `spell` rebinds via
#     `global`.
#   * The trailing `def spell(...)` is cut off at the end of this fragment.
return features def _extract_features(doc): return [_doc2features(doc, i) for i in range(len(doc))] crf = sklearn_crfsuite.CRF( algorithm='pa', #c1=0.1, #c2=0.1, max_iterations=450, #500, all_possible_transitions=True, model_filename=templates_file) check = NorvigSpellChecker() checking = "" def spell(text, autocorrect=False, worddict=None): global check, checking word_cut = word_tokenize(text) if worddict == "thai2fit" and checking == "": from pythainlp.word_vector import get_model words = get_model().index2word w_rank = {} for i, word in enumerate(words): w_rank[word] = i word = w_rank.items() check = NorvigSpellChecker(custom_dict=word) checking = "thai2fit"
def spell(text, autocorrect=False, worddict=None):
    """
    Mark, and optionally correct, the misspelled spans of a text.

    The text is split with ``word_tokenize``, each token is labelled by the
    module-level ``crf`` model, and runs of tokens that the model labels as
    an error span are wrapped in ``<คำผิด>`` ... ``</คำผิด>`` tags.

    :param str text: text to check
    :param bool autocorrect: when true, every tagged span is replaced by
        ``check.correct(span)`` and the tags are dropped from the result
    :param worddict: ``"thai2fit"`` builds the checker from the vocabulary
        returned by ``pythainlp.word_vector.get_model()``; any other value
        (including ``None``) selects the default ``NorvigSpellChecker``

    :return: the text with error spans tagged, or corrected when
        *autocorrect* is true
    :rtype: str

    The selected checker is cached in the module globals ``check`` and
    ``checking`` so it is rebuilt only when the requested dictionary changes.
    """
    global check, checking

    if worddict == "thai2fit":
        if checking != "thai2fit":
            from pythainlp.word_vector import get_model

            # Each word is paired with its position in the vocabulary list.
            # NOTE(review): the position is passed where the checker takes a
            # per-word number; confirm that rank is the intended weight.
            w_rank = {
                word: rank
                for rank, word in enumerate(get_model().index2word)
            }
            check = NorvigSpellChecker(custom_dict=w_rank.items())
            checking = "thai2fit"
    elif checking != "":
        # A custom checker is active but was not requested: go back to the
        # default one. When the default is already active it is reused
        # instead of being rebuilt on every call.
        check = NorvigSpellChecker()
        checking = ""

    word_cut = word_tokenize(text)
    if not word_cut:
        # Nothing to label; avoid running the model on an empty sequence.
        return ""

    features = _extract_features([(token,) for token in word_cut])
    labels = crf.predict_single(features)
    tagged = list(zip(word_cut, labels))

    open_tag = "<คำผิด>"
    close_tag = "</คำผิด>"
    last_index = len(tagged) - 1

    parts = []
    in_error = False  # True while an error span is open
    for index, (word, label) in enumerate(tagged):
        is_last = index == last_index
        if is_last and "B" in label:
            # A span starting on the final token must also be closed here.
            prefix = close_tag if in_error else ""
            parts.append(prefix + open_tag + word + close_tag)
            in_error = True
        elif "B-" in label:
            # A new span begins; close the previous one first if still open.
            prefix = close_tag if in_error else ""
            parts.append(prefix + open_tag + word)
            in_error = True
        elif "O" in label and in_error:
            parts.append(close_tag + word)
            in_error = False
        elif is_last and "I" in label and in_error:
            # The span runs to the end of the text.
            parts.append(word + close_tag)
            in_error = False
        else:
            parts.append(word)
    output = "".join(parts)

    if not autocorrect:
        return output

    import re

    # Split on the tags themselves (kept as separate items by the capture
    # group) rather than on an inserted sentinel string, so text that happens
    # to contain the sentinel is not altered.
    # NOTE: input that already contains the literal tags cannot be told apart
    # from spans added above.
    tag_pattern = "(" + re.escape(open_tag) + "|" + re.escape(close_tag) + ")"
    pieces = re.split(tag_pattern, output)

    corrected = []
    position = 0
    while position < len(pieces):
        if pieces[position] == open_tag:
            corrected.append(check.correct(pieces[position + 1]))
            position += 3  # skip the opening tag, the span and its closing tag
        else:
            corrected.append(pieces[position])
            position += 1
    return "".join(corrected)
# -*- coding: utf-8 -*-
"""
Spell checking and spelling correction.
"""

# Names exported by ``from <this package> import *``.
__all__ = [
    "DEFAULT_SPELL_CHECKER",
    "correct",
    "spell",
    "NorvigSpellChecker",
    "spell_sent",
    "correct_sent",
]

from pythainlp.spell.pn import NorvigSpellChecker

# Package-level checker instance created with the default arguments.
DEFAULT_SPELL_CHECKER = NorvigSpellChecker()

# NOTE(review): this import comes after DEFAULT_SPELL_CHECKER is bound,
# presumably because pythainlp.spell.core reads that name from this package
# while it is being imported; confirm before reordering.
from pythainlp.spell.core import (
    correct,
    spell,
    correct_sent,
    spell_sent,
)