Пример #1
0
    return features


def _extract_features(doc):
    """Build the list of per-position features for ``doc``.

    ``_doc2features(doc, i)`` is called once for every index ``i`` of
    ``doc`` and the results are returned in index order.
    """
    features = []
    for position in range(len(doc)):
        features.append(_doc2features(doc, position))
    return features


# CRF sequence tagger; spell() calls crf.predict_single() on the extracted
# token features to obtain one label per token.
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    # NOTE(review): c1/c2 are deliberately not passed here; presumably they
    # only apply to other training algorithms - confirm against the
    # sklearn-crfsuite documentation before re-enabling them.
    #c1=0.1,
    #c2=0.1,
    max_iterations=450,  #500,
    all_possible_transitions=True,
    # templates_file is defined outside this excerpt.
    model_filename=templates_file)
# Spell checker currently in use; spell() rebinds it through ``global``.
check = NorvigSpellChecker()
# Name of the custom dictionary ``check`` was built from: "" for the default
# checker, "thai2fit" once spell() has switched to that vocabulary.
checking = ""


def spell(text, autocorrect=False, worddict=None):
    """Tokenize ``text`` and lazily switch to the thai2fit spell checker.

    When ``worddict`` is ``"thai2fit"`` and no custom dictionary is active
    yet, the module-level ``check`` is rebuilt from the thai2fit vocabulary
    (each word paired with its position in the vocabulary) and ``checking``
    is set to ``"thai2fit"``.

    In this excerpt the body ends after the checker setup, so the function
    returns ``None`` and ``autocorrect`` is not used.
    """
    global check, checking
    tokens = word_tokenize(text)
    if worddict == "thai2fit" and checking == "":
        from pythainlp.word_vector import get_model
        vocabulary = get_model().index2word
        # Map every vocabulary entry to its index in the model's word list.
        rank_by_word = {entry: rank for rank, entry in enumerate(vocabulary)}
        check = NorvigSpellChecker(custom_dict=rank_by_word.items())
        checking = "thai2fit"
Пример #2
0
def spell(text, autocorrect=False, worddict=None):
    """Mark, or automatically correct, the misspelled spans found in ``text``.

    The text is split with ``word_tokenize``, every token is labelled by the
    module-level ``crf`` model, and the tokens are grouped into runs that are
    either flagged as misspelled or plain text.

    :param text: text to check.
    :param autocorrect: when true, each flagged run is replaced by
        ``check.correct(run)``; otherwise it is wrapped in
        ``<คำผิด>`` ... ``</คำผิด>`` markers.
    :param worddict: ``"thai2fit"`` switches the module-level checker to one
        built from the thai2fit vocabulary; ``None`` switches back to the
        default ``NorvigSpellChecker()``.
    :return: the marked-up or corrected text as one string.
    """
    global check, checking
    word_cut = word_tokenize(text)
    if worddict == "thai2fit" and checking == "":
        from pythainlp.word_vector import get_model
        # Pair every vocabulary entry with its position in the word list.
        w_rank = {word: i for i, word in enumerate(get_model().index2word)}
        check = NorvigSpellChecker(custom_dict=w_rank.items())
        checking = "thai2fit"
    elif not (checking == "thai2fit" and worddict is not None):
        # Anything other than "thai2fit is active and a dictionary was
        # requested" falls back to the default checker.
        checking = ""
        check = NorvigSpellChecker()

    labels = crf.predict_single(
        _extract_features([(token, ) for token in word_cut]))

    # Group tokens into [is_misspelled, [tokens...]] runs in reading order.
    # Working on runs instead of re-parsing marked-up text keeps any
    # "|---|" or "<คำผิด>" that occurs in the input itself intact.
    # NOTE(review): labels are matched by substring ('B-', 'B', 'O'), as
    # before; confirm that no "inside" label of the model contains an 'O'.
    runs = []
    inside = False
    last = len(labels) - 1
    for i, (token, label) in enumerate(zip(word_cut, labels)):
        if 'B-' in label or (i == last and 'B' in label):
            # A new misspelled run starts here (closing any open one).
            runs.append([True, [token]])
            inside = True
        elif inside and 'O' in label:
            # First token after a misspelled run.
            runs.append([False, [token]])
            inside = False
        elif runs:
            # Continuation of the current run, flagged or plain.
            runs[-1][1].append(token)
        else:
            runs.append([False, [token]])

    pieces = []
    for is_wrong, tokens in runs:
        span = "".join(tokens)
        if not is_wrong:
            pieces.append(span)
        elif autocorrect:
            pieces.append(check.correct(span))
        else:
            pieces.append("<คำผิด>" + span + "</คำผิด>")
    return "".join(pieces)
Пример #3
0
# -*- coding: utf-8 -*-
"""
Spell checking and spelling correction.
"""

# Names exported by ``from <this package> import *``.
__all__ = [
    "DEFAULT_SPELL_CHECKER",
    "correct",
    "spell",
    "NorvigSpellChecker",
    "spell_sent",
    "correct_sent"
]

from pythainlp.spell.pn import NorvigSpellChecker
# Shared checker instance created with NorvigSpellChecker's default arguments.
DEFAULT_SPELL_CHECKER = NorvigSpellChecker()

# NOTE(review): this import sits after DEFAULT_SPELL_CHECKER is bound,
# presumably because pythainlp.spell.core reads it at import time - confirm
# before moving the import to the top of the file.
from pythainlp.spell.core import correct, spell, correct_sent, spell_sent