Exemplo n.º 1
0
def best_match(i, text):
    """Find the cheapest final word for the prefix ``text[:i]``.

    Reads the module-level ``cost``, ``max_word`` and ``word_cost``. Every
    candidate word length from 1 up to ``max_word`` (bounded by ``i``) is
    scored as the cost stored for the prefix before the word plus the
    ``word_cost`` of the corrected candidate. Candidates missing from
    ``word_cost`` get ``9e999``, which is float infinity.

    Returns:
        A ``(total_cost, word_length)`` tuple for the minimum candidate;
        ties on cost resolve to the shorter word because tuples compare
        element by element.
    """
    window_start = max(0, i - max_word)
    # Walking the cost window backwards means the first entry belongs to a
    # word of length 1, the second to a word of length 2, and so on.
    scored = [
        (prev_cost + word_cost.get(correction(text[i - length:i]), 9e999), length)
        for length, prev_cost in enumerate(reversed(cost[window_start:i]), start=1)
    ]
    best_cost, best_length = min(scored)
    return best_cost, best_length
Exemplo n.º 2
0
def correct_match(match):
    """Return a spelling-corrected replacement for one regex match.

    The matched text is first passed through ``delete_rule``. Tokens shorter
    than five characters that contain both ASCII digits and ASCII letters
    are split into their letters and digits: only the letters are corrected,
    and the digits are re-attached after the letters when the token ends in
    a digit, otherwise before them. Any other token is corrected as a whole.
    ``case_of`` supplies the callable applied to the corrected text.

    NOTE(review): presumably used as an ``re.sub`` callback -- confirm
    against the caller.
    """
    token = delete_rule(match.group())
    digit_spans = [m.span() for m in re.finditer('[0-9]', token)]
    letter_chars = re.findall('[a-zA-Z]', token)

    is_short_mixed = bool(digit_spans) and len(token) < 5 and bool(letter_chars)
    if not is_short_mixed:
        return case_of(token)(correction(token.lower()))

    alpha = ''.join(letter_chars)
    digits = ''.join(re.findall('[0-9]', token))

    # The end offset of the last digit equals the token length exactly when
    # the token finishes with a digit.
    ends_with_digit = digit_spans[-1][1] == len(token)
    if ends_with_digit:
        recase = case_of(alpha + digits)
        return recase(correction(alpha.lower()) + digits)

    recase = case_of(digits + alpha)
    return recase(digits + correction(alpha.lower()))
Exemplo n.º 3
0
def connect_split_word_main(word_need_connect):
    """Choose the best-scoring regrouping of a space-separated string.

    The input is split on single spaces and handed to
    ``get_candidates_list``; each returned candidate (a sequence of words)
    is spelling-corrected in lower case and scored by ``_max_total``. The
    words of the highest-scoring candidate are returned joined by spaces.

    Args:
        word_need_connect: Text whose space-separated pieces may need to be
            joined back together.

    Returns:
        The corrected words of the winning candidate, joined with ``' '``.

    Raises:
        ValueError: If ``get_candidates_list`` yields no candidates
            (``max`` of an empty sequence).
    """
    def _max_total(_candidates_list):
        """Score one ``(unigrams, bigrams)`` pair.

        With bigrams present, the score is the sum of
        ``bi_score / uni_score`` over each bigram and the unigram at the
        same position; pairs whose unigram score is falsy add nothing.
        A candidate without bigrams (a single word) scores as its
        unigram score.
        """
        total_score = 0
        uni_candidate, bi_candidate = _candidates_list
        uni_candidate = list(uni_candidate)
        bi_candidate = list(bi_candidate)
        # Truthiness, not identity: ``bi_candidate is not []`` compared
        # against a brand-new list object and was therefore always True,
        # which made the single-word branch below unreachable.
        if bi_candidate:
            # zip stops at the shorter sequence (the bigrams), pairing each
            # bigram with the unigram of its first word.
            for unigram, bigram in zip(uni_candidate, bi_candidate):
                p_x = uni_score_dict[unigram]
                if p_x:
                    total_score += bi_score_dict[bigram] / p_x
        elif uni_candidate:
            # Guarded so that an empty candidate still scores 0 instead of
            # raising IndexError.
            total_score += uni_score_dict[uni_candidate[0]]
        return total_score

    words = word_need_connect.split(' ')
    candidates_list = get_candidates_list(words)

    candidate_group = []
    candidate_dict = {}
    for candi in tqdm(candidates_list):
        new_candi = [correction(c.lower()) for c in candi]
        candi_condition = [
            tuple(ngrams(new_candi, 1)),
            tuple(ngrams(new_candi, 2)),
        ]
        candidate_group.append(candi_condition)
        # Keyed by the hashable (unigrams, bigrams) pair so the winning
        # group can be mapped back to its word list.
        candidate_dict[tuple(candi_condition)] = new_candi

    # The score dictionaries are read by _max_total through its closure, so
    # they must be bound before max() invokes it.
    tokens = sentence_corpus_to_tokens()
    uni_score_dict, bi_score_dict = get_score_dicts_from_tokens(tokens)
    best_group = max(candidate_group, key=_max_total)
    connected_words = ' '.join(candidate_dict[tuple(best_group)])
    return connected_words
Exemplo n.º 4
0
def segment_correct_text(text):
    """Split ``text`` into words and spelling-correct each one.

    Rebuilds the module-level ``word_cost``, ``max_word`` and ``cost`` that
    ``best_match`` reads. ``word_cost`` assigns each entry of
    ``clean_words_in_dict()`` the value ``log((rank + 1) * log(n))``, where
    ``rank`` is its position in that sequence and ``n`` its length. A
    forward pass then fills ``cost`` for every prefix of ``text`` and records
    the length of the word chosen at each position; a backward pass follows
    those lengths from the end of the text to recover the segmentation.

    Args:
        text: The string to segment.

    Returns:
        The corrected words joined by single spaces (``""`` for empty text).

    Raises:
        ValueError: If ``clean_words_in_dict()`` is empty (``max`` of an
            empty sequence).
    """
    global word_cost
    global max_word
    global cost

    cleaned_text = clean_words_in_dict()
    # Loop-invariant factor, computed once instead of once per word.
    log_vocab_size = np.log(len(cleaned_text))
    word_cost = {
        k: np.log((i + 1) * log_vocab_size)
        for i, k in enumerate(cleaned_text)
    }
    max_word = max(len(x) for x in cleaned_text)
    cost = [0]

    # chosen_length[i] is the length of the last word in the best
    # segmentation of text[:i]. Recording it here avoids calling best_match
    # (and the spelling correction it performs) a second time while
    # backtracking.
    chosen_length = [0]
    for i in range(1, len(text) + 1):
        c, k = best_match(i, text)
        cost.append(c)
        chosen_length.append(k)

    out = []
    i = len(text)
    while i > 0:
        k = chosen_length[i]
        out.append(correction(text[i - k:i]))
        i -= k

    return " ".join(reversed(out))