示例#1
0
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    The OCR candidates are first passed through `truncate_ocr_sim_list`.
    When more than five of them remain, `split_ocr_list` divides them into
    a "strong" and a "weak" group; otherwise they are all treated as strong.

    * A strong OCR candidate that is also an anagram candidate has its
      anagram score multiplied by its OCR score.
    * Any other OCR candidate (strong or weak) that is not yet in the
      result is added with a freshly computed score
      (`rate_anagram(..., 1) * rate_ocr_key(..., 0)`).

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`. This
            dictionary is updated in place and is the object returned.
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file; the
            ``"occurence_map"`` entry is handed to the rating functions

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        strong_ocr_list, weak_ocr_list = split_ocr_list(token, ocr_list)

    # Candidates found by both methods: combine the two scores.
    # The OCR dictionaries are deliberately left untouched here: deleting
    # entries while iterating over `.items()` raises RuntimeError on
    # Python 3, and it is unnecessary because the membership test below
    # already skips every word that has just been rescored.
    for ocr_word, ocr_score in strong_ocr_list.items():
        if ocr_word in final_list:
            final_list[ocr_word] *= ocr_score

    # OCR-only candidates, strong ones first and then weak ones.
    occurence_map = structures["occurence_map"]
    for ocr_group in (strong_ocr_list, weak_ocr_list):
        for ocr_word in ocr_group:
            if ocr_word not in final_list:
                final_list[ocr_word] = (
                    rate_anagram(occurence_map, token, ocr_word, 1)
                    * rate_ocr_key(occurence_map, token, ocr_word, 0)
                )

    return final_list
示例#2
0
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    The OCR candidates are first passed through `truncate_ocr_sim_list`.
    When more than five of them remain, `split_ocr_list` divides them into
    a "strong" and a "weak" group; otherwise they are all treated as strong.

    * A strong OCR candidate that is also an anagram candidate has its
      anagram score multiplied by its OCR score.
    * Any other OCR candidate (strong or weak) that is not yet in the
      result is added with a freshly computed score
      (`rate_anagram(..., 1) * rate_ocr_key(..., 0)`).

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`. This
            dictionary is updated in place and is the object returned.
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file; the
            ``"occurence_map"`` entry is handed to the rating functions

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        strong_ocr_list, weak_ocr_list = split_ocr_list(token, ocr_list)

    # Candidates found by both methods: combine the two scores.
    # The OCR dictionaries are deliberately left untouched here: deleting
    # entries while iterating over `.items()` raises RuntimeError on
    # Python 3, and it is unnecessary because the membership test below
    # already skips every word that has just been rescored.
    for ocr_word, ocr_score in strong_ocr_list.items():
        if ocr_word in final_list:
            final_list[ocr_word] *= ocr_score

    # OCR-only candidates, strong ones first and then weak ones.
    occurence_map = structures["occurence_map"]
    for ocr_group in (strong_ocr_list, weak_ocr_list):
        for ocr_word in ocr_group:
            if ocr_word not in final_list:
                final_list[ocr_word] = (
                    rate_anagram(occurence_map, token, ocr_word, 1)
                    * rate_ocr_key(occurence_map, token, ocr_word, 0)
                )

    return final_list
示例#3
0
def select_ocrsims(token, structures):
    """Find words whose OCR key is close to the OCR key of a token.

    Every component of the token's OCR key (as produced by `ocr_key_hash`)
    has its cardinality shifted by -2, -1, +1 and +2 (never going below 1).
    Each resulting key that exists in ``structures["ocrkeys"]`` contributes
    its words, provided they are within an edit distance of 2 from the
    token. The retained words are then scored with `rate_ocr_key`.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values);
        only words with a strictly positive score are kept
    """
    max_shift = 2
    base_key = ocr_key_hash(token)

    # Indexed by OCR key string so a key reached more than once is stored once
    words_by_key = {}

    for position, (symbol, count) in enumerate(base_key):
        variant = deepcopy(base_key)
        base_count = int(count)

        for shift in range(-max_shift, max_shift + 1):
            if shift == 0:
                continue

            shifted_count = max(base_count + shift, 1)
            variant[position] = (symbol, shifted_count)

            # Rebuild OCR key string
            variant_str = "".join(part + str(amount) for part, amount in variant)
            if variant_str not in structures["ocrkeys"]:
                continue

            gap = abs(base_count - shifted_count)
            words_by_key[variant_str] = [
                (candidate, gap)
                for candidate in structures["ocrkeys"][variant_str]
                if edit_distance(candidate, token) <= 2
            ]

    ocr_sims = {}
    for scored_words in words_by_key.values():
        for candidate, gap in scored_words:
            score = rate_ocr_key(structures["occurence_map"], token, candidate, gap)
            if score > 0:
                ocr_sims[candidate] = score

    return ocr_sims
示例#4
0
def select_ocrsims(token, structures):
    """Gather candidate words sharing a nearby OCR key with the token.

    The token's OCR key comes from `ocr_key_hash`. For each of its
    components in turn, the cardinality is moved by every non-zero offset
    in [-2, 2] (clamped to a minimum of 1) and the modified key is looked
    up in ``structures["ocrkeys"]``. Words stored under a matching key are
    kept when their edit distance to the token is at most 2, and are
    finally rated with `rate_ocr_key`.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values);
        words whose score is not strictly positive are left out
    """
    delta = 2
    offsets = [step for step in range(-delta, delta + 1) if step != 0]

    token_key = ocr_key_hash(token)

    # A dictionary keyed by OCR key string: a key retrieved twice is kept once
    hits = {}

    for slot, (char_class, cardinality) in enumerate(token_key):
        altered_key = deepcopy(token_key)

        for offset in offsets:
            new_cardinality = max(int(cardinality) + offset, 1)
            altered_key[slot] = (char_class, new_cardinality)

            # Rebuild OCR key string
            pieces = [name + str(number) for name, number in altered_key]
            altered_str = "".join(pieces)

            if altered_str in structures["ocrkeys"]:
                difference = abs(int(cardinality) - new_cardinality)
                kept = []
                for word in structures["ocrkeys"][altered_str]:
                    if edit_distance(word, token) <= 2:
                        kept.append((word, difference))
                hits[altered_str] = kept

    ocr_sims = {}
    for word_list in hits.values():
        for word, difference in word_list:
            rating = rate_ocr_key(structures["occurence_map"], token, word, difference)
            if rating > 0:
                ocr_sims[word] = rating

    return ocr_sims