def append_data(self, unigrams):
    """Merge dictionary words found in *unigrams* into ``self.ocrkey_map``.

    Loads the aspell word list, keeps only the unigrams that are real
    dictionary words, indexes them by their OCR-key string, merges that
    index into the existing ``self.ocrkey_map``, and persists the result
    via ``self.save()``.

    Parameters:
        unigrams (dict): Unigram occurrence map; only its keys are used.
    """
    aspell_dict = "models/aspell.en.dict"
    with open(aspell_dict, "r") as f:
        word_set = {line.strip("\r\n") for line in f}

    # Words present both in the unigram map and in the aspell dictionary.
    known_words = set(unigrams.keys()) & word_set

    # Group every known word under its OCR-key string.
    ocr_key_map = {}
    for word in known_words:
        h_str = ocr_key_list_to_str(ocr_key_hash(word))
        ocr_key_map.setdefault(h_str, set()).add(word)

    # BUGFIX: the original used `dict.keys() + dict.keys()` and
    # `dict.items() + dict.items()`, which raises TypeError on Python 3
    # (dict views do not support `+`). Merge with set union instead.
    combine_struct = {key: set() for key in set(self.ocrkey_map) | set(ocr_key_map)}
    for mapping in (self.ocrkey_map, ocr_key_map):
        for key, value in mapping.items():
            combine_struct[key] |= value

    self.ocrkey_map = combine_struct
    self.save()
def append_data(self, unigrams):
    """Add dictionary-backed unigrams to the OCR-key map and save it.

    Reads the aspell word list, intersects it with the unigram keys,
    buckets the surviving words by OCR-key string, unions those buckets
    into ``self.ocrkey_map``, and calls ``self.save()``.

    Parameters:
        unigrams (dict): Unigram occurrence map; only its keys are used.
    """
    aspell_dict = "models/aspell.en.dict"
    with open(aspell_dict, "r") as f:
        dictionary_words = {line.strip("\r\n") for line in f}

    # Compute the intersection once (the original recomputed it twice).
    valid_words = set(unigrams.keys()).intersection(dictionary_words)

    new_map = {}
    for word in valid_words:
        key = ocr_key_list_to_str(ocr_key_hash(word))
        new_map.setdefault(key, set()).add(word)  # add the word to its bucket

    # BUGFIX: `.keys() + .keys()` / `.items() + .items()` is invalid on
    # Python 3 — dict views cannot be concatenated. Merge via setdefault.
    merged = {}
    for mapping in (self.ocrkey_map, new_map):
        for key, words in mapping.items():
            merged.setdefault(key, set()).update(words)

    self.ocrkey_map = merged
    self.save()
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    word_hash = ocr_key_hash(token)

    # Candidate lists keyed by OCR-key string; a dict prevents duplicate
    # entries when the same key is generated twice.
    candidates = {}
    for position, (key, value) in enumerate(word_hash):
        variant_hash = deepcopy(word_hash)
        for offset in range(-delta, delta + 1):
            if offset == 0:
                continue  # skip the unmodified hash
            card = max(int(value) + offset, 1)
            variant_hash[position] = (key, card)
            # Rebuild the OCR key string for this variant.
            variant_str = "".join(k + str(v) for k, v in variant_hash)
            if variant_str in structures["ocrkeys"]:
                card_diff = abs(int(value) - card)
                candidates[variant_str] = [
                    (sim_word, card_diff)
                    for sim_word in structures["ocrkeys"][variant_str]
                    if edit_distance(sim_word, token) <= 2
                ]

    # Keep only candidates with a strictly positive similarity score.
    ocr_sims = {}
    for sim_list in candidates.values():
        for sim_word, card_diff in sim_list:
            score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff)
            if score > 0:
                ocr_sims[sim_word] = score
    return ocr_sims
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    token_hash = ocr_key_hash(token)

    # Using a dictionary avoids multiple entries if a key is retrieved twice.
    matches_by_key = {}
    for idx, (letter, count) in enumerate(token_hash):
        mutated = deepcopy(token_hash)
        for shift in range(-delta, delta + 1):
            if shift == 0:
                continue  # the original hash itself is not a variant
            new_count = max(int(count) + shift, 1)
            mutated[idx] = (letter, new_count)
            # Serialize the mutated hash back into an OCR key string.
            parts = [k + str(v) for k, v in mutated]
            key_str = "".join(parts)
            if key_str in structures["ocrkeys"]:
                diff = abs(int(count) - new_count)
                close_words = []
                for candidate in structures["ocrkeys"][key_str]:
                    if edit_distance(candidate, token) <= 2:
                        close_words.append((candidate, diff))
                matches_by_key[key_str] = close_words

    # Score each candidate; only positive scores are kept.
    ocr_sims = {}
    for word_list in matches_by_key.values():
        for candidate, diff in word_list:
            rating = rate_ocr_key(structures["occurence_map"], token, candidate, diff)
            if rating > 0:
                ocr_sims[candidate] = rating
    return ocr_sims