def __init__(self):
        """
            Create the FuncMl helper, load the state-name list through
            get_state_abbreviation_us() and start x_list / y_list as empty lists.
        """
        self.class_func = FuncMl()

        # The loader is a method of the enclosing class (not visible in this fragment).
        self.list_state_abbr = self.get_state_abbreviation_us()

        self.x_list, self.y_list = [], []
Exemplo n.º 2
0
 def __init__(self):
     """
         Create the FuncMl helper and load 'config/currency_fix.json'
         (resolved against the helper's my_dir) into self.fix_json.
     """
     self.class_func = FuncMl()

     fix_path = os.path.join(self.class_func.my_dir, 'config/currency_fix.json')
     self.fix_json = self.class_func.load_json(fix_path)
Exemplo n.º 3
0
    def __init__(self, mode='vendor', country='US'):
        """
            Create the helper objects and select the profile for the given mode.

            :param mode: 'vendor' selects the 'VENDOR1' profile, 'passport' selects
                         'PASSPORT1'; for any other value self.vendor_profile is not set
            :param country: stored unchanged in self.country
        """
        self.class_func = FuncMl()
        self.class_field_extractor = FieldExtractor()
        self.class_text_extractor = TextExtractor()

        self.country = country
        self.mode = mode

        self.vendor_profile_list = self.class_func.load_vendor_profile(mode)

        # Only the two known modes have a profile entry to pick.
        if mode in ('vendor', 'passport'):
            profile_key = 'VENDOR1' if mode == 'vendor' else 'PASSPORT1'
            self.vendor_profile = self.vendor_profile_list[profile_key]
Exemplo n.º 4
0
    def __init__(self):
        """
            Create the helper/extractor objects, load 'config/card_type.json'
            (resolved against the helper's my_dir) and clear the prev_* fields.
        """
        self.class_func = FuncMl()
        self.class_receipt_merchant = ReceiptMerchantExtractor()
        self.class_receipt_amount = ReceiptAmountExtractor()
        self.class_field_extractor = FieldExtractor()

        card_type_path = os.path.join(self.class_func.my_dir, 'config', 'card_type.json')
        self.card_type = self.class_func.load_json(card_type_path)

        # All "previous" fields start out as None.
        self.prev_ocr = self.prev_name = self.prev_type = None
        self.prev_address = self.prev_remote_id = None
Exemplo n.º 5
0
 def __init__(self):
     """
         Class initializer: stores a new FuncMl instance in self.class_func.
     """
     self.class_func = FuncMl()
Exemplo n.º 6
0
class Proximity:

    def __init__(self):
        """
            Class initializer: stores a new FuncMl instance in self.class_func
            (used by the other methods for rectangle merging and overlap checks).
        """
        self.class_func = FuncMl()

    def get_position(self, dst_word, ocr_json, data_type, en_fuzzy=True):

        pos_list = []
        pos_list_temp = []

        if dst_word[0] == '(':
            dst_word = '( ' + dst_word[1:]

        if dst_word[-1] == ':':
            dst_word = dst_word[:-1] + ' :'
        elif dst_word[-1] == ')':
            dst_word = dst_word[:-1] + ' )'
        elif dst_word[-1] == '#':
            dst_word = dst_word[:-1] + ' #'
        elif dst_word[-1] == '.':
            dst_word = dst_word[:-1]

        dst_word_list = dst_word.split()

        # ---------------- check strict AND logic of dst words ------------------
        for i in range(1, len(ocr_json)-len(dst_word_list)+1):
            if i < len(ocr_json) - 1:
                comb_text = ocr_json[i]['description'].lower() + ' ' + ocr_json[i + 1]['description'].lower()
                if constant.no_hint_list.__contains__(comb_text):
                    continue

            f_match = True
            for j in range(len(dst_word_list)):
                if not ocr_json[i+j]['description'].lower() == dst_word_list[j].lower():
                    f_match = False
                    break

            if f_match:
                rect = self.get_rect(ocr_json[i]['boundingPoly']['vertices'])

                for j in range(1, len(dst_word_list)):
                    rect_next = self.get_rect(ocr_json[i+j]['boundingPoly']['vertices'])
                    rect = self.class_func.merge_rect(rect, rect_next)

                # check existence of next word
                rect_last = self.get_rect(ocr_json[i + len(dst_word_list) - 1]['boundingPoly']['vertices'])

                if len(ocr_json) > i + len(dst_word_list):
                    rect_last_next = self.get_rect(ocr_json[i + len(dst_word_list)]['boundingPoly']['vertices'])
                    word_last_next = ocr_json[i + len(dst_word_list)]['description']

                    if abs(rect_last_next[0] - rect_last[2]) < (rect[2] - rect[0]) / len(dst_word) * 2 and \
                            abs(rect_last_next[1] - rect_last[1]) < 10 and \
                            data_type == 'currency' and word_last_next.isalpha():
                        pos_list_temp.append(rect)
                    else:
                        pos_list.append(rect)

                else:
                    pos_list.append(rect)

        if not pos_list and pos_list_temp:
            pos_list = pos_list_temp

        # ---------------- check smooth AND logic of dst words ------------------
        if not pos_list:
            for i in range(1, len(ocr_json)-len(dst_word_list)):
                comb_text = ocr_json[i]['description'].lower() + ' ' + ocr_json[i + 1]['description'].lower()
                if constant.no_hint_list.__contains__(comb_text):
                    continue

                f_match = True
                for j in range(len(dst_word_list)):
                    if not ocr_json[i+j]['description'].lower().__contains__(dst_word_list[j].lower()):
                        f_match = False
                        break

                if f_match:
                    rect = self.get_rect(ocr_json[i]['boundingPoly']['vertices'])

                    for j in range(1, len(dst_word_list)):
                        rect_next = self.get_rect(ocr_json[i+j]['boundingPoly']['vertices'])
                        rect = self.class_func.merge_rect(rect, rect_next)

                    pos_list.append(rect)

        # -------------------- check fuzzy logic case ------------------------
        if en_fuzzy and not pos_list:
            max_match = 0
            for k in range(5):
                for i in range(1, len(ocr_json)-k-1):
                    comb_text = ocr_json[i]['description'].lower() + ' ' + ocr_json[i + 1]['description'].lower()
                    if constant.no_hint_list.__contains__(comb_text):
                        continue

                    text_comb = ''
                    for j in range(k):
                        text_comb += ocr_json[i+j]['description']
                        if j == 0:
                            rect = self.get_rect(ocr_json[i]['boundingPoly']['vertices'])
                        else:
                            rect_next = self.get_rect(ocr_json[i+j]['boundingPoly']['vertices'])
                            rect = self.class_func.merge_rect(rect, rect_next)

                    match = fuzz.ratio(text_comb.lower(), dst_word.lower())
                    if max_match < match:
                        max_match = match
                        max_pos = rect

            if max_match > 82:
                pos_list.append(max_pos)

        if pos_list:
            return pos_list
        else:
            return None

    @staticmethod
    def get_rect(parent):

        if 'x' in parent[0]:
            x1 = parent[0]['x']
        else:
            x1 = 0

        if 'y' in parent[0]:
            y1 = parent[0]['y']
        else:
            y1 = 0

        if 'x' in parent[2]:
            x2 = parent[2]['x']
        else:
            x2 = 0

        if 'y' in parent[2]:
            y2 = parent[2]['y']
        else:
            y2 = 0

        return [x1, y1, x2, y2]

    @staticmethod
    def expand_region(old_region, proximity_type, margin, rx, ry):

        [x1, y1, x2, y2] = old_region

        if proximity_type == 'above_text':
            nx1 = x1 - int(margin * (x2 - x1))
            nx2 = x2 + max(int(margin * (x2 - x1)), 130)
            ny1 = y2 + ry / 30  # for 5 pixel
            ny2 = 10000
        elif proximity_type == 'below_text':
            nx1 = x1 - int(margin * (x2 - x1))
            nx2 = x2 + int(margin * (x2 - x1))
            ny1 = 1
            ny2 = y1 - ry / 30
        elif proximity_type == 'same_line_prefix':
            nx1 = x2 + rx / 30
            nx2 = 10000
            ny1 = y1 - int(margin * (y2 - y1))
            ny2 = y2 + int(margin * (y2 - y1))
        else:
            nx1 = 1
            nx2 = x1 - rx / 30
            ny1 = y1 - int(margin * (y2 - y1))
            ny2 = y2 + int(margin * (y2 - y1))

        return [nx1, ny1, nx2, ny2]

    def get_data(self, ocr_json, proximity, data_type, rate_x, rate_y, en_fuzzy=True):

        hint_pos_list = []
        key_pos = []
        for i in range(len(proximity)):
            # ---------------------- Get the [proximity][text] region ---------------------------
            data_type_pos = self.get_position(proximity[i]['text'], ocr_json, data_type, en_fuzzy=en_fuzzy)
            if data_type_pos is None:
                continue

            # --------------- Expand the above region using [proximity][type] --------------------
            new_region1 = []
            new_region2 = []
            hint_pos1 = []
            hint_pos2 = []
            for j in range(len(data_type_pos)):
                # check if other words are in left and right margin of hint pos, and store it hint_pos1 and 2
                proximity_type = proximity[i]['type']
                hint_rect = data_type_pos[j]

                if proximity_type == 'same_line_suffix':
                    margin_rect = [hint_rect[2] + 2, hint_rect[1] + 2, hint_rect[2] + int(rate_x/5), hint_rect[3] - 2]
                else:
                    margin_rect = [hint_rect[0] - int(rate_x/4), hint_rect[1] + 2, hint_rect[0] - 2, hint_rect[3] - 2]

                f_overlap = False
                for k in range(1, len(ocr_json)):
                    if self.class_func.check_overlap_rect(margin_rect, self.class_func.get_rect_ocr_data(ocr_json, k)):
                        f_overlap = True
                        break

                if f_overlap:
                    hint_pos2.append([proximity[i]['type'], hint_rect])
                else:
                    hint_pos1.append([proximity[i]['type'], hint_rect])

                # expand region of hint pos
                if proximity_type == 'above_text' or proximity_type == 'below_text':
                    margin = 1.2
                else:
                    margin = 0.0

                region = self.expand_region(hint_rect, proximity_type, margin, rate_x, rate_y)
                if f_overlap:
                    new_region2.append(region)
                else:
                    new_region1.append(region)

            if len(hint_pos1) > 0:
                hint_pos_list.append(hint_pos1)
            else:
                hint_pos_list.append(hint_pos2)

            if len(new_region1) > 0:
                key_pos.append(new_region1)
            else:
                key_pos.append(new_region2)

        if len(key_pos) == 0:
            return '', '', '', ''

        # ----------------- Get new matching result from expanded region ---------------------
        new_text = ''
        new_text_list = []
        new_text_pos = []
        rect_prev = None

        for i in range(1, len(ocr_json)):
            pos = self.get_rect(ocr_json[i]['boundingPoly']['vertices'])

            if len(key_pos) == 1:       # ---------- in case of proximity hint is 1
                f_match = False
                for j in range(len(key_pos[0])):
                    if self.class_func.check_overlap_rect(key_pos[0][j], pos):
                        f_match = True
                        break

            elif len(key_pos) == 2:       # ---------- in case of proximity hint is more tan 2
                f_match = False
                for k1, k2 in itertools.product(range(len(key_pos[0])), range(len(key_pos[1]))):
                    if self.class_func.check_overlap_rect(key_pos[0][k1], pos) and \
                            self.class_func.check_overlap_rect(key_pos[1][k2], pos):
                        f_match = True
                        break

            elif len(key_pos) == 3:       # ---------- in case of proximity hint is more tan 3
                f_match = False
                for k1, k2, k3 in itertools.product(range(len(key_pos[0])), range(len(key_pos[1])),
                                                    range(len(key_pos[2]))):
                    if self.class_func.check_overlap_rect(key_pos[0][k1], pos) and \
                            self.class_func.check_overlap_rect(key_pos[1][k2], pos) and \
                            self.class_func.check_overlap_rect(key_pos[2][k3], pos):
                        f_match = True
                        break

            elif len(key_pos) == 4:       # ---------- in case of proximity hint is more tan 4
                f_match = False
                for k1, k2, k3, k4 in itertools.product(range(len(key_pos[0])), range(len(key_pos[1])),
                                                        range(len(key_pos[2])), range(len(key_pos[3]))):
                    if self.class_func.check_overlap_rect(key_pos[0][k1], pos) and \
                            self.class_func.check_overlap_rect(key_pos[1][k2], pos) and \
                            self.class_func.check_overlap_rect(key_pos[2][k3], pos) and \
                            self.class_func.check_overlap_rect(key_pos[3][k4], pos):
                        f_match = True
                        break

            else:
                f_match = False

            if f_match:
                text_item = ocr_json[i]['description']
                text_item_prev = ocr_json[max(i - 1, 0)]['description']
                rect = ocr_json[i]['boundingPoly']['vertices']

                x1 = self.class_func.get_field_int(rect[0], 'x')
                y1 = self.class_func.get_field_int(rect[0], 'y')

                if rect_prev is None:
                    x2 = 0
                    y2 = 0
                else:
                    x2 = self.class_func.get_field_int(rect_prev[1], 'x')
                    y2 = self.class_func.get_field_int(rect_prev[0], 'y')

                rect_prev = rect

                if abs(x1 - x2) < rate_x / 5 and abs(y1 - y2) < rate_y / 30:
                    if text_item == '/' or (len(new_text) > 0 and new_text[-1] == '/'):
                        new_text += text_item
                    elif text_item == '-' or (len(new_text) > 0 and new_text[-1] == '-'):
                        new_text += text_item
                    elif text_item == '.' and len(text_item_prev) == 1 and text_item_prev.isupper():     # ignore "P ."
                        continue
                    elif text_item == '.' or (len(new_text) > 0 and new_text[-1] == '.'):
                        new_text += text_item
                    elif text_item == ',' or (len(new_text) > 0 and new_text[-1] == ','):
                        new_text += text_item
                    elif len(new_text) > 0 and new_text[-1] == '$':
                        new_text += text_item
                    else:
                        new_text += (' ' + text_item)
                else:
                    if new_text == '':
                        new_text += text_item
                    else:
                        new_text += ('\n' + text_item)

                new_text_list.append(text_item)
                new_text_pos.append(pos)

        return new_text, hint_pos_list, new_text_list, new_text_pos
Exemplo n.º 7
0
class CurrencyExtractor:
    def __init__(self):
        """
            Create the FuncMl helper and load 'config/currency_fix.json'
            (resolved against the helper's my_dir) into self.fix_json.
        """
        self.class_func = FuncMl()

        fix_path = os.path.join(self.class_func.my_dir, 'config/currency_fix.json')
        self.fix_json = self.class_func.load_json(fix_path)

    def is_currency(self, text):

        # case of ocred incorrectly "," into "." such as    '$32.452.10',   => '$32452.10'
        #                           "." into "," such as    '$32,45'        => '$34.45'
        #                                                   '$164.04.'      => '$164.04'
        #                                                   '16.404:12USD'  => '$16404.12'

        # ------------------------ text pre-processing ------------------------
        text = text.replace(' ', '')

        if text[-3:] == 'USD':
            text = '$' + text[:len(text) - 3].replace(':', '.')
        elif text[:3] == 'USD':
            text = '$' + text[3:len(text)]

        pos_dot = []
        for i in range(len(text) - 1):
            if text[i] == '.' or text[i] == ',':
                pos_dot.append(i)

        if len(pos_dot) > 0:
            if len(text[pos_dot[-1] + 1:]) != 3 and text[pos_dot[-1]] == ',':
                char_last = '.'
            else:
                char_last = text[pos_dot[-1]]

            if char_last == '.':
                text = text[:pos_dot[-1]].replace(',', '').replace(
                    '.', '') + char_last + text[pos_dot[-1] + 1:]
            else:
                text = text.replace(',', '').replace('.', '')

        # ----------------- Get valid text using '$' character ----------------
        #       ab$13.2 -> $13.2,         -$7.6 -> -$7.6
        # ---------------------------------------------------------------------
        if text.__contains__('$'):
            dollar_pos = text.find('$')
            if dollar_pos > 0 and text[dollar_pos - 1] == '-':
                text = text[dollar_pos - 1:]
            else:
                text = text[dollar_pos:]

        if text == '':
            return None

        # ------------------ Decide of positive or negative -------------------
        f_pos = False
        if len(text) > 2:
            if text[0] == '(' and text[-1] == ')':
                currency_data = text[1:-1]
            elif text[1] == '(' and text[-1] == ')':
                currency_data = text[0] + text[2:-1]
            elif text[-2:].upper() == 'CR':
                currency_data = text[:-2]
            elif text[0] == '-':
                currency_data = text[1:]
            elif text[-1] == '-':
                currency_data = text[:-1]
            else:
                currency_data = text
                f_pos = True
        else:
            currency_data = text
            f_pos = True

        # -------------------- Check existence dollar mark --------------------
        f_dollar = True
        if currency_data[0] == '$':
            currency_data = currency_data[1:]
            for i in range(len(currency_data)):
                if not currency_data[-1].isdigit():
                    currency_data = currency_data[:-1]
                else:
                    break
        elif currency_data[0] == 'S':
            currency_data = currency_data[1:]
        else:
            f_dollar = False

        # ------------------------- Convert to value --------------------------
        try:
            # some error correction ('99.G8 -> 99.08')
            if currency_data.__contains__('.'):
                for fix_key in self.fix_json:
                    if currency_data.count(str(fix_key)) == 1:
                        currency_data = currency_data.replace(
                            str(fix_key), str(self.fix_json[fix_key]))

            ret = round(float(currency_data), 2)
            if not f_pos:
                ret = -ret

            ret = '{:.2f}'.format(ret)

            return [f_dollar, ret]

        except ValueError:
            return None

    def extract(self, text):

        text_list = text.split('\n')
        result_value = []
        result_value_dollar = []

        for i in range(len(text_list)):
            # ---------------- for single line --------------------
            ret = self.is_currency(text_list[i])
            if ret is not None:
                if ret[0]:
                    result_value_dollar.append(ret[1])
                else:
                    result_value.append(ret[1])

                continue

            # ------------------ for single word -------------------
            word_list = text_list[i].split()
            if len(word_list) == 1 or word_list.__contains__(
                    '%'):  # ignore `5.4%`, `(7%)`
                continue

            for j in range(len(word_list)):
                ret = self.is_currency(word_list[j])
                if ret is not None:
                    if ret[0]:
                        result_value_dollar.append(ret[1])
                    else:
                        result_value.append(ret[1])

        return result_value_dollar + result_value
Exemplo n.º 8
0
from Categorization import VendorExtractor
from Categorization import FuncMl
import sys
import os

# Image to process: first command-line argument, or the bundled sample when none is given.
if len(sys.argv) >= 2:
    src_name = sys.argv[1]
else:
    src_name = 'sample_passport/p2.jpg'

# Extractor created in 'passport' mode, plus the shared helper object.
class_passport = VendorExtractor('passport')
class_func = FuncMl()

# Try extract_MRP() first; fall back to extract() when it returns nothing.
ret = class_passport.extract_MRP(src_name)
if not ret:
    ret = class_passport.extract(src_name)
    # ret = class_passport.extract(src_name, 9)

# Flatten the result entries into {field_name: value}.
ret_parse = {}
for i in range(len(ret)):
    ret_parse[ret[i]['field_name']] = ret[i]['value']

# A birth date equal to the issue date is discarded.
# NOTE(review): raises KeyError when either field is missing from the result - confirm
# that both are always present.
if ret_parse['Date of Birth'] == ret_parse['Date of Issue']:
    ret_parse['Date of Birth'] = None

# --------------- Save and Display Result -------------------
# (head, tail) split of the source path
str_path = os.path.split(src_name)

# Table rows, starting with the header row.
data = [['Name', 'Value']]

for key in sorted(ret_parse.iterkeys()):
Exemplo n.º 9
0
 def __init__(self, country='US'):
     """
         Store the country and create the FuncMl helper plus the currency and
         date extractors.

         :param country: stored unchanged in self.country
     """
     self.country = country

     self.class_func = FuncMl()
     self.currency_extractor = CurrencyExtractor()
     self.date_extractor = DateExtractor()
class ReceiptAddressExtractorML:
    def __init__(self):
        """
            Create the FuncMl helper, load the state-name list through
            get_state_abbreviation_us() and start x_list / y_list as empty lists.
        """
        self.class_func = FuncMl()

        # Needs self.class_func, so it has to come after the helper is created.
        self.list_state_abbr = self.get_state_abbreviation_us()

        self.x_list, self.y_list = [], []

    def get_state_abbreviation_us(self):
        us_postal_data = self.class_func.load_csv(
            os.path.join(self.class_func.my_dir, 'config/us_postal_codes.csv'))
        list_state_abbr = []
        for i in range(1, len(us_postal_data)):
            list_state_abbr.append(us_postal_data[i][3].lower())

        list_state_abbr.append('california')

        return self.class_func.remove_duplicate(list_state_abbr)

    def __mark_address_line(self, text_lines):
        """
            Check and mark the address lines
        """
        mark_list = []
        key_address = [
            'street', 'road', 'rd', 'ave', 'st', 'way', 'city', 'dr', 'ste',
            'floor', 'station', 'airport', 'mall', 'center', 'blvd',
            'expressway'
        ]
        no_key_list = [':', '%', ' ID.', '.com', 'Visa ending']

        # for i in range(min(len(text_lines), 10)):
        for i in range(1, len(text_lines)):
            text_line = text_lines[i].lower()
            word_line = text_line.replace(',', ' ').split()
            f_digit = False
            f_key1 = False
            f_key2 = False

            # Check no_key
            f_no = False
            for no_key in no_key_list:
                if text_lines[i].__contains__(no_key):
                    f_no = True
                    break

            if f_no:
                continue

            # Check address keys
            for j in range(len(word_line)):
                if word_line[j].replace('-', '').isdigit():
                    f_digit = True

                if key_address.__contains__(
                        word_line[j].strip('.').strip(',')):
                    f_key1 = True

                if self.list_state_abbr.__contains__(
                        word_line[j].strip('.').strip(',')):
                    f_key2 = True

            if f_key1:  # `First Street White Luncheon Napkins 1500 ch`
                for j in range(len(word_line)):
                    if word_line[j].isdigit():
                        break
                    if j >= 4:
                        f_key1 = False

            if not f_key2:
                for k in range(len(self.list_state_abbr)):
                    if len(self.list_state_abbr[k].split()) > 1:
                        if text_line.__contains__(
                                self.list_state_abbr[k].lower()):
                            f_key2 = True

            if f_key1:
                mark_list.append(i)
            elif f_key2:
                if f_digit:
                    mark_list.append(i)
                elif i + 1 < len(text_lines) and text_lines[
                        i + 1].isdigit() and 3 < len(text_lines[i + 1]) < 6:
                    mark_list.append(i)
                    mark_list.append(i + 1)

        # check previous line if address is 1 line:  '1298 Montague Expw \nSan Jose CA 95131'
        if len(mark_list) == 1:
            if mark_list[0] > 0:
                temp_line = text_lines[mark_list[0] - 1]
                temp_word = temp_line.split()
                if temp_word[0].isdigit() and len(temp_word) >= 3:
                    mark_list.insert(0, mark_list[0] - 1)

        return mark_list

    def __detect_line_item_section(self, ocr_json):
        # ------------------- Get rect of individual words --------------------
        rects = []
        total_height = 0
        for i in range(1, len(ocr_json)):
            word_rect = self.class_func.get_rect_ocr_data(ocr_json, i)
            total_height += (word_rect[3] - word_rect[1])
            rects.append(word_rect)

        # ------------------- Merge rects of top/bottom words -----------------
        char_h = int(total_height / len(ocr_json) / 2)

        while True:
            f_merge = False
            for i in range(len(rects) - 20):
                for j in range(i + 1, i + 20):
                    if abs(rects[i][0] - rects[j][0]) < char_h and abs(rects[i][2] - rects[j][2]) < char_h and \
                            (abs(rects[i][1] - rects[j][3]) < char_h or abs(rects[i][3] - rects[j][1]) < char_h):
                        rects[i] = self.class_func.merge_rect(
                            rects[i], rects[j])
                        rects.pop(j)
                        f_merge = True
                        break

                if f_merge:
                    break

            if not f_merge:
                break

        # ------------------- Detect the long height rects --------------------
        long_rect = []
        for i in range(len(rects)):
            if rects[i][3] - rects[i][1] > char_h * 10:
                long_rect.append(rects[i])

        if len(long_rect) > 2:
            for i in range(len(long_rect)):
                same_line_cnt = 0
                for j in range(len(long_rect)):
                    if abs(long_rect[i][1] - long_rect[j][1]) < char_h:
                        same_line_cnt += 1

                if same_line_cnt > 2:
                    line_data = self.class_func.get_line_rect(ocr_json)

                    for j in range(len(line_data['rect'])):
                        if line_data['rect'][j][1] >= long_rect[i][3]:
                            return j - 1

            return -1

        else:
            return -1

    def get_features_address(self, ocr_json):

        new_ocr_text = self.class_func.remove_vertical_text(ocr_json)
        text_lines = new_ocr_text.splitlines()

        address_line_list = self.__mark_address_line(text_lines)
        mark_address_list = []

        if address_line_list:
            mark_start = address_line_list[0]
            mark_len = 1
            text_address = text_lines[address_line_list[0]]

            for i in range(1, len(address_line_list)):
                if address_line_list[i] == address_line_list[i - 1] + 1:
                    mark_len += 1
                    text_address += ' ' + text_lines[address_line_list[i]]
                else:
                    mark_address_list.append(
                        [mark_start, mark_len, text_address])
                    mark_start = address_line_list[i]
                    mark_len = 1
                    text_address = text_lines[address_line_list[i]]

            if 10 < len(text_address) < 60:
                mark_address_list.append([mark_start, mark_len, text_address])

        return mark_address_list

    def extractor(self, ocr_json):

        mark_address_list = self.get_features_address(ocr_json)
        first_address_line = -1

        if mark_address_list:
            first_address_line = mark_address_list[0][0]
            ret_address = mark_address_list[0][2]

        else:
            ret_address = ''

        return ret_address, first_address_line
Exemplo n.º 11
0
class DateExtractor:
    def __init__(self):
        """
            Create the date extractor and attach a FuncMl helper instance as
            ``class_func`` (used by ``extract`` for ``text_clean``).
        """
        self.class_func = FuncMl()

    @staticmethod
    def get_pre_digit(text, reverse=False):

        if text == '':
            return text

        if reverse:
            text = text[::-1]

        if not text.isdigit() and text[0].isdigit():
            i = 0
            for i in range(len(text)):
                if not text[i].isdigit():
                    break

            text = text[:i]

        if reverse:
            text = text[::-1]

        return text

    @staticmethod
    def split_text(text):
        """
            Split the text as digital and chars. for example:
            '14Jul18' => ['14', 'Jul', '18']
        """
        char_prev_st = 0
        char_list = ''
        txt_date = []
        for i in range(len(text)):
            if text[i].isdigit():
                char_st = 1
            elif text[i].isalpha():
                char_st = 2
            else:
                char_st = 3

            if char_prev_st != char_st and char_list != '':
                txt_date.append(char_list)
                char_list = ''

            char_list += text[i]
            char_prev_st = char_st

        txt_date.append(char_list)

        return txt_date

    def is_date(self, text, country='US'):
        """
            Try to interpret a single OCR token (or a few joined tokens) as a date.

            The text is split into up to three fields using the first separator
            found in this order: '/', '-', '.' or ',', "'"; without any of them
            the text is split into digit/letter runs by split_text. Texts that
            yield one or two fields are re-split using fixed-position rules.

            :param text: candidate text.
            :param country: 'NL' selects constant.month_list_full_NL for full
                month names and the 'YYYY-MM-DD' output format; 'NL' and
                'INDIA' read ambiguous numeric dates as day-first.
            :return: None when the text is not accepted as a date, otherwise
                [formatted_date, score]. formatted_date is 'MM/DD/YYYY'
                ('YYYY-MM-DD' for 'NL'). score starts at 10, is lowered for
                less reliable layouts and raised when a month name matches.

            NOTE(review): str.isdigit() is also true for characters such as
            superscript digits, which int() rejects, so unusual OCR characters
            could raise ValueError here - confirm whether inputs are sanitised.
        """

        # presumably abbreviated and full month names, ordered January..December
        # (index + 1 is used as the month number below) - verify in constant
        month_list = constant.month_list
        if country == 'NL':
            month_list_full = constant.month_list_full_NL
        else:
            month_list_full = constant.month_list_full

        score = 10

        # ------------------- split the text into date fields -------------------
        if text.__contains__('/'):
            txt_date = text.split('/')
        elif text.__contains__('-'):
            txt_date = text.split('-')
        elif text.__contains__('.') or text.__contains__(','):
            txt_date = text.replace('.', ' ').replace(',', ' ').split()
            score -= 1
        elif text.__contains__("'"):
            txt_date = text.split("'")
        else:
            txt_date = self.split_text(text)

        if len(txt_date) == 1:
            # no separator at all: cut the text at fixed positions by its length
            score -= 2
            if txt_date[0].isdigit():
                if len(text) == 6:  # 1 + 1 + 4 characters
                    txt_date[0] = text[0]
                    txt_date.append(text[1])
                    txt_date.append(text[2:])
                elif len(text) == 7:
                    if text[0] == '0':  # 2 + 1 + 4 characters
                        txt_date[0] = text[0:2]
                        txt_date.append(text[2])
                    else:  # 1 + 2 + 4 characters
                        txt_date[0] = text[0]
                        txt_date.append(text[1:3])
                    txt_date.append(text[3:])
                elif len(text) == 8:  # 2 + 2 + 4 characters
                    txt_date[0] = text[0:2]
                    txt_date.append(text[2:4])
                    txt_date.append(text[4:])
                elif len(text) == 10:
                    # characters at index 2 and 5 are dropped (presumably
                    # separators that OCR read as digits)
                    txt_date[0] = text[0:2]
                    txt_date.append(text[3:5])
                    txt_date.append(text[6:])
                else:
                    return None
            else:
                if len(text) == 10:  # 'DECO1,2017'
                    # letter 'O' inside the day is treated as a misread zero
                    if text[0:3].isalpha() and text[3:5].replace('O', '0').isdigit() and \
                            text[6:].isdigit() and text[5] == ',':
                        score -= 1
                        txt_date[0] = text[0:3]
                        txt_date.append(text[3:5].replace('O', '0'))
                        txt_date.append(text[6:])
                    else:
                        return None
                else:
                    return None
        elif len(txt_date) == 2:
            # one separator is missing: one of the two fields holds two values
            score -= 1
            temp1 = txt_date[0]
            temp2 = txt_date[1]

            if txt_date[0].isdigit():
                if len(temp2) <= 4:
                    # second field is the year, first field holds two values
                    if len(temp1) == 4:  # 0722 => 07, 22
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[2:]
                    elif len(temp1) == 3:  # 072 => 07, 2
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[2]
                    elif len(temp1) == 2:  # 72 => 7, 2
                        txt_date[0] = temp1[0]
                        txt_date[1] = temp1[1]
                    elif len(
                            temp1) == 5 and temp1[2] == '1':  # 07l22 => 07, 22
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[3:]
                    else:
                        return None

                    txt_date.append(temp2)
                else:
                    # second field holds two values
                    txt_date[0] = temp1
                    if len(temp2) == 6:  # '11 212017' => '11,21,2017'
                        txt_date[1] = temp2[0:2]
                        txt_date.append(temp2[2:])
                    else:
                        return None
            elif txt_date[0] != '' and txt_date[0][0].isalpha(
            ) and txt_date[0][-1].isdigit():  # 'May04'
                # split at the first digit; only that position is ever tried
                for i in range(len(temp1)):
                    if temp1[i].isdigit():
                        t1 = temp1[:i].lower()
                        t2 = temp1[i:]
                        if (month_list_full.__contains__(t1) or
                                month_list.__contains__(t1)) and t2.isdigit():
                            score += 1
                            txt_date[0] = t1
                            txt_date[1] = t2
                            txt_date.append(temp2)
                            break
                        else:
                            return None

                if len(txt_date) == 2:
                    return None

            else:
                return None

        elif len(txt_date) != 3:
            return None

        # drop a 'label:' prefix from the first field
        txt_date[0] = txt_date[0].split(':')[-1]

        # ----------- collecting digit from mix of digit and string -----------
        txt_date[0] = self.get_pre_digit(txt_date[0], reverse=True)
        txt_date[2] = self.get_pre_digit(txt_date[2])

        # ordinal suffixes on the middle field ('1st', '2nd', '3rd', 'Nth')
        if txt_date[1] == '1st':
            txt_date[1] = '1'
        elif txt_date[1] == '2nd':
            txt_date[1] = '2'
        elif txt_date[1] == '3rd':
            txt_date[1] = '3'
        elif txt_date[1][:-2].isdigit() and txt_date[1][-2:] == 'th':
            txt_date[1] = txt_date[1][:-2]

        # ----------------------------- extract the date --------------------------
        data_month = 0
        data_day = 0

        if len(txt_date[0]) == 4 and len(txt_date[1]) == 2 and len(
                txt_date[2]) == 2:  # case of yyyy/mm/dd
            if txt_date[0].isdigit() and txt_date[1].isdigit(
            ) and txt_date[2].isdigit():
                data_year = int(txt_date[0])
                data_month = int(txt_date[1])
                data_day = int(txt_date[2])

                if not (1900 < data_year < 2100 and 1 <= data_month <= 12
                        and 1 <= data_day <= 31):
                    return None

            else:
                return None

        else:
            # -------------------------- Checking of year -------------------------
            if txt_date[2].isdigit():
                if 1900 <= int(txt_date[2]) <= 2100:
                    data_year = int(txt_date[2])
                elif len(txt_date[2]) == 2 and int(txt_date[2]) < 50:
                    # two-digit years below 50 are taken as 20xx
                    data_year = int(txt_date[2]) + 2000
                    score -= 1
                else:
                    return None
            elif txt_date[2][:-1].isdigit():
                # same rules with one trailing non-digit character ignored
                score -= 1
                if 1900 <= int(txt_date[2][:-1]) <= 2100:
                    data_year = int(txt_date[2][:-1])
                elif len(txt_date[2][:-1]) == 2 and int(txt_date[2][:-1]) < 50:
                    data_year = int(txt_date[2][:-1]) + 2000
                    score -= 1
                else:
                    return None
            else:
                return None

            # find_md stays True only while the month/day reading is consistent
            find_md = True
            if txt_date[1].isdigit():  # case of m/d/y
                # -------------------------- Checking of day -------------------------
                if 0 < int(txt_date[1]) <= 31:
                    data_day = int(txt_date[1])
                else:
                    find_md = False

                # -------------------------- Checking of month -------------------------
                if txt_date[0].isdigit():
                    if 0 < int(txt_date[0]) <= 12 and find_md:
                        # both fields could be a month: INDIA/NL read day first
                        if data_day < 13 and (country == 'INDIA'
                                              or country == 'NL'):
                            data_month = data_day
                            data_day = int(txt_date[0])
                        else:
                            data_month = int(txt_date[0])
                    else:
                        find_md = False
                elif month_list.__contains__(txt_date[0].lower()):
                    data_month = month_list.index(txt_date[0].lower()) + 1
                    score += 1
                elif month_list_full.__contains__(txt_date[0].lower()):
                    data_month = month_list_full.index(txt_date[0].lower()) + 1
                    score += 1
                else:
                    find_md = False
            else:
                find_md = False

            if not find_md:  # case of d/m/y
                if txt_date[0].isdigit():
                    # -------------------------- Checking of day -------------------------
                    if 0 < int(txt_date[0]) <= 31:
                        data_day = int(txt_date[0])
                    else:
                        return None

                    # -------------------------- Checking of month -------------------------
                    if txt_date[1].isdigit() and 0 < int(txt_date[1]) <= 12:
                        data_month = int(txt_date[1])
                    elif month_list.__contains__(txt_date[1].lower()):
                        data_month = month_list.index(txt_date[1].lower()) + 1
                    elif month_list_full.__contains__(txt_date[1].lower()):
                        data_month = month_list_full.index(
                            txt_date[1].lower()) + 1
                    else:
                        return None

                else:
                    return None

        # ------------------------------ format result ------------------------------
        if country == 'NL':
            ret = '%04d-%02d-%02d' % (data_year, data_month, data_day)
        else:
            ret = '%02d/%02d/%04d' % (data_month, data_day, data_year)

        return [ret, score]

    def extract(self, text, country='US'):
        text_line_list = text.replace("'", ' ').splitlines()
        result_value = []
        result_score = []

        for comb_lines in range(2):
            # -------------- First check one lines, and if not detect, check combination of 2 lines ------------
            for line_ind in range(len(text_line_list) - comb_lines):
                if comb_lines == 0:
                    text_list = text_line_list[line_ind].split()
                elif comb_lines == 1:
                    text_list = text_line_list[line_ind].split(
                    ) + text_line_list[line_ind + 1].split()
                else:
                    text_list = []

                # ---------------- for single word --------------------
                for text_item in text_list:
                    ret = self.is_date(text_item, country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ---------------- for combination of 2 words('04/20/ 2017') --------------------
                for i in range(len(text_list) - 1):
                    ret = self.is_date(text_list[i] + ' ' + text_list[i + 1],
                                       country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ---------------- for combination of 2 words('Sep 1,2017') --------------------
                for i in range(len(text_list) - 1):
                    ret = self.is_date(
                        text_list[i] + '/' +
                        text_list[i + 1].replace(',', '/'), country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ----------- for combination of 3 words ---------------
                for i in range(len(text_list) - 2):

                    new_text = self.class_func.text_clean(text_list[i]) + '/' + \
                               self.class_func.text_clean(text_list[i+1]) + '/' + \
                               self.class_func.text_clean(text_list[i+2])

                    ret = self.is_date(new_text, country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

            if len(result_value) > 0:
                max_score = max(result_score)
                ret_date = []
                for i in range(len(result_score)):
                    if result_score[i] == max_score > 7:
                        ret_date.append(result_value[i])

                # error correction
                if len(ret_date) == 2:  # ['6/13/2018', '6/13/2013']
                    if ret_date[0][:-1] == ret_date[1][:-1]:
                        if ret_date[0][-1] == '8' and ret_date[1][-1] == '3':
                            ret_date = [ret_date[0][:-1] + '8']

                return ret_date

        return []
Exemplo n.º 12
0
    def __init__(self):
        """
            Attach the FieldExtractor and FuncMl helpers and load the two
            receipt amount key lists returned by load_receipt_amt_keys().
        """
        self.class_field_extractor = FieldExtractor()
        self.class_func = FuncMl()

        loaded_keys = self.class_func.load_receipt_amt_keys()
        self.amount_key_list, self.amount_no_key_list = loaded_keys
Exemplo n.º 13
0
class ReceiptAmountExtractor:
    def __init__(self):
        """
            Attach the FieldExtractor and FuncMl helpers and load the amount
            key phrases: ``amount_key_list`` (phrases that label an amount) and
            ``amount_no_key_list`` (phrases that must be skipped).
        """
        self.class_field_extractor = FieldExtractor()
        self.class_func = FuncMl()

        loaded_keys = self.class_func.load_receipt_amt_keys()
        self.amount_key_list, self.amount_no_key_list = loaded_keys

    def extract_receipt_amount_keys(self, filename, ocr_json=None):
        """
            Detect amount key phrases (1 to 3 consecutive OCR words) and the
            currency value found on the same line as each of them.

            :param filename: image file; used to obtain OCR data when ocr_json
                is None and passed on to the field extractor.
            :param ocr_json: OCR word entries, each with a 'description';
                positions are read through get_rect_ocr_data.
            :return: ([], []) when no OCR data is available, otherwise
                (dict_val, feature_list) where dict_val maps a key phrase to
                its value (for repeated keys the occurrence with the fewest
                surrounding words wins) and feature_list holds one
                [key, [cnt1, cnt2, cnt3], [rel0, rel1, rel2, rel3]] entry per
                detected key.
        """
        # ---------------------------- Get OCR json -------------------------------
        if ocr_json is None:
            ocr_json = self.class_func.get_json_google_from_jpg(filename)

        if ocr_json is None:
            return [], []

        detect_keys = []
        # number of words of the last matched phrase; > 1 means the following
        # words belong to that phrase and are skipped
        key_words = 0

        # index 0 is skipped (presumably the full-text entry - TODO confirm);
        # the upper bound leaves room for the i + 1 / i + 2 look-ahead
        for i in range(1, len(ocr_json) - 2):
            # NOTE(review): words skipped here do not decrement key_words,
            # because the skip counter is only handled further below
            if ocr_json[i]['description'] == '' or ocr_json[i]['description'][
                    0].islower():
                continue

            # ------------- Get Candidate of amount keys and it's position ------------
            text1 = ocr_json[i]['description'].lower()
            text2 = text1 + ' ' + ocr_json[i + 1]['description'].lower()
            text3 = text2 + ' ' + ocr_json[i + 2]['description'].lower()
            text_pos1 = self.class_func.get_rect_ocr_data(ocr_json, i)
            text_pos2 = self.class_func.get_rect_ocr_data(ocr_json, i + 1)
            text_pos3 = self.class_func.get_rect_ocr_data(ocr_json, i + 2)

            # still inside a previously matched multi-word phrase
            if key_words > 1:
                key_words -= 1
                continue

            # phrases on the exclusion list are consumed without producing a key
            if text2 in self.amount_no_key_list:
                key_words = 2
                continue
            elif text3 in self.amount_no_key_list:
                key_words = 3
                continue

            # longest phrase first; multi-word phrases must sit on one line
            if text3 in self.amount_key_list and self.class_func.check_same_line(text_pos1, text_pos2) and \
                    self.class_func.check_same_line(text_pos2, text_pos3):
                amount_key = text3
                key_words = 3
            elif text2 in self.amount_key_list and self.class_func.check_same_line(
                    text_pos1, text_pos2):
                amount_key = text2
                key_words = 2
            elif text1 in self.amount_key_list:
                amount_key = text1
                key_words = 1
            else:
                continue

            # ------------------------ Get value of candidates ------------------------
            # search window: full page width, 200 units above and below the key
            profile_hash = {u'width': 8.5, u'height': 11}
            key_y1 = text_pos1[1] - 200
            key_y2 = text_pos1[3] + 200
            hint_hash = {
                u'field_id':
                2,
                u'data_type':
                u'currency',
                u'hints': [[[
                    u'coordinates', {
                        u'x_1': 0.0,
                        u'x_2': 8.5,
                        u'y_1': key_y1,
                        u'y_2': key_y2
                    }
                ],
                            [
                                u'proximity', {
                                    u'text': amount_key,
                                    u'type': u'same_line_prefix'
                                }
                            ]]]
            }

            ret_val = self.class_field_extractor.extract_v2(
                filename, profile_hash, hint_hash, ocr_json_data=ocr_json)
            amount_value = ret_val[0]['value']

            if amount_value is None:
                continue
            elif amount_value == 0 and amount_key == 'total':
                # a zero 'total' is not accepted as a result
                continue

            # ------------- Get number of words on left and right of value ------------
            # horizontal extent of the whole key phrase
            key_x1 = text_pos1[0]
            if key_words == 3:
                key_x2 = text_pos3[2]
            elif key_words == 2:
                key_x2 = text_pos2[2]
            else:
                key_x2 = text_pos1[2]

            # ----- find all same line words ------
            # a word is on the key's line when the key's vertical centre falls
            # strictly between the word's top and bottom
            center_line = int((text_pos1[1] + text_pos1[3]) / 2)

            line_word_text = []
            line_word_pos = []

            for j in range(1, len(ocr_json) - 2):
                word_pos = self.class_func.get_rect_ocr_data(ocr_json, j)

                if word_pos[1] < center_line < word_pos[3]:
                    word_text = ocr_json[j]['description'].lower()
                    line_word_text.append(word_text)
                    line_word_pos.append(word_pos)

            # ----- get x position of value word -----
            value_x1 = 0
            value_x2 = 0

            for j in range(len(line_word_text)):
                # only words to the right of the key phrase can be the value
                if line_word_pos[j][0] < key_x2:
                    continue

                if line_word_text[j].isdigit() and int(
                        line_word_text[j]) == int(amount_value):
                    value_x1 = line_word_pos[j][0]
                    value_x2 = line_word_pos[j][2]

                    # value split by OCR into 'int', '.' or ',', 'fraction'
                    # NOTE(review): the search only stops on this exact match;
                    # an integer-only match keeps scanning and may be overwritten
                    if j < len(line_word_pos) - 2 and line_word_text[j + 2].isdigit() and \
                            (line_word_text[j + 1] == '.' or line_word_text[j + 1] == ',') and \
                            float(line_word_text[j] + '.' + line_word_text[j + 2]) == float(amount_value):
                        value_x2 = line_word_pos[j + 2][2]
                        break

            # ----- count the words -----
            # cnt1: words left of the key, cnt2: between key and value,
            # cnt3: right of the value; only pure digit/letter words count
            cnt1 = 0
            cnt2 = 0
            cnt3 = 0
            for j in range(len(line_word_text)):
                if not (line_word_text[j].isdigit()
                        or line_word_text[j].isalpha()):
                    continue
                if line_word_pos[j][2] < key_x1:
                    cnt1 += 1
                elif key_x2 < line_word_pos[j][0] < value_x1:
                    cnt2 += 1
                elif value_x2 < line_word_pos[j][0]:
                    cnt3 += 1

            detect_keys.append([
                amount_key, amount_value, text_pos1, [cnt1, cnt2, cnt3],
                [0, 0, 0, 0]
            ])

        # ----------------------- Get relative position of keys -----------------------
        # slots 0/1: 1-based index of the next/previous key on the same line,
        # slots 2/3: 1-based index of the next/previous key on another line
        for i in range(len(detect_keys) - 1):
            if self.class_func.check_same_line(detect_keys[i][2],
                                               detect_keys[i + 1][2]):
                detect_keys[i][4][0] = i + 2
                detect_keys[i + 1][4][1] = i + 1
            else:
                detect_keys[i][4][2] = i + 2
                detect_keys[i + 1][4][3] = i + 1

        # ------------------------ Create key-value dictionary ------------------------
        dict_val = {}
        # word-count total of the occurrence currently stored in dict_val
        dict_temp = {}

        for i in range(len(detect_keys)):
            key_name = detect_keys[i][0]

            if key_name in dict_val:
                # a later occurrence wins only with strictly fewer words around it
                if dict_temp[key_name] > sum(detect_keys[i][3]):
                    dict_val[key_name] = detect_keys[i][1]
                    dict_temp[key_name] = sum(detect_keys[i][3])
            else:
                dict_val[key_name] = detect_keys[i][1]
                dict_temp[key_name] = sum(detect_keys[i][3])

        # ---------------------------- Create feature list ----------------------------
        feature_list = []

        for i in range(len(detect_keys)):
            feature_list.append(
                [detect_keys[i][0], detect_keys[i][3], detect_keys[i][4]])

        return dict_val, feature_list

    def extract(self, filename, ocr_json=None):

        key_val_list, key_data = self.extract_receipt_amount_keys(
            filename, ocr_json)

        if not key_data:
            return None

        for amount_key in self.amount_key_list:
            if amount_key in key_val_list:
                return key_val_list[amount_key]

        return None
Exemplo n.º 14
0
class ReceiptMerchantExtractor:

    def __init__(self):
        """
            Attach the collaborators used by this class: the Google OCR client,
            the FuncMl helper, a DateExtractor and the ML address extractor.
        """
        self.google_ocr = GoogleOCR()
        self.class_func = FuncMl()
        self.class_date_extractor = DateExtractor()
        self.class_address_extractor = ReceiptAddressExtractorML()

    @staticmethod
    def __mark_merchant_line(text_lines, line_rect_list):
        """
            Check and mark the Merchant lines
        """
        merchant_rect = None
        name_line_list = []
        page_rect = line_rect_list['rect'][0]
        list_no_name = ['welcome', 'thank you', 'customer', 'copy', 'only', '*', 'ticket',
                        '(', ')', ':', 'invoice', '!', 'more', 'congratulation', 'bill']

        for i in range(len(text_lines)):
            # pre-processing of text line
            for j in range(i + 1, len(line_rect_list['text'])):
                if text_lines[i] == line_rect_list['text'][j]:
                    break

            line_rect = line_rect_list['rect'][j]

            text_lines[i] = text_lines[i].replace('Welcome to', '')
            text_lines[i] = text_lines[i].strip('-')

            # check contains of key in list_no_name
            f_check_no_list = False
            for j in range(len(list_no_name)):
                if text_lines[i].lower().__contains__(list_no_name[j]):
                    f_check_no_list = True
                    break

            if f_check_no_list:
                continue

            # check validation of key
            if len(text_lines[i]) <= 2:
                continue
            elif len(name_line_list) > 0 and name_line_list[-1] + 1 != i:
                break
            elif len(name_line_list) > 0 and text_lines[i].__contains__(text_lines[name_line_list[-1]]):
                continue
            elif len(name_line_list) > 2:
                continue
            elif len(name_line_list) > 1 and not text_lines[i].isupper():
                continue
            elif text_lines[i][0] == '#':
                continue
            elif len(CommonRegex(text_lines[i]).dates) > 0:
                continue
            elif len(CommonRegex(text_lines[i]).phones) > 0:
                continue
            elif len(CommonRegex(text_lines[i]).links) > 0:
                continue
            elif len(text_lines[i].replace('@', '').replace('&', '').split()) > 5:
                continue
            elif len(text_lines[i].split()) > 3 and text_lines[i].__contains__('-'):
                continue
            elif text_lines[i].replace('-', '').replace(' ', '').isdigit():  # '305337 - 1'
                continue
            elif len(name_line_list) > 0 and line_rect[1] > 2 * merchant_rect[3] - merchant_rect[1]:
                continue
            elif (line_rect[0] + line_rect[2]) > (page_rect[0] + page_rect[2]) * 1.3:   # check the position
                continue

            name_line_list.append(i)
            merchant_rect = line_rect

        return name_line_list

    def get_address_string(self, ocr_json):

        # --------------------------- Extract Address -------------------------
        ret_address, first_address_line = self.class_address_extractor.extractor(ocr_json)

        # ------------------------ Remove vertical text -----------------------
        line_data = self.class_func.get_line_rect(ocr_json)
        new_ocr_text = self.class_func.remove_vertical_text(ocr_json)
        text_lines = new_ocr_text.splitlines()

        # --------------------------- Extract Merchant ------------------------
        if first_address_line == -1:
            range_merchant = 5
        else:
            range_merchant = first_address_line

        name_line_list = self.__mark_merchant_line(text_lines[:range_merchant], line_data)

        ret_name = ''
        for i in range(len(name_line_list)):
            ret_name += text_lines[name_line_list[i]] + ' '

        if ret_name.__contains__('#') and len(ret_name.split()) > 2:
            ret_name = ret_name[:ret_name.find('#')]

        ret_name = ret_name.strip('.').strip().replace('  ', ' ').replace('&', '')

        if len(ret_name) > 1 and ret_name[0].islower() and ret_name[1:].isupper():
            ret_name = ret_name.upper()

        return [ret_name, ret_address]

    def get_request_google(self, key_string):
        """
            get the respond from google text search request using key_string.

            The query is passed through ``params`` so that it is URL-encoded;
            OCR text containing characters such as '&' or '#' would otherwise
            be cut off or split into extra query parameters.

            :param key_string: free-text search query, or None.
            :return: response body as text, or None when key_string is None.
        """
        if key_string is None:
            return None

        response = requests.post(
            url='https://maps.googleapis.com/maps/api/place/textsearch/json',
            params={'key': self.class_func.google_key, 'query': key_string},
            headers={'Content-Type': 'application/json'})

        return response.text

    def get_request_google_nearby(self, position):
        """
            get the respond from google nearby search request around position.

            :param position: sequence whose first two items are latitude and
                longitude.
            :return: response body as text (search radius is fixed to 500).
        """
        latitude = position[0]
        longitude = position[1]
        endpoint = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location='
        request_url = endpoint + str(latitude) + ',' + str(longitude) + \
            '&key=' + self.class_func.google_key + '&radius=500'

        reply = requests.post(url=request_url, headers={'Content-Type': 'application/json'})
        return reply.text

    def get_request_google_place_detail(self, place_id):
        """
            get the respond from google place details request using place_id.

            :param place_id: place identifier (string).
            :return: response body as text.
        """
        endpoint = 'https://maps.googleapis.com/maps/api/place/details/json?placeid='
        request_url = endpoint + place_id + '&key=' + self.class_func.google_key

        reply = requests.post(url=request_url, headers={'Content-Type': 'application/json'})
        return reply.text

    def extract(self, filename=None, ocr_json=None):
        """
            get some google search data from ocr json data.
        """
        if ocr_json is None:
            ocr_json = self.google_ocr.get_json_google(filename)

        # ------------ Get merchant and address from text, and logo -----------
        ret_logo_key_json = self.google_ocr.get_json_google(filename, detection_type='logo')
        if ret_logo_key_json is not None:
            ret_logo_key = ret_logo_key_json[0]['description']
        else:
            ret_logo_key = ''

        ret_address = self.get_address_string(ocr_json)
        if ret_address is None:
            return ['', '', ret_logo_key, '', '']
        else:
            [ret_name, ret_address] = ret_address

        # --------- extract the merchant info using title and address ---------
        ret_merchant_key = ret_name + ' ' + ret_address
        ret_merchant = self.get_request_google(ret_merchant_key)

        if ret_merchant is not None:
            ret_merchant_json = json.loads(ret_merchant)
            if ret_merchant_json['status'] == 'OK':
                ret_result = ret_merchant_json['results'][0]
                return [ret_result['formatted_address'],
                        ret_result['icon'],
                        ret_result['name'],
                        ret_result['types'],
                        ret_result['place_id']]

        # -------- extract the merchant info using logo + address -------------
        ret_address_info = self.get_request_google(ret_address)

        if ret_address_info is not None:
            ret_address_info_json = json.loads(ret_address_info)
            if ret_address_info_json['status'] == 'OK':
                ret_result = ret_address_info_json['results'][0]
                if len(ret_result['types']) > 1:  # except type=['street']
                    return [ret_result['formatted_address'],
                            ret_result['icon'],
                            ret_result['name'],
                            ret_result['types'],
                            ret_result['place_id']]

        if ret_logo_key != '':
            ret_full_address_info = self.get_request_google(ret_logo_key + ' ' + ret_address)

            if ret_full_address_info is not None:
                ret_address_info_json = json.loads(ret_full_address_info)
                if ret_address_info_json['status'] == 'OK':
                    ret_result = ret_address_info_json['results'][0]
                    return [ret_result['formatted_address'],
                            ret_result['icon'],
                            ret_result['name'],
                            ret_result['types'],
                            ret_result['place_id']]

        # --------- extract the merchant info using only address --------------
        # get position list of address
        if ret_logo_key == '':
            ret_logo_key = ret_name

        list_location = []
        list_place_id = []
        if ret_address_info is not None:
            ret_address_json = json.loads(ret_address_info)
            if ret_address_json['status'] == 'OK':
                for i in range(len(ret_address_json['results'])):
                    list_location.append(ret_address_json['results'][i]['geometry']['location'])
                    list_place_id.append(ret_address_json['results'][i]['place_id'])

        if not list_location:
            return [ret_address, '', ret_logo_key, '', '']

        # get place details using place id
        max_score = 0
        for i in range(len(list_place_id)):
            ret_place = self.get_request_google_place_detail(list_place_id[i])
            ret_place_json = json.loads(ret_place)
            ret_place_address = ret_place_json["result"]["formatted_address"]
            score = fuzz.ratio(ret_address.lower(), ret_place_address.lower())
            max_score = max(max_score, score)

        if len(list_place_id) > 0 and max_score < 70:
            return ['', '', ret_logo_key, '', '']

        # get all buildings around position
        list_building_info = []
        for i in range(len(list_location)):
            ret_buildings = self.get_request_google_nearby([list_location[i]['lat'], list_location[i]['lng']])
            if ret_buildings is not None:
                ret_building_json = json.loads(ret_buildings)
                if ret_building_json['status'] == 'OK':
                    for j in range(len(ret_building_json['results'])):
                        list_building_info.append(ret_building_json['results'][j])

        # get best match result
        if not list_building_info:
            return [ret_address, '', ret_logo_key, '', '']

        max_score = 0
        max_building = None
        for i in range(len(list_building_info)):
            score = fuzz.ratio(ret_name.lower(), list_building_info[i]['name'].lower())
            if score >= max_score:
                max_score = score
                max_building = list_building_info[i]

        if max_score > 50:
            return [max_building['vicinity'],
                    max_building['icon'],
                    max_building['name'],
                    max_building['types'],
                    max_building['place_id']]
        else:
            return [ret_address, '', ret_logo_key, '', '']
Exemplo n.º 15
0
class ReceiptExtractor:
    def __init__(self):
        """Create the helper objects, load the card-type table and reset
        the merchant cache used by ``merchant_extract``.
        """
        self.class_func = FuncMl()
        self.class_receipt_merchant = ReceiptMerchantExtractor()
        self.class_receipt_amount = ReceiptAmountExtractor()
        self.class_field_extractor = FieldExtractor()

        # Card-type table, read from config/card_type.json below my_dir.
        card_type_path = os.path.join(
            self.class_func.my_dir, 'config', 'card_type.json')
        self.card_type = self.class_func.load_json(card_type_path)

        # Values of the most recent merchant lookup; nothing cached yet.
        self.prev_ocr = self.prev_name = self.prev_type = None
        self.prev_address = self.prev_remote_id = None

    def merchant_extract(self, img_file, ocr_json, ret_type):
        if self.prev_ocr == ocr_json:
            ret_address = self.prev_address
            ret_name = self.prev_name
            ret_type_list = self.prev_type
            ret_remote_id = self.prev_remote_id
        else:
            ret_merchant = self.class_receipt_merchant.extract(
                filename=img_file, ocr_json=ocr_json)
            [ret_address, _, ret_name, ret_type_list,
             ret_remote_id] = ret_merchant
            if len(ret_name) > 50:
                ret_name = None
            self.prev_ocr = ocr_json
            self.prev_address = ret_address
            self.prev_remote_id = ret_remote_id
            self.prev_name = ret_name
            self.prev_type = ret_type_list

        if ret_type == 'address':
            return ret_address
        elif ret_type == 'name':
            return ret_name
        elif ret_type == 'type_list':
            return ret_type_list
        elif ret_type == 'remote_id':
            return ret_remote_id
        else:
            return None

    def extract_address(self, filename, ocr_json=None):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        ret = self.merchant_extract(img_list[0], ocr_json, 'address')

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        return ret

    def extract_remote_id(self, filename, ocr_json=None):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        ret = self.merchant_extract(img_list[0], ocr_json, 'remote_id')

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        return ret

    def extract_remote_categories(self, filename, ocr_json=None):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        ret = self.merchant_extract(img_list[0], ocr_json, 'type_list')

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        return ret

    def get_card_info(self, ocr_json):
        card_type = None
        card_pos_list = []

        for i in range(1, len(ocr_json) - 1):
            text1 = ocr_json[i]['description']
            text2 = ocr_json[i]['description'] + ' ' + ocr_json[
                i + 1]['description']

            for card in self.card_type:
                if text1.upper() in self.card_type[card]:
                    card_pos_list.append(i + 1)
                    card_type = card
                    break
                elif text2.upper() in self.card_type[card]:
                    card_pos_list.append(i + 2)
                    card_type = card
                    break

        return card_type, card_pos_list

    def extract_card_type(self, filename, ocr_json=None):
        # ------------------ Get ocr data and remove temp file ----------------
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        # ------------------------Extract the card type ------------------------
        card_type, _ = self.get_card_info(ocr_json)

        return card_type

    def extract_card_number(self, filename, ocr_json=None):
        # ------------------ Get ocr data and remove temp file ----------------
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        # --------------------- Extract the Card Number -----------------------
        for i in range(1, len(ocr_json)):
            text = ocr_json[i]['description']

            if len(text) > 11 and text[-4:].isdigit():
                text_front = ''.join(set(text[1:-5]))

                if text_front == 'X' or text_front == 'x' or text_front == '*':
                    return text[-4:]

            if i > 10 and len(
                    text) == 4 and text.isdigit():  # '* * * * * 4075'
                f_num = True
                for j in range(i - 10, i):
                    if ocr_json[j]['description'] != '*':
                        f_num = False

                if f_num:
                    return text

            if i > 3 and len(
                    text) == 4 and text.isdigit():  # 'XXXX XXXX XXXX 4075'
                f_num = True
                for j in range(i - 3, i):
                    if ocr_json[j]['description'] != 'XXXX':
                        f_num = False

                if f_num:
                    return text

            if i > 3 and len(text) == 4 and text.isdigit():  # 'ending in 4075'
                if ocr_json[i - 2]['description'] == 'ending' and ocr_json[
                        i - 1]['description'].lower() == 'in':
                    return text

        # ---------------- Special case - 'Visa 2345 (Swipe)'------------------
        card_type, card_pos_list = self.get_card_info(ocr_json)

        if card_type is None:
            return None

        for card_pos in card_pos_list:
            # Check text for card number
            if len(ocr_json[card_pos]['description']
                   ) == 4 and ocr_json[card_pos]['description'].isdigit():
                # Check region
                rect1 = self.class_func.get_rect_ocr_data(
                    ocr_json, card_pos - 1)
                rect2 = self.class_func.get_rect_ocr_data(ocr_json, card_pos)

                if rect1[0] + rect2[0] < 2 * rect1[2] and abs(
                        rect1[1] - rect2[1]) < int((rect1[3] - rect1[1]) / 2):
                    return ocr_json[card_pos]['description']

        return None

    def extract_field(self, filename, field_id, ocr_json=None):
        """Extract a single receipt field.

        Handled field ids (from ``constant``): merchant name, amount, tax
        and date.  Tax and date are located with proximity hints through
        ``class_field_extractor.extract_v2``.

        :param filename: receipt file, passed to ``class_func.get_img_list``.
        :param field_id: id of the field to extract.
        :param ocr_json: OCR data to use.  When None it is merged from the
            OCR entries returned by ``get_img_list`` or, if there are none,
            fetched with ``get_json_google_from_jpg`` for the first image.
        :return: ``{"field_id": field_id, "value": value}``; the value is
            None for a field id that is not handled here.
        """
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        try:
            if ocr_json is None:
                if json_list and json_list[0] is not None:
                    ocr_json = self.class_func.merge_ocr_json(json_list)
                else:
                    ocr_json = self.class_func.get_json_google_from_jpg(
                        img_list[0])

            ret = None

            if field_id == constant.RECEIPT_MERCHANT_ID:
                ret = self.merchant_extract(img_list[0], ocr_json, 'name')

            elif field_id == constant.RECEIPT_AMOUNT_ID:
                ret = self.class_receipt_amount.extract(img_list[0], ocr_json)

            elif field_id == constant.RECEIPT_TAX_ID:
                hint_list = [
                    [["proximity", {"type": "same_line_prefix",
                                    "text": "SALES TAX"}]],
                    [["proximity", {"type": "same_line_suffix",
                                    "text": "TAX CA"}]],
                    [["proximity", {"type": "same_line_prefix",
                                    "text": "TAX DUE"}]],
                    [["proximity", {"type": "same_line_prefix",
                                    "text": "Tax"}]],
                ]

                profile_hash = {u'width': 8.5, u'height': 11}
                field_hint = {"data_type": 'currency', "hints": hint_list}
                ret = self.class_field_extractor.extract_v2(
                    img_list[0],
                    profile_hash,
                    field_hint,
                    en_fuzzy=False,
                    ocr_json_data=ocr_json)
                ret = ret[0]['value']

            elif field_id == constant.RECEIPT_DATE_ID:
                hint_list = [
                    [["proximity", {"type": "same_line_prefix",
                                    "text": "Credit Purchase"}]],
                    [],
                ]

                profile_hash = {u'width': 8.5, u'height': 11}
                field_hint = {"data_type": 'date', "hints": hint_list}
                ret = self.class_field_extractor.extract_v2(
                    img_list[0],
                    profile_hash,
                    field_hint,
                    en_fuzzy=False,
                    ocr_json_data=ocr_json,
                    select_first=True)
                ret = ret[0]['value']

            return {"field_id": field_id, "value": ret}
        finally:
            # Remove the temporary files even when OCR or extraction raises.
            for temp_file in temp_list:
                self.class_func.rm_file(temp_file)

    def extract(self, filename, ocr_json=None):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        ret_extract = []
        for i in range(6):
            ret_field = self.extract_field(img_list, i + 1, ocr_json)
            ret_extract.append(ret_field)

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        return ret_extract

    def extract_all_info(self, filename):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if json_list and json_list[0] is not None:
            ocr_json = self.class_func.merge_ocr_json(json_list)
        else:
            ocr_json = self.class_func.get_json_google_from_jpg(img_list[0])

        ret_fields = self.extract(img_list[0], ocr_json)
        ret_address = self.extract_address(img_list[0], ocr_json)
        ret_card_type = self.extract_card_type(img_list[0], ocr_json)
        ret_card_number = self.extract_card_number(img_list[0], ocr_json)
        ret_remote_id = self.extract_remote_id(img_list[0], ocr_json)
        ret_remote_categories = self.extract_remote_categories(
            img_list[0], ocr_json)

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        ret = {
            'date': ret_fields[0]['value'],
            'amount': ret_fields[1]['value'],
            'merchant': ret_fields[4]['value'],
            # 'tax': ret_fields[5]['value'],
            'address': ret_address,
            # 'card_type': ret_card_type,
            # 'card_number': ret_card_number,
            # 'remote_id': ret_remote_id,
            # 'remote_categories': ret_remote_categories
        }

        return ret
Exemplo n.º 16
0
 def __init__(self):
     """Create the helper objects held by this instance: a GoogleOCR
     client, the FuncMl helper, a DateExtractor and a
     ReceiptAddressExtractorML.
     """
     self.google_ocr = GoogleOCR()
     self.class_func = FuncMl()
     self.class_date_extractor = DateExtractor()
     self.class_address_extractor = ReceiptAddressExtractorML()
Exemplo n.º 17
0
class VendorExtractor:
    def __init__(self, mode='vendor', country='US'):
        """Create the helper objects and select the profile for ``mode``.

        :param mode: 'vendor' selects the 'VENDOR1' profile, 'passport' the
            'PASSPORT1' profile; for any other value ``vendor_profile`` is
            left unset.
        :param country: stored as ``self.country``.
        """
        self.class_func = FuncMl()
        self.class_field_extractor = FieldExtractor()
        self.class_text_extractor = TextExtractor()

        self.country = country
        self.mode = mode

        self.vendor_profile_list = self.class_func.load_vendor_profile(mode)

        # Profile key to use for each supported mode.
        for known_mode, profile_key in (('vendor', 'VENDOR1'),
                                        ('passport', 'PASSPORT1')):
            if mode == known_mode:
                self.vendor_profile = self.vendor_profile_list[profile_key]
                break

    def extract(self, filename, field_id=None, ocr_json=None):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        result = []

        if ocr_json is None:
            return result
        else:
            ret_json = self.class_func.get_line_rect(ocr_json)
            ocr_text_lines, ocr_rect_lines = ret_json['text'], ret_json['rect']

        for hint_hash in self.vendor_profile['fields']:
            if field_id is None or hint_hash['field_id'] == field_id:
                if 'key_type' in hint_hash:
                    key_type = hint_hash['key_type']
                else:
                    key_type = ''

                ret_val_text = self.class_text_extractor.extract_v1(
                    ocr_text_lines, ocr_rect_lines, hint_hash["data_type"],
                    hint_hash["keys"], key_type)

                if ret_val_text is None:
                    ret_field = None
                    ret_field = self.class_field_extractor.extract_v2(
                        img_list[0],
                        self.vendor_profile, [hint_hash],
                        ocr_json_data=ocr_json,
                        en_fuzzy=False,
                        country=self.country)[0]
                else:
                    ret_field = {
                        'field_id': hint_hash["field_id"],
                        'field_name': hint_hash['name'],
                        'value': ret_val_text
                    }

                result.append(ret_field)

        if self.mode == 'vendor':
            ret_section = []
            for i in range(len(ocr_text_lines)):
                text_line = ocr_text_lines[i]
                if len(text_line) > 6 and text_line[:6].isdigit(
                ) and text_line[6] == ':' and text_line[:6] not in ret_section:
                    ret_section.append(text_line[:6])

            ret_field = {
                'field_id': 0,
                'field_name': 'Section',
                'value': ret_section
            }
            result.append(ret_field)

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        return result

    def get_date_mrp(self, text, mode='birth'):
        """Format a 'YYMMDD' string as 'YYYY-MM-DD'.

        :param text: date text; characters 0-1 are the year, 2-3 the month
            and the rest the day.
        :param mode: 'expire' always prefixes the year with '20'.  'birth'
            prefixes '20' when the two-digit year is below the current
            two-digit year and '19' otherwise.  Any other value leaves the
            year at two digits.
        :return: the parts joined with '-'.
        """
        year, month, day = text[:2], text[2:4], text[4:]

        if mode == 'expire':
            year = '20' + year
        elif mode == 'birth':
            this_year = str(datetime.today().year)[2:]
            century = '20' if int(year) < int(this_year) else '19'
            year = century + year

        return '-'.join([year, month, day])

    def extract_MRP(self, filename, ocr_json=None, mode='passport'):
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)

        if ocr_json is None:
            if json_list and json_list[0] is not None:
                ocr_json = self.class_func.merge_ocr_json(json_list)
            else:
                ocr_json = self.class_func.get_json_google_from_jpg(
                    img_list[0])

        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

        result = []

        if ocr_json is None:
            return result
        else:
            ret_json = self.class_func.get_line_rect(ocr_json)
            ocr_text_lines, ocr_rect_lines = ret_json['text'], ret_json['rect']

        line1 = ''
        line2 = ''
        for i in range(len(ocr_text_lines) - 1):
            if ocr_text_lines[i].count('<') > 4 and ocr_text_lines[
                    i + 1].count('<') > 4:
                line1 = ocr_text_lines[i]
                line2 = ocr_text_lines[i + 1]
                break

        if line1 == '':
            return result

        line1 = line1.replace(' ', '')
        line2 = line2.replace(' ', '')

        if len(line1) != 44 or len(line2) != 44:
            return result

        # print line1
        # print line2

        if mode == 'passport':
            if line1[0] != 'P':
                if line2[0] == 'P':
                    line1, line2 = line2, line1
                else:
                    return result

        ret_type = line1[1]
        ret_country = line1[2:5]
        ret_name = line1[5:]

        if ret_type != '<':
            result.append({'field_name': 'Type', 'value': ret_type})

        result.append({'field_name': 'Country', 'value': ret_country})

        ret_name = ret_name.replace('<', ' ').strip()
        if ret_name.__contains__('  '):
            result.append({
                'field_name': 'Surname',
                'value': ret_name.split('  ')[0]
            })
            result.append({
                'field_name': 'Given name',
                'value': ret_name.split('  ')[1]
            })
        else:
            result.append({'field_name': 'Full Name', 'value': ret_name})

        ret_no = line2[:9]
        ret_national = line2[10:13]
        ret_date_birth = line2[13:19]
        ret_sex = line2[20]
        ret_date_expire = line2[21:27]
        ret_personal = line2[28:42]
        # print ret_no
        ret_no = ret_no.replace('<', '')
        result.append({'field_name': 'Passport Number', 'value': ret_no})

        # print ret_national
        ret_national = ret_national.replace('<', '')
        if ret_national != '':
            result.append({'field_name': 'Nationality', 'value': ret_national})

        result.append({
            'field_name': 'Date of Birth',
            'value': self.get_date_mrp(ret_date_birth, 'birth')
        })
        result.append({
            'field_name': 'Date of Expire',
            'value': self.get_date_mrp(ret_date_expire, 'expire')
        })

        if ret_sex != '<':
            result.append({'field_name': 'Sex', 'value': ret_sex})

        ret_personal = ret_personal.replace('<', '')
        result.append({'field_name': 'Personal Number', 'value': ret_personal})

        return result