def __init__(self):
    """Create the helper, cache the state-abbreviation list and start with empty x/y lists."""
    helper = FuncMl()
    self.class_func = helper
    # get_state_abbreviation_us() is called only after class_func is set.
    self.list_state_abbr = self.get_state_abbreviation_us()
    self.x_list, self.y_list = [], []
def __init__(self):
    """Create the helper and load ``config/currency_fix.json`` into ``self.fix_json``."""
    helper = FuncMl()
    self.class_func = helper
    # The config path is resolved relative to the helper's ``my_dir``.
    fix_path = os.path.join(helper.my_dir, 'config/currency_fix.json')
    self.fix_json = helper.load_json(fix_path)
def __init__(self, mode='vendor', country='US'):
    """Create the helper/extractor objects and select the default profile for ``mode``.

    :param mode: profile family passed to ``load_vendor_profile``; ``'vendor'``
        selects the ``'VENDOR1'`` entry and ``'passport'`` the ``'PASSPORT1'`` entry.
    :param country: stored unchanged on ``self.country``.
    """
    self.class_func = FuncMl()
    self.class_field_extractor = FieldExtractor()
    self.class_text_extractor = TextExtractor()
    self.country, self.mode = country, mode
    self.vendor_profile_list = self.class_func.load_vendor_profile(mode)
    # Any other mode leaves ``self.vendor_profile`` unset.
    for known_mode, profile_key in (('vendor', 'VENDOR1'), ('passport', 'PASSPORT1')):
        if mode == known_mode:
            self.vendor_profile = self.vendor_profile_list[profile_key]
            break
def __init__(self):
    """Create the helper and sub-extractors, load the card-type table and clear the ``prev_*`` fields."""
    self.class_func = FuncMl()
    self.class_receipt_merchant = ReceiptMerchantExtractor()
    self.class_receipt_amount = ReceiptAmountExtractor()
    self.class_field_extractor = FieldExtractor()

    card_type_path = os.path.join(self.class_func.my_dir, 'config', 'card_type.json')
    self.card_type = self.class_func.load_json(card_type_path)

    # All "previous" fields start as None.
    self.prev_ocr = self.prev_name = self.prev_type = None
    self.prev_address = self.prev_remote_id = None
def __init__(self):
    """Attach a fresh ``FuncMl`` helper as ``self.class_func``."""
    helper = FuncMl()
    self.class_func = helper
class Proximity:
    """Locate hint words in OCR output and collect the text found near them.

    ``ocr_json`` is indexed as a list of word entries, each carrying a
    ``'description'`` string and ``'boundingPoly'['vertices']``.  Every scan
    below starts at index 1, i.e. entry 0 is never treated as a word
    (presumably it is the full-page entry -- confirm against the OCR source).
    """

    def __init__(self):
        """Create the shared ``FuncMl`` helper used for rectangle operations."""
        self.class_func = FuncMl()

    @staticmethod
    def _is_no_hint_pair(ocr_json, index):
        """Return True when words ``index`` and ``index + 1`` form a phrase listed in ``constant.no_hint_list``."""
        comb_text = (ocr_json[index]['description'].lower() + ' ' +
                     ocr_json[index + 1]['description'].lower())
        return comb_text in constant.no_hint_list

    def _merged_rect(self, ocr_json, start, count):
        """Return the merged rectangle of ``count`` (>= 1) consecutive words beginning at ``start``."""
        rect = self.get_rect(ocr_json[start]['boundingPoly']['vertices'])
        for offset in range(1, count):
            rect_next = self.get_rect(ocr_json[start + offset]['boundingPoly']['vertices'])
            rect = self.class_func.merge_rect(rect, rect_next)
        return rect

    def get_position(self, dst_word, ocr_json, data_type, en_fuzzy=True):
        """Find the rectangles where the hint phrase ``dst_word`` occurs.

        Three passes are tried in order, each only if the previous one found
        nothing: exact word-by-word match, substring match per word, and
        (when ``en_fuzzy``) a fuzzy ratio over windows of 1-4 words.

        :param dst_word: hint phrase; leading ``(`` and trailing ``: ) #`` are
            split into separate tokens and a trailing ``.`` is dropped.
        :param ocr_json: list of OCR word entries (index 0 is skipped).
        :param data_type: when ``'currency'``, an exact match directly followed
            by an alphabetic word on the same line is only used as a fallback.
        :param en_fuzzy: enable the fuzzy pass.
        :return: list of ``[x1, y1, x2, y2]`` rectangles, or ``None`` if the
            hint is empty or not found.
        """
        # An empty hint used to raise IndexError on dst_word[0]; nothing can match it.
        if not dst_word:
            return None

        pos_list = []
        pos_list_temp = []

        if dst_word[0] == '(':
            dst_word = '( ' + dst_word[1:]

        last_char = dst_word[-1]
        if last_char in (':', ')', '#'):
            dst_word = dst_word[:-1] + ' ' + last_char
        elif last_char == '.':
            dst_word = dst_word[:-1]

        dst_word_list = dst_word.split()
        # A hint such as '.' leaves no tokens; the scans below would then
        # "match" everywhere and finally index past the end of ocr_json.
        if not dst_word_list:
            return None

        n_words = len(ocr_json)
        n_hint = len(dst_word_list)

        # ---------------- check strict AND logic of dst words ------------------
        for i in range(1, n_words - n_hint + 1):
            # The pair check needs a following word, so it is skipped on the last word.
            if i < n_words - 1 and self._is_no_hint_pair(ocr_json, i):
                continue

            if not all(ocr_json[i + j]['description'].lower() == word.lower()
                       for j, word in enumerate(dst_word_list)):
                continue

            rect = self._merged_rect(ocr_json, i, n_hint)

            # check existence of next word
            rect_last = self.get_rect(ocr_json[i + n_hint - 1]['boundingPoly']['vertices'])
            if n_words > i + n_hint:
                rect_last_next = self.get_rect(ocr_json[i + n_hint]['boundingPoly']['vertices'])
                word_last_next = ocr_json[i + n_hint]['description']
                # Horizontal gap is compared against twice the average character width of the hint.
                if abs(rect_last_next[0] - rect_last[2]) < (rect[2] - rect[0]) / len(dst_word) * 2 and \
                        abs(rect_last_next[1] - rect_last[1]) < 10 and \
                        data_type == 'currency' and word_last_next.isalpha():
                    pos_list_temp.append(rect)
                else:
                    pos_list.append(rect)
            else:
                pos_list.append(rect)

        if not pos_list and pos_list_temp:
            pos_list = pos_list_temp

        # ---------------- check smooth AND logic of dst words ------------------
        if not pos_list:
            for i in range(1, n_words - n_hint):
                if self._is_no_hint_pair(ocr_json, i):
                    continue
                if all(word.lower() in ocr_json[i + j]['description'].lower()
                       for j, word in enumerate(dst_word_list)):
                    pos_list.append(self._merged_rect(ocr_json, i, n_hint))

        # -------------------- check fuzzy logic case ------------------------
        if en_fuzzy and not pos_list:
            max_match = 0
            max_pos = None
            # Window sizes 1..4.  The former size-0 window compared an empty
            # string (never a best match) and left its rectangle unbound.
            for k in range(1, 5):
                for i in range(1, n_words - k - 1):
                    if self._is_no_hint_pair(ocr_json, i):
                        continue
                    text_comb = ''.join(ocr_json[i + j]['description'] for j in range(k))
                    match = fuzz.ratio(text_comb.lower(), dst_word.lower())
                    if max_match < match:
                        max_match = match
                        max_pos = self._merged_rect(ocr_json, i, k)
            if max_match > 82:
                pos_list.append(max_pos)

        if pos_list:
            return pos_list
        return None

    @staticmethod
    def get_rect(parent):
        """Return ``[x1, y1, x2, y2]`` from vertices 0 and 2; a missing ``x``/``y`` key counts as 0."""
        top_left = parent[0]
        bottom_right = parent[2]
        return [top_left['x'] if 'x' in top_left else 0,
                top_left['y'] if 'y' in top_left else 0,
                bottom_right['x'] if 'x' in bottom_right else 0,
                bottom_right['y'] if 'y' in bottom_right else 0]

    @staticmethod
    def expand_region(old_region, proximity_type, margin, rx, ry):
        """Return the search region ``[x1, y1, x2, y2]`` implied by a hint rectangle.

        :param old_region: hint rectangle ``[x1, y1, x2, y2]``.
        :param proximity_type: ``'above_text'`` searches below the hint,
            ``'below_text'`` above it, ``'same_line_prefix'`` to its right and
            any other value to its left.
        :param margin: fraction of the hint width (vertical types) or height
            (same-line types) added on both sides of the region.
        :param rx: horizontal scale; ``rx / 30`` is the gap left next to the hint.
        :param ry: vertical scale; ``ry / 30`` is the gap left above/below the hint.
        """
        [x1, y1, x2, y2] = old_region

        if proximity_type == 'above_text':
            nx1 = x1 - int(margin * (x2 - x1))
            # Extend at least 130 units to the right of the hint.
            nx2 = x2 + max(int(margin * (x2 - x1)), 130)
            ny1 = y2 + ry / 30
            ny2 = 10000
        elif proximity_type == 'below_text':
            nx1 = x1 - int(margin * (x2 - x1))
            nx2 = x2 + int(margin * (x2 - x1))
            ny1 = 1
            ny2 = y1 - ry / 30
        elif proximity_type == 'same_line_prefix':
            nx1 = x2 + rx / 30
            nx2 = 10000
            ny1 = y1 - int(margin * (y2 - y1))
            ny2 = y2 + int(margin * (y2 - y1))
        else:
            nx1 = 1
            nx2 = x1 - rx / 30
            ny1 = y1 - int(margin * (y2 - y1))
            ny2 = y2 + int(margin * (y2 - y1))

        return [nx1, ny1, nx2, ny2]

    def get_data(self, ocr_json, proximity, data_type, rate_x, rate_y, en_fuzzy=True):
        """Collect the OCR words lying in the regions implied by all proximity hints.

        :param ocr_json: list of OCR word entries (index 0 is skipped).
        :param proximity: list of dicts with ``'text'`` (hint phrase) and
            ``'type'`` (see :meth:`expand_region`).
        :param data_type: forwarded to :meth:`get_position`.
        :param rate_x: horizontal scale used for margins and same-line joining.
        :param rate_y: vertical scale used for margins and same-line joining.
        :param en_fuzzy: forwarded to :meth:`get_position`.
        :return: ``(text, hint_pos_list, word_list, word_rects)``; four empty
            strings when no hint could be located.
        """
        hint_pos_list = []
        key_pos = []

        for hint in proximity:
            # ---------------------- Get the [proximity][text] region ---------------------------
            data_type_pos = self.get_position(hint['text'], ocr_json, data_type, en_fuzzy=en_fuzzy)
            if data_type_pos is None:
                continue

            # --------------- Expand the above region using [proximity][type] --------------------
            proximity_type = hint['type']
            new_region1 = []
            new_region2 = []
            hint_pos1 = []
            hint_pos2 = []

            for hint_rect in data_type_pos:
                # Thin strip beside the hint: right of it for suffix hints, left of it otherwise.
                if proximity_type == 'same_line_suffix':
                    margin_rect = [hint_rect[2] + 2, hint_rect[1] + 2,
                                   hint_rect[2] + int(rate_x / 5), hint_rect[3] - 2]
                else:
                    margin_rect = [hint_rect[0] - int(rate_x / 4), hint_rect[1] + 2,
                                   hint_rect[0] - 2, hint_rect[3] - 2]

                f_overlap = False
                for k in range(1, len(ocr_json)):
                    if self.class_func.check_overlap_rect(
                            margin_rect, self.class_func.get_rect_ocr_data(ocr_json, k)):
                        f_overlap = True
                        break

                # expand region of hint pos
                if proximity_type == 'above_text' or proximity_type == 'below_text':
                    margin = 1.2
                else:
                    margin = 0.0
                region = self.expand_region(hint_rect, proximity_type, margin, rate_x, rate_y)

                # Occurrences with a free strip beside them are preferred over crowded ones.
                if f_overlap:
                    hint_pos2.append([proximity_type, hint_rect])
                    new_region2.append(region)
                else:
                    hint_pos1.append([proximity_type, hint_rect])
                    new_region1.append(region)

            hint_pos_list.append(hint_pos1 if len(hint_pos1) > 0 else hint_pos2)
            key_pos.append(new_region1 if len(new_region1) > 0 else new_region2)

        if len(key_pos) == 0:
            return '', '', '', ''

        # ----------------- Get new matching result from expanded region ---------------------
        new_text = ''
        new_text_list = []
        new_text_pos = []
        rect_prev = None
        overlaps = self.class_func.check_overlap_rect

        for i in range(1, len(ocr_json)):
            pos = self.get_rect(ocr_json[i]['boundingPoly']['vertices'])

            # A word matches when, for every hint, it overlaps at least one of
            # that hint's regions.  Same result as the former per-count
            # itertools.product checks, but no longer limited to four hints.
            f_match = all(any(overlaps(region, pos) for region in regions)
                          for regions in key_pos)
            if not f_match:
                continue

            text_item = ocr_json[i]['description']
            text_item_prev = ocr_json[max(i - 1, 0)]['description']
            rect = ocr_json[i]['boundingPoly']['vertices']
            x1 = self.class_func.get_field_int(rect[0], 'x')
            y1 = self.class_func.get_field_int(rect[0], 'y')
            if rect_prev is None:
                x2 = 0
                y2 = 0
            else:
                # Right edge (vertex 1) and top (vertex 0) of the previous matched word.
                x2 = self.class_func.get_field_int(rect_prev[1], 'x')
                y2 = self.class_func.get_field_int(rect_prev[0], 'y')
            rect_prev = rect

            if abs(x1 - x2) < rate_x / 5 and abs(y1 - y2) < rate_y / 30:
                # Same line as the previous word: glue punctuation, else join with a space.
                if text_item == '/' or (len(new_text) > 0 and new_text[-1] == '/'):
                    new_text += text_item
                elif text_item == '-' or (len(new_text) > 0 and new_text[-1] == '-'):
                    new_text += text_item
                elif text_item == '.' and len(text_item_prev) == 1 and text_item_prev.isupper():
                    # ignore "P ."
                    continue
                elif text_item == '.' or (len(new_text) > 0 and new_text[-1] == '.'):
                    new_text += text_item
                elif text_item == ',' or (len(new_text) > 0 and new_text[-1] == ','):
                    new_text += text_item
                elif len(new_text) > 0 and new_text[-1] == '$':
                    new_text += text_item
                else:
                    new_text += (' ' + text_item)
            else:
                if new_text == '':
                    new_text += text_item
                else:
                    new_text += ('\n' + text_item)

            new_text_list.append(text_item)
            new_text_pos.append(pos)

        return new_text, hint_pos_list, new_text_list, new_text_pos
class CurrencyExtractor:
    """Recognise currency amounts in OCR text and normalise them to ``'%.2f'`` strings."""

    def __init__(self):
        """Create the helper and load the OCR fix-up table from ``config/currency_fix.json``."""
        self.class_func = FuncMl()
        self.fix_json = self.class_func.load_json(
            os.path.join(self.class_func.my_dir, 'config/currency_fix.json'))

    def is_currency(self, text):
        """Parse ``text`` as a single currency amount.

        Handles common OCR damage, e.g. ``'$32.452.10'`` -> ``32452.10``,
        ``'$32,45'`` -> ``32.45``, ``'$164.04.'`` -> ``164.04`` and
        ``'16.404:12USD'`` -> ``16404.12``.  Parentheses, a leading/trailing
        ``-`` or a trailing ``CR`` mark the amount as negative.

        :param text: candidate string (spaces are ignored).
        :return: ``[has_dollar_mark, 'value']`` with the value formatted to two
            decimals, or ``None`` when ``text`` is not a finite amount.
        """
        # ------------------------ text pre-processing ------------------------
        text = text.replace(' ', '')
        if text[-3:] == 'USD':
            text = '$' + text[:len(text) - 3].replace(':', '.')
        elif text[:3] == 'USD':
            text = '$' + text[3:len(text)]

        # Separator positions; the final character is deliberately not considered
        # so that a trailing '.' (e.g. '$164.04.') is not taken as the decimal point.
        pos_dot = [i for i in range(len(text) - 1) if text[i] == '.' or text[i] == ',']

        if len(pos_dot) > 0:
            last_sep = pos_dot[-1]
            # A final ',' not followed by exactly three characters is a misread decimal point.
            if len(text[last_sep + 1:]) != 3 and text[last_sep] == ',':
                char_last = '.'
            else:
                char_last = text[last_sep]

            if char_last == '.':
                text = text[:last_sep].replace(',', '').replace('.', '') + \
                    char_last + text[last_sep + 1:]
            else:
                text = text.replace(',', '').replace('.', '')

        # ----------------- Get valid text using '$' character ----------------
        #   ab$13.2 -> $13.2,   -$7.6 -> -$7.6
        # ---------------------------------------------------------------------
        if '$' in text:
            dollar_pos = text.find('$')
            if dollar_pos > 0 and text[dollar_pos - 1] == '-':
                text = text[dollar_pos - 1:]
            else:
                text = text[dollar_pos:]

        if text == '':
            return None

        # ------------------ Decide of positive or negative -------------------
        f_pos = False
        if len(text) > 2:
            if text[0] == '(' and text[-1] == ')':
                currency_data = text[1:-1]
            elif text[1] == '(' and text[-1] == ')':
                currency_data = text[0] + text[2:-1]
            elif text[-2:].upper() == 'CR':
                currency_data = text[:-2]
            elif text[0] == '-':
                currency_data = text[1:]
            elif text[-1] == '-':
                currency_data = text[:-1]
            else:
                currency_data = text
                f_pos = True
        else:
            currency_data = text
            f_pos = True

        # -------------------- Check existence dollar mark --------------------
        f_dollar = True
        if currency_data[0] == '$':
            currency_data = currency_data[1:]
            # Drop trailing non-digit characters after the amount.
            while currency_data and not currency_data[-1].isdigit():
                currency_data = currency_data[:-1]
        elif currency_data[0] == 'S':
            # 'S' is accepted as a misread '$'.
            currency_data = currency_data[1:]
        else:
            f_dollar = False

        # ------------------------- Convert to value --------------------------
        try:
            # some error correction ('99.G8 -> 99.08')
            if '.' in currency_data:
                for fix_key in self.fix_json:
                    if currency_data.count(str(fix_key)) == 1:
                        currency_data = currency_data.replace(
                            str(fix_key), str(self.fix_json[fix_key]))

            ret = round(float(currency_data), 2)
        except ValueError:
            return None

        # float() also parses the words 'nan', 'inf' and 'infinity'; those are
        # ordinary OCR tokens, not amounts, so non-finite results are rejected.
        if ret != ret or abs(ret) == float('inf'):
            return None

        if not f_pos:
            ret = -ret
        return [f_dollar, '{:.2f}'.format(ret)]

    def extract(self, text):
        """Return every amount found in ``text``, dollar-marked amounts first.

        Each line is first tried as a whole; if that fails its
        whitespace-separated words are tried individually.

        :param text: multi-line OCR text.
        :return: list of ``'%.2f'`` strings.
        """
        result_value = []
        result_value_dollar = []

        def collect(parsed):
            """Store a successful ``is_currency`` result in the matching list."""
            if parsed[0]:
                result_value_dollar.append(parsed[1])
            else:
                result_value.append(parsed[1])

        for line in text.split('\n'):
            # ---------------- for single line --------------------
            ret = self.is_currency(line)
            if ret is not None:
                collect(ret)
                continue

            # ------------------ for single word -------------------
            word_list = line.split()
            if len(word_list) == 1 or '%' in word_list:  # ignore `5.4%`, `(7%)`
                continue
            for word in word_list:
                ret = self.is_currency(word)
                if ret is not None:
                    collect(ret)

        return result_value_dollar + result_value
from Categorization import VendorExtractor from Categorization import FuncMl import sys import os if len(sys.argv) >= 2: src_name = sys.argv[1] else: src_name = 'sample_passport/p2.jpg' class_passport = VendorExtractor('passport') class_func = FuncMl() ret = class_passport.extract_MRP(src_name) if not ret: ret = class_passport.extract(src_name) # ret = class_passport.extract(src_name, 9) ret_parse = {} for i in range(len(ret)): ret_parse[ret[i]['field_name']] = ret[i]['value'] if ret_parse['Date of Birth'] == ret_parse['Date of Issue']: ret_parse['Date of Birth'] = None # --------------- Save and Display Result ------------------- str_path = os.path.split(src_name) data = [['Name', 'Value']] for key in sorted(ret_parse.iterkeys()):
def __init__(self, country='US'):
    """Store ``country`` and create the helper plus the currency and date extractors.

    :param country: stored unchanged on ``self.country``.
    """
    self.country = country
    self.class_func = FuncMl()
    self.currency_extractor = CurrencyExtractor()
    self.date_extractor = DateExtractor()
class ReceiptAddressExtractorML:
    """Pick address lines out of receipt OCR text using street keywords and state names."""

    def __init__(self):
        """Create the helper, load the state list and start with empty x/y lists."""
        self.class_func = FuncMl()
        self.list_state_abbr = self.get_state_abbreviation_us()
        self.x_list = []
        self.y_list = []

    def get_state_abbreviation_us(self):
        """Return the de-duplicated, lower-cased values of column 3 of ``config/us_postal_codes.csv``.

        The first CSV row is skipped and ``'california'`` is always added.
        """
        us_postal_data = self.class_func.load_csv(
            os.path.join(self.class_func.my_dir, 'config/us_postal_codes.csv'))

        list_state_abbr = [us_postal_data[i][3].lower()
                           for i in range(1, len(us_postal_data))]
        list_state_abbr.append('california')

        return self.class_func.remove_duplicate(list_state_abbr)

    def __mark_address_line(self, text_lines):
        """Return the indices of the lines in ``text_lines`` that look like address lines.

        A line is marked when it contains a street keyword, or a state name
        together with a number (on the same line, or as a 4-5 digit next line).
        Line 0 is never examined.  If exactly one line is marked, the line
        before it is added as well when it starts with a number and has at
        least three words.

        :param text_lines: list of text lines.
        :return: list of line indices in ascending order.
        """
        mark_list = []
        key_address = [
            'street', 'road', 'rd', 'ave', 'st', 'way', 'city', 'dr', 'ste',
            'floor', 'station', 'airport', 'mall', 'center', 'blvd',
            'expressway'
        ]
        no_key_list = [':', '%', ' ID.', '.com', 'Visa ending']

        for i in range(1, len(text_lines)):
            text_line = text_lines[i].lower()
            word_line = text_line.replace(',', ' ').split()

            # Check no_key (case-sensitive, against the original line)
            if any(no_key in text_lines[i] for no_key in no_key_list):
                continue

            # Check address keys
            f_digit = False
            f_key1 = False
            f_key2 = False
            for word in word_line:
                bare_word = word.strip('.').strip(',')
                if word.replace('-', '').isdigit():
                    f_digit = True
                if bare_word in key_address:
                    f_key1 = True
                if bare_word in self.list_state_abbr:
                    f_key2 = True

            if f_key1:
                # `First Street White Luncheon Napkins 1500 ch`: a keyword whose
                # first number only appears at word 5 or later (or never, in a
                # line of 5+ words) is treated as an item description.
                first_digit = next(
                    (idx for idx, word in enumerate(word_line) if word.isdigit()),
                    len(word_line) - 1)
                if first_digit >= 4:
                    f_key1 = False

            if not f_key2:
                # Multi-word state names cannot match a single word; search the whole line.
                f_key2 = any(len(state.split()) > 1 and state.lower() in text_line
                             for state in self.list_state_abbr)

            if f_key1:
                mark_list.append(i)
            elif f_key2:
                if f_digit:
                    mark_list.append(i)
                elif i + 1 < len(text_lines) and text_lines[i + 1].isdigit() and \
                        3 < len(text_lines[i + 1]) < 6:
                    mark_list.append(i)
                    mark_list.append(i + 1)

        # check previous line if address is 1 line: '1298 Montague Expw \nSan Jose CA 95131'
        if len(mark_list) == 1 and mark_list[0] > 0:
            temp_word = text_lines[mark_list[0] - 1].split()
            # Length is tested first: a blank previous line yields no words and
            # reading temp_word[0] used to raise IndexError.
            if len(temp_word) >= 3 and temp_word[0].isdigit():
                mark_list.insert(0, mark_list[0] - 1)

        return mark_list

    def __detect_line_item_section(self, ocr_json):
        """Return the index of the line just before the first line below a row of tall merged columns, or -1.

        Word rectangles that are vertically adjacent and horizontally aligned
        are merged; more than two resulting tall rectangles starting at about
        the same height are taken as a column block.
        """
        # ------------------- Get rect of individual words --------------------
        rects = []
        total_height = 0
        for i in range(1, len(ocr_json)):
            word_rect = self.class_func.get_rect_ocr_data(ocr_json, i)
            total_height += (word_rect[3] - word_rect[1])
            rects.append(word_rect)

        # ------------------- Merge rects of top/bottom words -----------------
        # Tolerance: half the average word height.
        char_h = int(total_height / len(ocr_json) / 2)
        while True:
            f_merge = False
            for i in range(len(rects) - 20):
                for j in range(i + 1, i + 20):
                    if abs(rects[i][0] - rects[j][0]) < char_h and abs(rects[i][2] - rects[j][2]) < char_h and \
                            (abs(rects[i][1] - rects[j][3]) < char_h or abs(rects[i][3] - rects[j][1]) < char_h):
                        rects[i] = self.class_func.merge_rect(
                            rects[i], rects[j])
                        rects.pop(j)
                        f_merge = True
                        break
                # The list changed; restart the scan from the beginning.
                if f_merge:
                    break
            if not f_merge:
                break

        # ------------------- Detect the long height rects --------------------
        long_rect = [rect for rect in rects if rect[3] - rect[1] > char_h * 10]

        if len(long_rect) > 2:
            for i in range(len(long_rect)):
                same_line_cnt = 0
                for j in range(len(long_rect)):
                    if abs(long_rect[i][1] - long_rect[j][1]) < char_h:
                        same_line_cnt += 1

                if same_line_cnt > 2:
                    line_data = self.class_func.get_line_rect(ocr_json)
                    for j in range(len(line_data['rect'])):
                        if line_data['rect'][j][1] >= long_rect[i][3]:
                            return j - 1

        return -1

    def get_features_address(self, ocr_json):
        """Group consecutive address lines into ``[start_line, line_count, text]`` entries.

        The last group is only kept when its text is 11-59 characters long;
        earlier groups are kept unconditionally.
        """
        new_ocr_text = self.class_func.remove_vertical_text(ocr_json)
        text_lines = new_ocr_text.splitlines()
        address_line_list = self.__mark_address_line(text_lines)

        mark_address_list = []
        if address_line_list:
            mark_start = address_line_list[0]
            mark_len = 1
            text_address = text_lines[address_line_list[0]]

            for i in range(1, len(address_line_list)):
                if address_line_list[i] == address_line_list[i - 1] + 1:
                    # Directly follows the previous marked line: same address.
                    mark_len += 1
                    text_address += ' ' + text_lines[address_line_list[i]]
                else:
                    mark_address_list.append(
                        [mark_start, mark_len, text_address])
                    mark_start = address_line_list[i]
                    mark_len = 1
                    text_address = text_lines[address_line_list[i]]

            if 10 < len(text_address) < 60:
                mark_address_list.append([mark_start, mark_len, text_address])

        return mark_address_list

    def extractor(self, ocr_json):
        """Return ``(address_text, first_address_line)``; ``('', -1)`` when no address is found."""
        mark_address_list = self.get_features_address(ocr_json)

        first_address_line = -1
        if mark_address_list:
            first_address_line = mark_address_list[0][0]
            ret_address = mark_address_list[0][2]
        else:
            ret_address = ''

        return ret_address, first_address_line
class DateExtractor:
    """Recognise dates in OCR text and normalise them to a fixed string format."""

    def __init__(self):
        """Create the shared ``FuncMl`` helper (used by ``extract`` for ``text_clean``)."""
        self.class_func = FuncMl()

    @staticmethod
    def get_pre_digit(text, reverse=False):
        """Return the leading run of digits of ``text`` (the trailing run when ``reverse``).

        ``text`` is returned unchanged when it is empty, all digits, or does
        not start (or, with ``reverse``, end) with a digit.
        """
        if text == '':
            return text

        # Work on the reversed string so the same prefix scan finds a trailing run.
        if reverse:
            text = text[::-1]

        if not text.isdigit() and text[0].isdigit():
            i = 0
            for i in range(len(text)):
                if not text[i].isdigit():
                    break
            text = text[:i]

        if reverse:
            text = text[::-1]

        return text

    @staticmethod
    def split_text(text):
        """
            Split the text as digital and chars.
            for example: '14Jul18' => ['14', 'Jul', '18']

            Characters are grouped into runs of digits, letters and "other";
            the final run is always appended, so '' yields [''].
        """
        char_prev_st = 0
        char_list = ''
        txt_date = []
        for i in range(len(text)):
            # 1 = digit, 2 = letter, 3 = anything else
            if text[i].isdigit():
                char_st = 1
            elif text[i].isalpha():
                char_st = 2
            else:
                char_st = 3

            if char_prev_st != char_st and char_list != '':
                txt_date.append(char_list)
                char_list = ''

            char_list += text[i]
            char_prev_st = char_st

        txt_date.append(char_list)

        return txt_date

    def is_date(self, text, country='US'):
        """Try to parse ``text`` as one date.

        :param text: candidate token (may contain '/', '-', '.', ',' or "'" separators).
        :param country: 'NL' selects ``constant.month_list_full_NL`` and the
            ``YYYY-MM-DD`` output format; 'INDIA' and 'NL' prefer day-first
            order when both numbers could be a month.
        :return: ``[date_string, score]`` or ``None``.  The score starts at 10,
            is lowered for guessed splits and two-digit years, and raised when
            a month name is recognised.
        """
        month_list = constant.month_list
        if country == 'NL':
            month_list_full = constant.month_list_full_NL
        else:
            month_list_full = constant.month_list_full

        score = 10
        # Split into parts on the first separator kind present in the text.
        if text.__contains__('/'):
            txt_date = text.split('/')
        elif text.__contains__('-'):
            txt_date = text.split('-')
        elif text.__contains__('.') or text.__contains__(','):
            txt_date = text.replace('.', ' ').replace(',', ' ').split()
            score -= 1
        elif text.__contains__("'"):
            txt_date = text.split("'")
        else:
            txt_date = self.split_text(text)

        if len(txt_date) == 1:
            # No separator found: cut the token at fixed positions by its length.
            score -= 2
            if txt_date[0].isdigit():
                if len(text) == 6:
                    txt_date[0] = text[0]
                    txt_date.append(text[1])
                    txt_date.append(text[2:])
                elif len(text) == 7:
                    if text[0] == '0':
                        txt_date[0] = text[0:2]
                        txt_date.append(text[2])
                    else:
                        txt_date[0] = text[0]
                        txt_date.append(text[1:3])
                    txt_date.append(text[3:])
                elif len(text) == 8:
                    txt_date[0] = text[0:2]
                    txt_date.append(text[2:4])
                    txt_date.append(text[4:])
                elif len(text) == 10:
                    # Positions 2 and 5 are dropped (treated as separators).
                    txt_date[0] = text[0:2]
                    txt_date.append(text[3:5])
                    txt_date.append(text[6:])
                else:
                    return None
            else:
                if len(text) == 10:  # 'DECO1,2017'
                    if text[0:3].isalpha() and text[3:5].replace('O', '0').isdigit() and \
                            text[6:].isdigit() and text[5] == ',':
                        score -= 1
                        txt_date[0] = text[0:3]
                        txt_date.append(text[3:5].replace('O', '0'))
                        txt_date.append(text[6:])
                    else:
                        return None
                else:
                    return None

        elif len(txt_date) == 2:
            # Only one separator found: one of the two parts holds two fields.
            score -= 1
            temp1 = txt_date[0]
            temp2 = txt_date[1]
            if txt_date[0].isdigit():
                if len(temp2) <= 4:
                    # Second part is the year; split the first part into two fields.
                    if len(temp1) == 4:  # 0722 => 07, 22
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[2:]
                    elif len(temp1) == 3:  # 072 => 07, 2
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[2]
                    elif len(temp1) == 2:  # 72 => 7, 2
                        txt_date[0] = temp1[0]
                        txt_date[1] = temp1[1]
                    elif len(temp1) == 5 and temp1[2] == '1':  # 07l22 => 07, 22
                        txt_date[0] = temp1[0:2]
                        txt_date[1] = temp1[3:]
                    else:
                        return None
                    txt_date.append(temp2)
                else:
                    txt_date[0] = temp1
                    if len(temp2) == 6:  # '11 212017' => '11,21,2017'
                        txt_date[1] = temp2[0:2]
                        txt_date.append(temp2[2:])
                    else:
                        return None
            elif txt_date[0] != '' and txt_date[0][0].isalpha() and txt_date[0][-1].isdigit():  # 'May04'
                # Split the first part at its first digit into month name + day.
                for i in range(len(temp1)):
                    if temp1[i].isdigit():
                        t1 = temp1[:i].lower()
                        t2 = temp1[i:]
                        if (month_list_full.__contains__(t1) or month_list.__contains__(t1)) and t2.isdigit():
                            score += 1
                            txt_date[0] = t1
                            txt_date[1] = t2
                            txt_date.append(temp2)
                            break
                        else:
                            return None
                if len(txt_date) == 2:
                    return None
            else:
                return None

        elif len(txt_date) != 3:
            return None

        # Keep only what follows the last ':' in the first part.
        txt_date[0] = txt_date[0].split(':')[-1]

        # ----------- collecting digit from mix of digit and string -----------
        txt_date[0] = self.get_pre_digit(txt_date[0], reverse=True)
        txt_date[2] = self.get_pre_digit(txt_date[2])
        # Strip ordinal suffixes from the middle part.
        if txt_date[1] == '1st':
            txt_date[1] = '1'
        elif txt_date[1] == '2nd':
            txt_date[1] = '2'
        elif txt_date[1] == '3rd':
            txt_date[1] = '3'
        elif txt_date[1][:-2].isdigit() and txt_date[1][-2:] == 'th':
            txt_date[1] = txt_date[1][:-2]

        # ----------------------------- extract the date --------------------------
        data_month = 0
        data_day = 0

        if len(txt_date[0]) == 4 and len(txt_date[1]) == 2 and len(txt_date[2]) == 2:
            # case of yyyy/mm/dd
            if txt_date[0].isdigit() and txt_date[1].isdigit() and txt_date[2].isdigit():
                data_year = int(txt_date[0])
                data_month = int(txt_date[1])
                data_day = int(txt_date[2])
                if not (1900 < data_year < 2100 and 1 <= data_month <= 12 and 1 <= data_day <= 31):
                    return None
            else:
                return None

        else:
            # -------------------------- Checking of year -------------------------
            if txt_date[2].isdigit():
                if 1900 <= int(txt_date[2]) <= 2100:
                    data_year = int(txt_date[2])
                elif len(txt_date[2]) == 2 and int(txt_date[2]) < 50:
                    # Two-digit years below 50 are read as 20xx.
                    data_year = int(txt_date[2]) + 2000
                    score -= 1
                else:
                    return None
            elif txt_date[2][:-1].isdigit():
                # Same checks with one trailing non-digit character ignored.
                score -= 1
                if 1900 <= int(txt_date[2][:-1]) <= 2100:
                    data_year = int(txt_date[2][:-1])
                elif len(txt_date[2][:-1]) == 2 and int(txt_date[2][:-1]) < 50:
                    data_year = int(txt_date[2][:-1]) + 2000
                    score -= 1
                else:
                    return None
            else:
                return None

            find_md = True
            if txt_date[1].isdigit():  # case of m/d/y
                # -------------------------- Checking of day -------------------------
                if 0 < int(txt_date[1]) <= 31:
                    data_day = int(txt_date[1])
                else:
                    find_md = False

                # -------------------------- Checking of month -------------------------
                if txt_date[0].isdigit():
                    if 0 < int(txt_date[0]) <= 12 and find_md:
                        # Both numbers could be a month: INDIA/NL read them day-first.
                        if data_day < 13 and (country == 'INDIA' or country == 'NL'):
                            data_month = data_day
                            data_day = int(txt_date[0])
                        else:
                            data_month = int(txt_date[0])
                    else:
                        find_md = False
                elif month_list.__contains__(txt_date[0].lower()):
                    data_month = month_list.index(txt_date[0].lower()) + 1
                    score += 1
                elif month_list_full.__contains__(txt_date[0].lower()):
                    data_month = month_list_full.index(txt_date[0].lower()) + 1
                    score += 1
                else:
                    find_md = False
            else:
                find_md = False

            if not find_md:  # case of d/m/y
                if txt_date[0].isdigit():
                    # -------------------------- Checking of day -------------------------
                    if 0 < int(txt_date[0]) <= 31:
                        data_day = int(txt_date[0])
                    else:
                        return None

                    # -------------------------- Checking of month -------------------------
                    if txt_date[1].isdigit() and 0 < int(txt_date[1]) <= 12:
                        data_month = int(txt_date[1])
                    elif month_list.__contains__(txt_date[1].lower()):
                        data_month = month_list.index(txt_date[1].lower()) + 1
                    elif month_list_full.__contains__(txt_date[1].lower()):
                        data_month = month_list_full.index(
                            txt_date[1].lower()) + 1
                    else:
                        return None
                else:
                    return None

        # NL output is YYYY-MM-DD; every other country gets MM/DD/YYYY.
        if country == 'NL':
            ret = '%04d-%02d-%02d' % (data_year, data_month, data_day)
        else:
            ret = '%02d/%02d/%04d' % (data_month, data_day, data_year)

        return [ret, score]

    def extract(self, text, country='US'):
        """Return the best-scoring date strings found in ``text``.

        Single words and joins of two or three neighbouring words are passed
        to :meth:`is_date`.  Only candidates that share the highest score, and
        only when that score is above 7, are returned.

        :param text: multi-line OCR text (apostrophes are treated as spaces).
        :param country: forwarded to :meth:`is_date`.
        :return: list of date strings, possibly empty.
        """
        text_line_list = text.replace("'", ' ').splitlines()
        result_value = []
        result_score = []

        for comb_lines in range(2):
            # -------------- First check one lines, and if not detect, check combination of 2 lines ------------
            for line_ind in range(len(text_line_list) - comb_lines):
                if comb_lines == 0:
                    text_list = text_line_list[line_ind].split()
                elif comb_lines == 1:
                    text_list = text_line_list[line_ind].split() + text_line_list[line_ind + 1].split()
                else:
                    text_list = []

                # ---------------- for single word --------------------
                for text_item in text_list:
                    ret = self.is_date(text_item, country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ---------------- for combination of 2 words('04/20/ 2017') --------------------
                for i in range(len(text_list) - 1):
                    ret = self.is_date(text_list[i] + ' ' + text_list[i + 1], country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ---------------- for combination of 2 words('Sep 1,2017') --------------------
                for i in range(len(text_list) - 1):
                    ret = self.is_date(
                        text_list[i] + '/' + text_list[i + 1].replace(',', '/'), country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

                # ----------- for combination of 3 words ---------------
                for i in range(len(text_list) - 2):
                    new_text = self.class_func.text_clean(text_list[i]) + '/' + \
                               self.class_func.text_clean(text_list[i+1]) + '/' + \
                               self.class_func.text_clean(text_list[i+2])
                    ret = self.is_date(new_text, country)
                    if ret is not None:
                        result_value.append(ret[0])
                        result_score.append(ret[1])

            # NOTE(review): placed inside the pass loop so that the two-line pass
            # only runs when the single-line pass produced no candidate, as the
            # comment at the top of the loop describes -- confirm against the
            # original layout.
            if len(result_value) > 0:
                max_score = max(result_score)
                ret_date = []
                for i in range(len(result_score)):
                    if result_score[i] == max_score > 7:
                        ret_date.append(result_value[i])

                # error correction
                if len(ret_date) == 2:  # ['6/13/2018', '6/13/2013']
                    # Two results differing only in a final 8 vs 3: keep the one ending in 8.
                    if ret_date[0][:-1] == ret_date[1][:-1]:
                        if ret_date[0][-1] == '8' and ret_date[1][-1] == '3':
                            ret_date = [ret_date[0][:-1] + '8']

                return ret_date

        return []
def __init__(self):
    """Create the field extractor and helper, then load the amount key and no-key lists."""
    self.class_field_extractor = FieldExtractor()
    self.class_func = FuncMl()
    # load_receipt_amt_keys() returns the pair (key list, no-key list).
    key_list, no_key_list = self.class_func.load_receipt_amt_keys()
    self.amount_key_list = key_list
    self.amount_no_key_list = no_key_list
class ReceiptAmountExtractor:
    """Find amount keywords on a receipt and the value printed after each of them."""

    def __init__(self):
        """Create the field extractor and helper, then load the amount key and no-key lists."""
        self.class_field_extractor = FieldExtractor()
        self.class_func = FuncMl()
        self.amount_key_list, self.amount_no_key_list = self.class_func.load_receipt_amt_keys()

    def extract_receipt_amount_keys(self, filename, ocr_json=None):
        """Detect amount keys (1-3 word phrases) and the value to the right of each.

        :param filename: image path; used to run OCR when ``ocr_json`` is None
            and forwarded to ``FieldExtractor.extract_v2``.
        :param ocr_json: optional pre-computed list of OCR word entries.
        :return: ``(dict_val, feature_list)``.  ``dict_val`` maps each key to
            its value, preferring the occurrence with the fewest surrounding
            words; ``feature_list`` holds ``[key, [cnt_left, cnt_between,
            cnt_right], relative_position]`` per detection.  ``([], [])`` is
            returned when no OCR data is available.
        """
        # ---------------------------- Get OCR json -------------------------------
        if ocr_json is None:
            ocr_json = self.class_func.get_json_google_from_jpg(filename)

        if ocr_json is None:
            return [], []

        detect_keys = []
        # Countdown used to skip the remaining words of a phrase already handled.
        key_words = 0
        for i in range(1, len(ocr_json) - 2):
            # Words that are empty or start with a lower-case letter are not considered as keys.
            if ocr_json[i]['description'] == '' or ocr_json[i]['description'][0].islower():
                continue

            # ------------- Get Candidate of amount keys and it's position ------------
            text1 = ocr_json[i]['description'].lower()
            text2 = text1 + ' ' + ocr_json[i + 1]['description'].lower()
            text3 = text2 + ' ' + ocr_json[i + 2]['description'].lower()
            text_pos1 = self.class_func.get_rect_ocr_data(ocr_json, i)
            text_pos2 = self.class_func.get_rect_ocr_data(ocr_json, i + 1)
            text_pos3 = self.class_func.get_rect_ocr_data(ocr_json, i + 2)

            if key_words > 1:
                key_words -= 1
                continue

            # Phrases on the no-key list are skipped as a whole.
            if text2 in self.amount_no_key_list:
                key_words = 2
                continue
            elif text3 in self.amount_no_key_list:
                key_words = 3
                continue

            # Longest phrase first; multi-word keys must sit on one line.
            if text3 in self.amount_key_list and self.class_func.check_same_line(text_pos1, text_pos2) and \
                    self.class_func.check_same_line(text_pos2, text_pos3):
                amount_key = text3
                key_words = 3
            elif text2 in self.amount_key_list and self.class_func.check_same_line(text_pos1, text_pos2):
                amount_key = text2
                key_words = 2
            elif text1 in self.amount_key_list:
                amount_key = text1
                key_words = 1
            else:
                continue

            # ------------------------ Get value of candidates ------------------------
            profile_hash = {u'width': 8.5, u'height': 11}
            # Vertical window of +/-200 around the key's first word.
            key_y1 = text_pos1[1] - 200
            key_y2 = text_pos1[3] + 200
            hint_hash = {
                u'field_id': 2,
                u'data_type': u'currency',
                u'hints': [[[
                    u'coordinates', {
                        u'x_1': 0.0,
                        u'x_2': 8.5,
                        u'y_1': key_y1,
                        u'y_2': key_y2
                    }
                ], [
                    u'proximity', {
                        u'text': amount_key,
                        u'type': u'same_line_prefix'
                    }
                ]]]
            }

            ret_val = self.class_field_extractor.extract_v2(
                filename, profile_hash, hint_hash, ocr_json_data=ocr_json)
            amount_value = ret_val[0]['value']

            if amount_value is None:
                continue
            elif amount_value == 0 and amount_key == 'total':
                continue

            # ------------- Get number of words on left and right of value ------------
            key_x1 = text_pos1[0]
            if key_words == 3:
                key_x2 = text_pos3[2]
            elif key_words == 2:
                key_x2 = text_pos2[2]
            else:
                key_x2 = text_pos1[2]

            # ----- find all same line words ------
            center_line = int((text_pos1[1] + text_pos1[3]) / 2)
            line_word_text = []
            line_word_pos = []
            for j in range(1, len(ocr_json) - 2):
                word_pos = self.class_func.get_rect_ocr_data(ocr_json, j)
                # A word is on the key's line when it spans the key's vertical centre.
                if word_pos[1] < center_line < word_pos[3]:
                    word_text = ocr_json[j]['description'].lower()
                    line_word_text.append(word_text)
                    line_word_pos.append(word_pos)

            # ----- get x position of value word -----
            value_x1 = 0
            value_x2 = 0
            for j in range(len(line_word_text)):
                if line_word_pos[j][0] < key_x2:
                    continue

                if line_word_text[j].isdigit() and int(line_word_text[j]) == int(amount_value):
                    value_x1 = line_word_pos[j][0]
                    value_x2 = line_word_pos[j][2]
                    # Value split as "<int> . <frac>": extend to the fraction word.
                    if j < len(line_word_pos) - 2 and line_word_text[j + 2].isdigit() and \
                            (line_word_text[j + 1] == '.' or line_word_text[j + 1] == ',') and \
                            float(line_word_text[j] + '.' + line_word_text[j + 2]) == float(amount_value):
                        value_x2 = line_word_pos[j + 2][2]
                    break

            # ----- count the words -----
            # cnt1: left of the key, cnt2: between key and value, cnt3: right of the value.
            cnt1 = 0
            cnt2 = 0
            cnt3 = 0
            for j in range(len(line_word_text)):
                if not (line_word_text[j].isdigit() or line_word_text[j].isalpha()):
                    continue

                if line_word_pos[j][2] < key_x1:
                    cnt1 += 1
                elif key_x2 < line_word_pos[j][0] < value_x1:
                    cnt2 += 1
                elif value_x2 < line_word_pos[j][0]:
                    cnt3 += 1

            detect_keys.append([
                amount_key, amount_value, text_pos1, [cnt1, cnt2, cnt3],
                [0, 0, 0, 0]
            ])

        # ----------------------- Get relative position of keys -----------------------
        # Slots 0/1 link neighbours on the same line, slots 2/3 neighbours on
        # different lines; stored values are 1-based detection numbers.
        for i in range(len(detect_keys) - 1):
            if self.class_func.check_same_line(detect_keys[i][2], detect_keys[i + 1][2]):
                detect_keys[i][4][0] = i + 2
                detect_keys[i + 1][4][1] = i + 1
            else:
                detect_keys[i][4][2] = i + 2
                detect_keys[i + 1][4][3] = i + 1

        # ------------------------ Create key-value dictionary ------------------------
        dict_val = {}
        # Word-count sum of the occurrence currently stored in dict_val for each key.
        dict_temp = {}
        for i in range(len(detect_keys)):
            key_name = detect_keys[i][0]
            if key_name in dict_val:
                if dict_temp[key_name] > sum(detect_keys[i][3]):
                    dict_val[key_name] = detect_keys[i][1]
                    dict_temp[key_name] = sum(detect_keys[i][3])
            else:
                dict_val[key_name] = detect_keys[i][1]
                dict_temp[key_name] = sum(detect_keys[i][3])

        # ---------------------------- Create feature list ----------------------------
        feature_list = []
        for i in range(len(detect_keys)):
            feature_list.append(
                [detect_keys[i][0], detect_keys[i][3], detect_keys[i][4]])

        return dict_val, feature_list

    def extract(self, filename, ocr_json=None):
        """Return the value of the first key in ``amount_key_list`` order that was detected, else ``None``."""
        key_val_list, key_data = self.extract_receipt_amount_keys(
            filename, ocr_json)
        if not key_data:
            return None

        for amount_key in self.amount_key_list:
            if amount_key in key_val_list:
                return key_val_list[amount_key]

        return None
class ReceiptMerchantExtractor:
    """Find the merchant of a receipt from its OCR data and Google Places.

    The merchant name and address are first guessed from the OCR text, then
    resolved through Google Places text-search, place-details and nearby-search
    requests (see ``extract``).
    """

    def __init__(self):
        self.google_ocr = GoogleOCR()
        self.class_func = FuncMl()
        self.class_date_extractor = DateExtractor()
        self.class_address_extractor = ReceiptAddressExtractorML()

    @staticmethod
    def __mark_merchant_line(text_lines, line_rect_list):
        """Return the indices of ``text_lines`` that look like the merchant name.

        Args:
            text_lines: candidate text lines; entries are normalised in place
                ('Welcome to' removed, surrounding '-' stripped).
            line_rect_list: dict with parallel ``'text'`` and ``'rect'`` lists;
                ``'rect'[0]`` is used as the page rectangle.

        Returns:
            List of consecutive indices into ``text_lines``.
        """
        merchant_rect = None
        name_line_list = []
        page_rect = line_rect_list['rect'][0]
        rect_texts = line_rect_list['text']
        list_no_name = ['welcome', 'thank you', 'customer', 'copy', 'only', '*',
                        'ticket', '(', ')', ':', 'invoice', '!', 'more',
                        'congratulation', 'bill']

        for i, raw_text in enumerate(text_lines):
            # Locate the rectangle of this line: the first later entry with the
            # same text, or the last entry when nothing matches.
            rect_idx = None
            for j in range(i + 1, len(rect_texts)):
                rect_idx = j
                if raw_text == rect_texts[j]:
                    break
            if rect_idx is None:
                # No rectangle is available for this line, so its position
                # cannot be validated (previously a stale/unbound index was used).
                continue
            line_rect = line_rect_list['rect'][rect_idx]

            # pre-processing of text line
            text = raw_text.replace('Welcome to', '').strip('-')
            text_lines[i] = text

            # skip lines containing a word that never belongs to a merchant name
            lowered = text.lower()
            if any(word in lowered for word in list_no_name):
                continue

            # check validation of key
            if len(text) <= 2:
                continue
            elif name_line_list and name_line_list[-1] + 1 != i:
                # merchant lines must be consecutive
                break
            elif name_line_list and text_lines[name_line_list[-1]] in text:
                continue
            elif len(name_line_list) > 2:
                continue
            elif len(name_line_list) > 1 and not text.isupper():
                continue
            elif text[0] == '#':
                continue
            elif len(CommonRegex(text).dates) > 0:
                continue
            elif len(CommonRegex(text).phones) > 0:
                continue
            elif len(CommonRegex(text).links) > 0:
                continue
            elif len(text.replace('@', '').replace('&', '').split()) > 5:
                continue
            elif len(text.split()) > 3 and '-' in text:
                continue
            elif text.replace('-', '').replace(' ', '').isdigit():  # '305337 - 1'
                continue
            elif name_line_list and \
                    line_rect[1] > 2 * merchant_rect[3] - merchant_rect[1]:
                # too far below the previously accepted merchant line
                continue
            elif (line_rect[0] + line_rect[2]) > (page_rect[0] + page_rect[2]) * 1.3:
                # check the position
                continue

            name_line_list.append(i)
            merchant_rect = line_rect

        return name_line_list

    def get_address_string(self, ocr_json):
        """Return ``[merchant_name, address]`` guessed from the OCR data.

        The merchant name is built from the lines above the first address line
        (or the first five lines when the address extractor reports ``-1``).
        """
        # --------------------------- Extract Address -------------------------
        ret_address, first_address_line = self.class_address_extractor.extractor(ocr_json)

        # ------------------------ Remove vertical text -----------------------
        line_data = self.class_func.get_line_rect(ocr_json)
        new_ocr_text = self.class_func.remove_vertical_text(ocr_json)
        text_lines = new_ocr_text.splitlines()

        # --------------------------- Extract Merchant ------------------------
        range_merchant = 5 if first_address_line == -1 else first_address_line
        name_line_list = self.__mark_merchant_line(text_lines[:range_merchant], line_data)

        ret_name = ''.join(text_lines[idx] + ' ' for idx in name_line_list)

        if '#' in ret_name and len(ret_name.split()) > 2:
            ret_name = ret_name[:ret_name.find('#')]

        ret_name = ret_name.strip('.').strip().replace('&', '')
        # Collapse whitespace runs; the former replace(' ', ' ') was a no-op.
        ret_name = ' '.join(ret_name.split())

        if len(ret_name) > 1 and ret_name[0].islower() and ret_name[1:].isupper():
            ret_name = ret_name.upper()

        return [ret_name, ret_address]

    def get_request_google(self, key_string):
        """Return the raw body of a Google Places text search for ``key_string``.

        Returns ``None`` when ``key_string`` is ``None``. The query is passed
        through ``params`` so that characters such as '&' or '#' are URL-encoded.
        """
        if key_string is None:
            return None

        response = requests.post(
            url='https://maps.googleapis.com/maps/api/place/textsearch/json',
            params={'key': self.class_func.google_key, 'query': key_string},
            headers={'Content-Type': 'application/json'})
        return response.text

    def get_request_google_nearby(self, position):
        """Return the raw body of a Google Places nearby search.

        Args:
            position: ``[lat, lng]`` pair; the search radius is fixed to 500.
        """
        response = requests.post(
            url='https://maps.googleapis.com/maps/api/place/nearbysearch/json',
            params={'location': str(position[0]) + ',' + str(position[1]),
                    'key': self.class_func.google_key,
                    'radius': '500'},
            headers={'Content-Type': 'application/json'})
        return response.text

    def get_request_google_place_detail(self, place_id):
        """Return the raw body of a Google Places details request for ``place_id``."""
        response = requests.post(
            url='https://maps.googleapis.com/maps/api/place/details/json',
            params={'placeid': place_id, 'key': self.class_func.google_key},
            headers={'Content-Type': 'application/json'})
        return response.text

    @staticmethod
    def _ok_results(response_text):
        """Return the 'results' list of a Places response, or [] unless status is 'OK'."""
        if response_text is None:
            return []
        payload = json.loads(response_text)
        if payload.get('status') != 'OK':
            return []
        return payload.get('results') or []

    @staticmethod
    def _place_summary(place):
        """Return ``[formatted_address, icon, name, types, place_id]`` of a place result."""
        return [place['formatted_address'], place['icon'], place['name'],
                place['types'], place['place_id']]

    def extract(self, filename=None, ocr_json=None):
        """Resolve the merchant of a receipt.

        Args:
            filename: image file; used for OCR when ``ocr_json`` is ``None`` and
                for logo detection.
            ocr_json: optional pre-computed OCR data.

        Returns:
            A five element list ``[address, icon, name, types, place_id]``;
            elements that could not be resolved are empty strings.
        """
        if ocr_json is None:
            ocr_json = self.google_ocr.get_json_google(filename)

        # ------------ Get merchant and address from text, and logo -----------
        ret_logo_key_json = self.google_ocr.get_json_google(filename, detection_type='logo')
        # Truthiness check: an empty detection list means "no logo" as well.
        ret_logo_key = ret_logo_key_json[0]['description'] if ret_logo_key_json else ''

        ret_address = self.get_address_string(ocr_json)
        if ret_address is None:
            return ['', '', ret_logo_key, '', '']
        ret_name, ret_address = ret_address

        # --------- extract the merchant info using title and address ---------
        merchant_results = self._ok_results(
            self.get_request_google(ret_name + ' ' + ret_address))
        if merchant_results:
            return self._place_summary(merchant_results[0])

        # -------- extract the merchant info using logo + address -------------
        address_results = self._ok_results(self.get_request_google(ret_address))
        if address_results and len(address_results[0]['types']) > 1:  # except type=['street']
            return self._place_summary(address_results[0])

        if ret_logo_key != '':
            logo_results = self._ok_results(
                self.get_request_google(ret_logo_key + ' ' + ret_address))
            if logo_results:
                return self._place_summary(logo_results[0])

        # --------- extract the merchant info using only address --------------
        if ret_logo_key == '':
            ret_logo_key = ret_name

        # get position list of address
        list_location = [place['geometry']['location'] for place in address_results]
        list_place_id = [place['place_id'] for place in address_results]
        if not list_location:
            return [ret_address, '', ret_logo_key, '', '']

        # get place details using place id
        max_score = 0
        for place_id in list_place_id:
            ret_place_json = json.loads(self.get_request_google_place_detail(place_id))
            # A failed details lookup carries no 'result'; skip it instead of raising.
            ret_place_address = (ret_place_json.get('result') or {}).get('formatted_address')
            if ret_place_address is None:
                continue
            score = fuzz.ratio(ret_address.lower(), ret_place_address.lower())
            max_score = max(max_score, score)

        if len(list_place_id) > 0 and max_score < 70:
            return ['', '', ret_logo_key, '', '']

        # get all buildings around position
        list_building_info = []
        for location in list_location:
            ret_buildings = self.get_request_google_nearby(
                [location['lat'], location['lng']])
            list_building_info.extend(self._ok_results(ret_buildings))

        # get best match result
        if not list_building_info:
            return [ret_address, '', ret_logo_key, '', '']

        max_score = 0
        max_building = None
        for building in list_building_info:
            score = fuzz.ratio(ret_name.lower(), building['name'].lower())
            if score >= max_score:
                max_score = score
                max_building = building

        if max_score > 50:
            return [max_building['vicinity'], max_building['icon'], max_building['name'],
                    max_building['types'], max_building['place_id']]
        return [ret_address, '', ret_logo_key, '', '']
class ReceiptExtractor:
    """Extract merchant, amount, tax, date and card fields from a receipt."""

    def __init__(self):
        self.class_func = FuncMl()
        self.class_receipt_merchant = ReceiptMerchantExtractor()
        self.class_receipt_amount = ReceiptAmountExtractor()
        self.class_field_extractor = FieldExtractor()
        self.card_type = self.class_func.load_json(
            os.path.join(self.class_func.my_dir, 'config', 'card_type.json'))
        # Result of the last merchant lookup, reused while the OCR data is unchanged.
        self.prev_ocr = None
        self.prev_name = None
        self.prev_type = None
        self.prev_address = None
        self.prev_remote_id = None

    # ------------------------------ helpers ------------------------------
    def _resolve_ocr(self, img_list, json_list, ocr_json):
        """Return ``ocr_json`` if given, else merge ``json_list`` or OCR the first image."""
        if ocr_json is not None:
            return ocr_json
        if json_list and json_list[0] is not None:
            return self.class_func.merge_ocr_json(json_list)
        return self.class_func.get_json_google_from_jpg(img_list[0])

    def _remove_temp_files(self, temp_list):
        """Delete every temporary file listed in ``temp_list``."""
        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

    def _merchant_field(self, filename, ocr_json, ret_type):
        """Load the OCR data for ``filename`` and return one merchant attribute."""
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
            return self.merchant_extract(img_list[0], ocr_json, ret_type)
        finally:
            # Clean up even when extraction raises, so temp files never leak.
            self._remove_temp_files(temp_list)

    # ------------------------------ merchant -----------------------------
    def merchant_extract(self, img_file, ocr_json, ret_type):
        """Return one attribute of the receipt's merchant.

        The merchant lookup is only performed when ``ocr_json`` differs from the
        one of the previous call; otherwise the cached values are returned.

        Args:
            img_file: image file passed to the merchant extractor.
            ocr_json: OCR data of the receipt.
            ret_type: 'address', 'name', 'type_list' or 'remote_id'.

        Returns:
            The requested attribute, or ``None`` for an unknown ``ret_type``.
            Names longer than 50 characters are reported as ``None``.
        """
        if self.prev_ocr != ocr_json:
            ret_merchant = self.class_receipt_merchant.extract(
                filename=img_file, ocr_json=ocr_json)
            ret_address, _, ret_name, ret_type_list, ret_remote_id = ret_merchant
            if len(ret_name) > 50:
                ret_name = None

            self.prev_ocr = ocr_json
            self.prev_address = ret_address
            self.prev_remote_id = ret_remote_id
            self.prev_name = ret_name
            self.prev_type = ret_type_list

        cached = {
            'address': self.prev_address,
            'name': self.prev_name,
            'type_list': self.prev_type,
            'remote_id': self.prev_remote_id,
        }
        return cached.get(ret_type)

    def extract_address(self, filename, ocr_json=None):
        """Return the merchant address of the receipt in ``filename``."""
        return self._merchant_field(filename, ocr_json, 'address')

    def extract_remote_id(self, filename, ocr_json=None):
        """Return the remote (place) id of the merchant of the receipt."""
        return self._merchant_field(filename, ocr_json, 'remote_id')

    def extract_remote_categories(self, filename, ocr_json=None):
        """Return the remote type list of the merchant of the receipt."""
        return self._merchant_field(filename, ocr_json, 'type_list')

    # -------------------------------- card --------------------------------
    def get_card_info(self, ocr_json):
        """Find card brand names in the OCR words.

        Returns:
            ``(card_type, card_pos_list)``: the last matched key of
            ``self.card_type`` (or ``None``) and, for every match, the index of
            the word following the brand name. That index may equal
            ``len(ocr_json)`` when a two-word brand name ends the OCR data.
        """
        card_type = None
        card_pos_list = []
        if not ocr_json:
            return card_type, card_pos_list

        for i in range(1, len(ocr_json) - 1):
            text1 = ocr_json[i]['description']
            text2 = text1 + ' ' + ocr_json[i + 1]['description']
            for card in self.card_type:
                if text1.upper() in self.card_type[card]:
                    card_pos_list.append(i + 1)
                    card_type = card
                    break
                elif text2.upper() in self.card_type[card]:
                    card_pos_list.append(i + 2)
                    card_type = card
                    break

        return card_type, card_pos_list

    def extract_card_type(self, filename, ocr_json=None):
        """Return the card brand found on the receipt, or ``None``."""
        # ------------------ Get ocr data and remove temp file ----------------
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
        finally:
            self._remove_temp_files(temp_list)

        # ------------------------Extract the card type ------------------------
        card_type, _ = self.get_card_info(ocr_json)
        return card_type

    def extract_card_number(self, filename, ocr_json=None):
        """Return the last four digits of the card number, or ``None``.

        Recognised layouts: a masked word ending in four digits, ten '*' words
        or three 'XXXX' words followed by four digits, 'ending in 1234', and
        four digits right after a card brand name on the same line.
        """
        # ------------------ Get ocr data and remove temp file ----------------
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
        finally:
            self._remove_temp_files(temp_list)

        if not ocr_json:
            return None

        # --------------------- Extract the Card Number -----------------------
        for i in range(1, len(ocr_json)):
            text = ocr_json[i]['description']

            if len(text) > 11 and text[-4:].isdigit():
                text_front = ''.join(set(text[1:-5]))
                if text_front in ('X', 'x', '*'):
                    return text[-4:]

            if len(text) != 4 or not text.isdigit():
                continue

            if i > 10:  # '* * * * * 4075'
                if all(ocr_json[j]['description'] == '*' for j in range(i - 10, i)):
                    return text

            if i > 3:  # 'XXXX XXXX XXXX 4075'
                if all(ocr_json[j]['description'] == 'XXXX' for j in range(i - 3, i)):
                    return text

            if i > 3:  # 'ending in 4075'
                # Both words are compared case-insensitively.
                if ocr_json[i - 2]['description'].lower() == 'ending' and \
                        ocr_json[i - 1]['description'].lower() == 'in':
                    return text

        # ---------------- Special case - 'Visa 2345 (Swipe)'------------------
        card_type, card_pos_list = self.get_card_info(ocr_json)
        if card_type is None:
            return None

        for card_pos in card_pos_list:
            # The brand name may be the last OCR word: nothing follows it then.
            if card_pos >= len(ocr_json):
                continue
            candidate = ocr_json[card_pos]['description']
            # Check text for card number
            if len(candidate) == 4 and candidate.isdigit():
                # Check region
                rect1 = self.class_func.get_rect_ocr_data(ocr_json, card_pos - 1)
                rect2 = self.class_func.get_rect_ocr_data(ocr_json, card_pos)
                if rect1[0] + rect2[0] < 2 * rect1[2] and \
                        abs(rect1[1] - rect2[1]) < int((rect1[3] - rect1[1]) / 2):
                    return candidate

        return None

    # ------------------------------- fields -------------------------------
    def extract_field(self, filename, field_id, ocr_json=None):
        """Extract a single receipt field.

        Args:
            filename: receipt file.
            field_id: one of the ``constant.RECEIPT_*_ID`` values; any other id
                yields a ``None`` value.
            ocr_json: optional pre-computed OCR data.

        Returns:
            ``{"field_id": field_id, "value": value}``.
        """
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)

            ret = None
            if field_id == constant.RECEIPT_MERCHANT_ID:
                ret = self.merchant_extract(img_list[0], ocr_json, 'name')

            elif field_id == constant.RECEIPT_AMOUNT_ID:
                ret = self.class_receipt_amount.extract(img_list[0], ocr_json)

            elif field_id == constant.RECEIPT_TAX_ID:
                hint_list = [
                    [["proximity", {"type": "same_line_prefix", "text": "SALES TAX"}]],
                    [["proximity", {"type": "same_line_suffix", "text": "TAX CA"}]],
                    [["proximity", {"type": "same_line_prefix", "text": "TAX DUE"}]],
                    [["proximity", {"type": "same_line_prefix", "text": "Tax"}]],
                ]
                profile_hash = {u'width': 8.5, u'height': 11}
                field_hint = {"data_type": 'currency', "hints": hint_list}
                ret = self.class_field_extractor.extract_v2(
                    img_list[0], profile_hash, field_hint,
                    en_fuzzy=False, ocr_json_data=ocr_json)
                ret = ret[0]['value']

            elif field_id == constant.RECEIPT_DATE_ID:
                hint_list = [
                    [["proximity", {"type": "same_line_prefix", "text": "Credit Purchase"}]],
                    [],
                ]
                profile_hash = {u'width': 8.5, u'height': 11}
                field_hint = {"data_type": 'date', "hints": hint_list}
                ret = self.class_field_extractor.extract_v2(
                    img_list[0], profile_hash, field_hint,
                    en_fuzzy=False, ocr_json_data=ocr_json, select_first=True)
                ret = ret[0]['value']
        finally:
            self._remove_temp_files(temp_list)

        return {"field_id": field_id, "value": ret}

    def extract(self, filename, ocr_json=None):
        """Return the ``extract_field`` results for field ids 1 to 6, in order."""
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
            # NOTE(review): the whole img_list (not img_list[0]) is forwarded as
            # the file argument, as before - confirm get_img_list accepts a list.
            return [self.extract_field(img_list, field_id, ocr_json)
                    for field_id in range(1, 7)]
        finally:
            self._remove_temp_files(temp_list)

    def extract_all_info(self, filename):
        """Return a dict with the 'date', 'amount', 'merchant' and 'address' of a receipt."""
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, None)

            ret_fields = self.extract(img_list[0], ocr_json)
            ret_address = self.extract_address(img_list[0], ocr_json)
            # Still computed although currently excluded from the result below.
            ret_card_type = self.extract_card_type(img_list[0], ocr_json)
            ret_card_number = self.extract_card_number(img_list[0], ocr_json)
            ret_remote_id = self.extract_remote_id(img_list[0], ocr_json)
            ret_remote_categories = self.extract_remote_categories(
                img_list[0], ocr_json)
        finally:
            self._remove_temp_files(temp_list)

        ret = {
            'date': ret_fields[0]['value'],
            'amount': ret_fields[1]['value'],
            'merchant': ret_fields[4]['value'],
            # 'tax': ret_fields[5]['value'],
            'address': ret_address,
            # 'card_type': ret_card_type,
            # 'card_number': ret_card_number,
            # 'remote_id': ret_remote_id,
            # 'remote_categories': ret_remote_categories
        }
        return ret
def __init__(self):
    """Create the GoogleOCR, FuncMl, DateExtractor and ReceiptAddressExtractorML
    helpers and store them on the instance.

    NOTE(review): this definition sits outside any class body and repeats the
    constructor of ReceiptMerchantExtractor - presumably a leftover; confirm
    whether it is still needed.
    """
    self.google_ocr = GoogleOCR()
    self.class_func = FuncMl()
    self.class_date_extractor = DateExtractor()
    self.class_address_extractor = ReceiptAddressExtractorML()
class VendorExtractor:
    """Extract profile-driven fields from vendor documents and passport MRZ lines."""

    def __init__(self, mode='vendor', country='US'):
        """Load the profiles for ``mode`` and select the one used by ``extract``.

        ``self.vendor_profile`` is only set for the 'vendor' and 'passport' modes.
        """
        self.class_func = FuncMl()
        self.class_field_extractor = FieldExtractor()
        self.class_text_extractor = TextExtractor()
        self.country = country
        self.mode = mode
        self.vendor_profile_list = self.class_func.load_vendor_profile(mode)
        if mode == 'vendor':
            self.vendor_profile = self.vendor_profile_list['VENDOR1']
        elif mode == 'passport':
            self.vendor_profile = self.vendor_profile_list['PASSPORT1']

    def _resolve_ocr(self, img_list, json_list, ocr_json):
        """Return ``ocr_json`` if given, else merge ``json_list`` or OCR the first image."""
        if ocr_json is not None:
            return ocr_json
        if json_list and json_list[0] is not None:
            return self.class_func.merge_ocr_json(json_list)
        return self.class_func.get_json_google_from_jpg(img_list[0])

    def _remove_temp_files(self, temp_list):
        """Delete every temporary file listed in ``temp_list``."""
        for temp_file in temp_list:
            self.class_func.rm_file(temp_file)

    def extract(self, filename, field_id=None, ocr_json=None):
        """Extract the fields described by the active profile.

        Args:
            filename: document file.
            field_id: when given, only the profile field with this id is extracted.
            ocr_json: optional pre-computed OCR data.

        Returns:
            List of field dicts; empty when no OCR data is available. In
            'vendor' mode a 'Section' field is appended, listing the distinct
            six-digit prefixes of lines shaped like ``'123456:...'``.
        """
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
            if ocr_json is None:
                # Temp files are still removed by the finally clause below.
                return []

            ret_json = self.class_func.get_line_rect(ocr_json)
            ocr_text_lines, ocr_rect_lines = ret_json['text'], ret_json['rect']

            result = []
            for hint_hash in self.vendor_profile['fields']:
                if field_id is not None and hint_hash['field_id'] != field_id:
                    continue

                key_type = hint_hash['key_type'] if 'key_type' in hint_hash else ''
                ret_val_text = self.class_text_extractor.extract_v1(
                    ocr_text_lines, ocr_rect_lines, hint_hash["data_type"],
                    hint_hash["keys"], key_type)

                if ret_val_text is None:
                    # Text-based lookup found nothing: fall back to the field extractor.
                    ret_field = self.class_field_extractor.extract_v2(
                        img_list[0], self.vendor_profile, [hint_hash],
                        ocr_json_data=ocr_json, en_fuzzy=False,
                        country=self.country)[0]
                else:
                    ret_field = {
                        'field_id': hint_hash["field_id"],
                        'field_name': hint_hash['name'],
                        'value': ret_val_text
                    }
                result.append(ret_field)

            if self.mode == 'vendor':
                ret_section = []
                for text_line in ocr_text_lines:
                    section = text_line[:6]
                    if len(text_line) > 6 and section.isdigit() and \
                            text_line[6] == ':' and section not in ret_section:
                        ret_section.append(section)
                result.append({
                    'field_id': 0,
                    'field_name': 'Section',
                    'value': ret_section
                })

            return result
        finally:
            self._remove_temp_files(temp_list)

    def get_date_mrp(self, text, mode='birth'):
        """Convert a ``YYMMDD`` string into ``YYYY-MM-DD``.

        In 'expire' mode the century is always 20. In 'birth' mode it is 20
        when the two-digit year is below the current two-digit year, else 19.
        For any other mode the two-digit year is kept unchanged.
        """
        yy = text[:2]
        mm = text[2:4]
        dd = text[4:]
        if mode == 'expire':
            yy = '20' + yy
        elif mode == 'birth':
            cur_y = str(datetime.today().year)[2:]
            yy = ('20' if int(yy) < int(cur_y) else '19') + yy

        return yy + '-' + mm + '-' + dd

    def extract_MRP(self, filename, ocr_json=None, mode='passport'):
        """Parse the two 44-character machine readable lines of a document.

        Args:
            filename: document file.
            ocr_json: optional pre-computed OCR data.
            mode: in 'passport' mode one of the two lines must start with 'P'.

        Returns:
            List of ``{'field_name': ..., 'value': ...}`` dicts; empty when no
            valid pair of lines is found.
        """
        img_list, temp_list, json_list = self.class_func.get_img_list(filename)
        try:
            ocr_json = self._resolve_ocr(img_list, json_list, ocr_json)
        finally:
            self._remove_temp_files(temp_list)

        result = []
        if ocr_json is None:
            return result

        ocr_text_lines = self.class_func.get_line_rect(ocr_json)['text']

        # The machine readable zone is the first pair of consecutive lines that
        # both contain more than four '<' filler characters.
        line1 = ''
        line2 = ''
        for i in range(len(ocr_text_lines) - 1):
            if ocr_text_lines[i].count('<') > 4 and ocr_text_lines[i + 1].count('<') > 4:
                line1 = ocr_text_lines[i]
                line2 = ocr_text_lines[i + 1]
                break

        if line1 == '':
            return result

        line1 = line1.replace(' ', '')
        line2 = line2.replace(' ', '')

        if len(line1) != 44 or len(line2) != 44:
            return result

        if mode == 'passport':
            if line1[0] != 'P':
                if line2[0] == 'P':
                    line1, line2 = line2, line1
                else:
                    return result

        ret_type = line1[1]
        ret_country = line1[2:5]

        if ret_type != '<':
            result.append({'field_name': 'Type', 'value': ret_type})
        result.append({'field_name': 'Country', 'value': ret_country})

        # '<<' separates the surname from the given names; a single '<' stands
        # for a space inside either part. Splitting on '<<' keeps multi-part
        # given names intact (splitting on one space lost them).
        surname, _, given_names = line1[5:].strip('<').partition('<<')
        surname = surname.replace('<', ' ').strip()
        given_names = given_names.replace('<', ' ').strip()
        if given_names:
            result.append({'field_name': 'Surname', 'value': surname})
            result.append({'field_name': 'Given name', 'value': given_names})
        else:
            result.append({'field_name': 'Full Name', 'value': surname})

        ret_no = line2[:9].replace('<', '')
        ret_national = line2[10:13].replace('<', '')
        ret_date_birth = line2[13:19]
        ret_sex = line2[20]
        ret_date_expire = line2[21:27]
        ret_personal = line2[28:42].replace('<', '')

        result.append({'field_name': 'Passport Number', 'value': ret_no})
        if ret_national != '':
            result.append({'field_name': 'Nationality', 'value': ret_national})
        result.append({
            'field_name': 'Date of Birth',
            'value': self.get_date_mrp(ret_date_birth, 'birth')
        })
        result.append({
            'field_name': 'Date of Expire',
            'value': self.get_date_mrp(ret_date_expire, 'expire')
        })
        if ret_sex != '<':
            result.append({'field_name': 'Sex', 'value': ret_sex})
        result.append({'field_name': 'Personal Number', 'value': ret_personal})

        return result