def get_pt(all_words, height, width, words):
    """Collect small integers that sit next to payment-term keywords.

    A word becomes a candidate when its text is a whole number strictly
    between 0 and 33 and the neighbouring words returned by
    ``Neighbour.find_neighbour`` contain ``net`` or ``terms``, or both
    ``due`` and ``days`` (compared lower-cased with full stops removed).

    Args:
        all_words: Iterable of dicts with ``text``, ``left``, ``top``,
            ``width`` and ``height`` keys.
        height: Page height; 2% of it is used as the vertical search offset.
        width: Page width; 30% of it is used as the horizontal search offset.
        words: Word collection handed to ``Neighbour.find_neighbour``.

    Returns:
        A list of dicts with ``text``, ``x1``, ``y1``, ``x2`` and ``y2`` keys,
        one per accepted word.
    """
    pt = []
    for word in all_words:
        text = word['text']
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() cannot parse,
        # which used to raise ValueError.
        if not text.isdecimal():
            continue
        if not 0 < int(text) < 33:
            continue

        box = {
            'text': text,
            'x1': word['left'],
            'y1': word['top'],
            'x2': word['left'] + word['width'],
            'y2': word['top'] + word['height'],
        }
        # The lookup receives its own copy so the dict that ends up in the
        # result is never the object handed to find_neighbour.
        neighbours = Neighbour.find_neighbour(dict(box), words,
                                              int(width * 0.3),
                                              int(height * 0.02),
                                              width, height)
        neighbours_text = {
            it['text'].lower().replace('.', '') for it in neighbours
        }
        if ('net' in neighbours_text or 'terms' in neighbours_text
                or ('due' in neighbours_text and 'days' in neighbours_text)):
            pt.append(box)
    return pt
def __init__(self, split_name='train'):
    """Populate the dataset for ``split_name``, using a pickle cache.

    If ``cached_data_<split_name>.pickle`` exists under ``config.OUTPUT_DIR``
    the preprocessed arrays and vocabulary are read from it. Otherwise the
    annotations are parsed and preprocessed, and the result is written to
    that cache file.
    """
    cached_data_path = config.OUTPUT_DIR / f"cached_data_{split_name}.pickle"

    if not cached_data_path.exists():
        print("Preprocessed data not available")
        annotation, classes_count, class_mapping = xml_parser.get_data(
            config.XML_DIR, split_name)
        print("Class Mapping:", class_mapping)
        print("Classs counts:", classes_count)

        # Enrich the annotations step by step before turning them into arrays.
        annotation = candidate.attach_candidate(annotation,
                                                config.CANDIDATE_DIR)
        annotation, self.vocab = Neighbour.attach_neighbour(
            annotation,
            config.OCR_DIR,
            config.IMAGE_DIR,
            vocab_size=config.VOCAB_SIZE)
        annotation = op.normalize_positions(annotation)
        _data = preprocess.parse_input(annotation, class_mapping,
                                       config.NEIGHBOURS, self.vocab)
        (self.field_ids, self.candidate_cords, self.neighbours,
         self.neighbour_cords, self.mask, self.labels) = _data

        payload = {
            'count': classes_count,
            "mapping": class_mapping,
            'vocab': self.vocab,
            'data': _data,
        }
        print("Saving Cache..")
        with open(cached_data_path, 'wb') as cache_file:
            pickle.dump(payload, cache_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
        print("Done !!")
        return

    print("Preprocessed data available, Loading data from cache...")
    with open(cached_data_path, "rb") as cache_file:
        payload = pickle.load(cache_file)

    classes_count = payload['count']
    class_mapping = payload['mapping']
    print("\nClass Mapping:", class_mapping)
    print("Classs counts:", classes_count)

    _data = payload['data']
    self.vocab = payload['vocab']
    (self.field_ids, self.candidate_cords, self.neighbours,
     self.neighbour_cords, self.mask, self.labels) = _data
def get_subtotal(all_words, height, width, words):
    """Collect amount-like words that have the word ``subtotal`` nearby.

    Each word is first screened by ``_is_subtotal_amount_text``. Survivors
    are kept when the neighbours returned by ``Neighbour.find_neighbour``
    (50% of the width, 1% of the height as offsets) include a word whose
    lower-cased text is exactly ``subtotal``.

    Args:
        all_words: Iterable of dicts with ``text``, ``left``, ``top``,
            ``width`` and ``height`` keys.
        height: Page height.
        width: Page width.
        words: Word collection handed to ``Neighbour.find_neighbour``.

    Returns:
        A list of dicts with ``text``, ``x1``, ``y1``, ``x2`` and ``y2`` keys.
    """
    amounts = []
    for word in all_words:
        text = word['text']
        if not _is_subtotal_amount_text(text):
            continue
        try:
            box = {
                'text': text,
                'x1': word['left'],
                'y1': word['top'],
                'x2': word['left'] + word['width'],
                'y2': word['top'] + word['height'],
            }
            # The lookup gets a copy so the stored dict stays untouched.
            neighbours = Neighbour.find_neighbour(dict(box), words,
                                                  int(width * 0.5),
                                                  int(height * 0.01),
                                                  width, height)
            neighbours_text = {it['text'].lower() for it in neighbours}
        except ValueError:
            # As before, a ValueError during the lookup just drops the word.
            continue
        if 'subtotal' in neighbours_text:
            amounts.append(box)
    return amounts


def _is_subtotal_amount_text(text):
    """Return True when ``text`` passes every amount filter of get_subtotal."""
    # Empty OCR tokens used to crash on text[0]; they can never be amounts.
    if not text:
        return False
    # Integers with a leading zero are rejected.
    if text[0] == '0' and text.isdigit():
        return False
    # '$' and ',' are not digits, so stripping them first is unnecessary.
    if not any(char.isdigit() for char in text):
        return False
    if any(marker in text for marker in ('-', '/', '(', '%')):
        return False
    if re.search('[a-zA-Z]', text):
        return False
    if len(text) > 15:
        return False
    # Plain integers longer than four digits are rejected.
    if text.isdigit() and len(text) > 4:
        return False
    # Three dot-separated parts ending in four characters (e.g. 12.05.2020).
    parts = text.split('.')
    if len(parts) == 3 and len(parts[2]) == 4:
        return False
    return True
def get_invoice_nums(all_words, height, width, words):
    """Collect words that look like invoice numbers.

    A word is considered when it contains a digit, is at least two
    characters long, has at most one hyphen and consists only of ASCII
    letters, digits and hyphens. It is kept when it has no neighbours at
    all, or when a neighbour reads ``invoice`` (after lower-casing and
    removing ``#`` and ``:``) or is exactly ``#``.

    Args:
        all_words: Iterable of dicts with ``text``, ``left``, ``top``,
            ``width`` and ``height`` keys.
        height: Page height; 2% of it is used as the vertical search offset.
        width: Page width; 30% of it is used as the horizontal search offset.
        words: Word collection handed to ``Neighbour.find_neighbour``.

    Returns:
        A list of dicts with ``text``, ``x1``, ``y1``, ``x2`` and ``y2`` keys.
    """
    inv_nums = []
    # Raw strings: '\d' in a normal string literal is an invalid escape.
    has_digit = re.compile(r'\d')
    invoice_no_re = re.compile(r'^[0-9a-zA-Z-]+$')

    for word in all_words:
        text = word['text']
        if not has_digit.search(text):
            continue
        if len(text) < 2:
            continue
        if len(text.split('-')) > 2:
            continue
        if not invoice_no_re.search(text):
            continue

        box = {
            'text': text,
            'x1': word['left'],
            'y1': word['top'],
            'x2': word['left'] + word['width'],
            'y2': word['top'] + word['height'],
        }
        # The lookup gets a copy so the stored dict stays untouched.
        neighbours = Neighbour.find_neighbour(dict(box), words,
                                              int(width * 0.3),
                                              int(height * 0.02),
                                              width, height)
        neighbours_text = [it['text'].lower() for it in neighbours]
        neighbours_text_clean = [
            entry.replace('#', '').replace(':', '')
            for entry in neighbours_text
        ]
        # The two conditions are mutually exclusive (the keyword checks
        # cannot succeed on an empty neighbour list), so one append suffices.
        if (not neighbours or 'invoice' in neighbours_text_clean
                or '#' in neighbours_text):
            inv_nums.append(box)
    return inv_nums
def attach_neighbour_candidates(width, height, ocr_data, candidates):
    """Attach neighbouring OCR words to every candidate.

    Positions whose ``text`` entry is an empty string are removed from every
    list in ``ocr_data``. The remaining entries are turned into word boxes,
    and each candidate receives a ``neighbours`` key holding the result of
    ``Neighbour.find_neighbour`` with offsets of 10% of the page width and
    height.

    Args:
        width: Page width.
        height: Page height.
        ocr_data: Dict of parallel lists; ``text``, ``left``, ``top``,
            ``width`` and ``height`` are read. It is filtered IN PLACE, as
            in the original implementation.
        candidates: Dict mapping a class to a list of candidate dicts; the
            candidate dicts are modified in place.

    Returns:
        The same ``candidates`` object.
    """
    # A set makes the membership test O(1); the list used previously made
    # the filtering quadratic in the number of OCR entries.
    empty_index = {
        i for i, ele in enumerate(ocr_data['text']) if ele == ""
    }
    for key in ocr_data:
        ocr_data[key] = [
            value for i, value in enumerate(ocr_data[key])
            if i not in empty_index
        ]

    words = [
        {'text': txt, 'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h}
        for txt, x, y, w, h in zip(ocr_data['text'], ocr_data['left'],
                                   ocr_data['top'], ocr_data['width'],
                                   ocr_data['height'])
    ]

    x_offset = int(width * 0.1)
    y_offset = int(height * 0.1)
    for both_cads in candidates.values():
        for cad in both_cads:
            cad['neighbours'] = Neighbour.find_neighbour(
                cad, words, x_offset, y_offset, width, height)
    return candidates
def attach_neighbour_candidates(width, height, ocr_data, candidates):
    """Attach neighbouring words to every candidate.

    Word boxes are built from the entries of ``ocr_data['Blocks']`` whose
    ``BlockType`` is ``'WORD'``; their bounding-box values are multiplied by
    ``width`` / ``height``. Each candidate then receives a ``neighbours``
    key holding the result of ``Neighbour.find_neighbour`` with offsets of
    30% of the width and 2% of the height.

    Args:
        width: Page width.
        height: Page height.
        ocr_data: Dict with a ``Blocks`` list; each WORD block provides
            ``Text`` and ``Geometry['BoundingBox']`` with ``Left``, ``Top``,
            ``Width`` and ``Height``.
        candidates: Dict mapping a class to a list of candidate dicts; the
            candidate dicts are modified in place.

    Returns:
        The same ``candidates`` object.
    """
    word_blocks = [
        block for block in ocr_data['Blocks']
        if block['BlockType'] == 'WORD'
    ]

    words = []
    for block in word_blocks:
        text = block['Text']
        bbox = block['Geometry']['BoundingBox']
        left = bbox['Left'] * width
        top = bbox['Top'] * height
        right = left + bbox['Width'] * width
        bottom = bbox['Height'] * height + top
        words.append({
            'text': text,
            'x1': left,
            'y1': top,
            'x2': right,
            'y2': bottom,
        })

    x_offset = int(width * 0.3)
    y_offset = int(height * 0.02)
    for group in candidates.values():
        for cad in group:
            cad['neighbours'] = Neighbour.find_neighbour(
                cad, words, x_offset, y_offset, width, height)
    return candidates
def get_ship_to(lines, height, width, words):
    """Collect address-like text lines that have 'ship' (and not 'bill') nearby.

    Lines are grouped by their ``bbox[0]`` value; groups whose keys differ by
    exactly 1 are merged. Text assembled from a group (or from a single
    line) is kept when it matches ``regexp`` and the neighbours returned by
    ``Neighbour.find_neighbour`` contain ``ship`` but not ``bill``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm the indentation of the
    "last line" section and of the group reset against the original layout.

    Args:
        lines: Sequence of dicts with ``text`` and ``bbox``; ``bbox[0..3]``
            are used as x1, y1, x2, y2.
        height: Page height; used to normalise vertical gaps and (1%) as
            the vertical neighbour offset.
        width: Page width; 20% of it is the horizontal neighbour offset.
        words: Word collection handed to ``Neighbour.find_neighbour``.

    Returns:
        A list of candidate dicts with ``text``, ``x1``, ``y1``, ``x2``,
        ``y2`` keys.
    """
    ship_to = []
    # Free text, a 1-4 digit number, more text, a two-capital-letter code
    # and a five-digit number, each separated by single spaces.
    regexp = ".+ [0-9]{1,4} .+ [A-Z]{2} [0-9]{5}"
    coord_left = []
    for l in lines:
        coord_left.append(l['bbox'][0])
    # Map each distinct bbox[0] value to the indices of the lines having it.
    dups = defaultdict(list)
    for i, e in enumerate(coord_left):
        dups[e].append(i)
    # State of the group currently being accumulated.
    index_list = []
    x1_list = []
    y1_list = []
    x2_list = []
    y2_list = []
    key_list = []
    for key in sorted(dups):
        if not index_list:
            # No open group: start one from this key's lines.
            key_list.append(key)
            index_list.append(dups[key])
            # flatten_list is defined outside this block; presumably it
            # flattens one level of nesting - confirm.
            index_list = flatten_list(index_list)
            for item in index_list:
                x1_list.append(lines[item]['bbox'][0])
                y1_list.append(lines[item]['bbox'][1])
                x2_list.append(lines[item]['bbox'][2])
                y2_list.append(lines[item]['bbox'][3])
        else:
            if key - key_list[-1] == 1:
                # Key is exactly 1 above the previous key: merge its lines
                # into the open group.
                key_list.append(key)
                index_list.append(dups[key])
                index_list = flatten_list(index_list)
                index_list = sorted(index_list)
                pop_ind = []
                rem_list = []
                # Walk consecutive line indices of the merged group.
                for ind in range(1, len(index_list)):
                    # Difference of bbox[1] values as a fraction of the
                    # page height (signed; no abs() is applied).
                    delta_y1 = (lines[index_list[ind]]['bbox'][1] - \
                                lines[index_list[ind - 1]]['bbox'][1])/height
                    if delta_y1 < 0.021:
                        # Close enough: add both lines to the running block.
                        pop_ind.append(index_list[ind - 1])
                        pop_ind.append(index_list[ind])
                        # De-duplicate; set() does not guarantee an order.
                        pop_ind = list(set(pop_ind))
                        text_0 = ' '.join([lines[i]['text'] for i in pop_ind])
                        # Collapse runs of spaces.
                        text_0 = re.sub(' +', ' ', text_0)
                        # Only the previous line's box is added here.
                        x1_list.append(lines[index_list[ind - 1]]['bbox'][0])
                        y1_list.append(lines[index_list[ind - 1]]['bbox'][1])
                        x2_list.append(lines[index_list[ind - 1]]['bbox'][2])
                        y2_list.append(lines[index_list[ind - 1]]['bbox'][3])
                        bb_coord_0 = [
                            min(x1_list),
                            min(y1_list),
                            max(x2_list),
                            max(y2_list)
                        ]
                        if re.findall(regexp, text_0):
                            cad = {
                                'text': text_0,
                                'x1': bb_coord_0[0],
                                'y1': bb_coord_0[1],
                                'x2': bb_coord_0[2],
                                'y2': bb_coord_0[3]
                            }
                            neighbours = Neighbour.find_neighbour(
                                cad, words, int(width * 0.2),
                                int(height * 0.01), width, height)
                            neighbours_text = [
                                it['text'].lower() for it in neighbours
                            ]
                            if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                                ship_to.append(cad)
                    else:
                        # Gap too large: evaluate the previous line on its
                        # own and schedule it for removal from the group.
                        txt = lines[index_list[ind - 1]]['text']
                        x1 = [lines[index_list[ind - 1]]['bbox'][0]]
                        y1 = [lines[index_list[ind - 1]]['bbox'][1]]
                        x2 = [lines[index_list[ind - 1]]['bbox'][2]]
                        y2 = [lines[index_list[ind - 1]]['bbox'][3]]
                        bb_coord = [min(x1), min(y1), max(x2), max(y2)]
                        rem_list.append(index_list[ind - 1])
                        if re.findall(regexp, txt):
                            cad = {
                                'text': txt,
                                'x1': bb_coord[0],
                                'y1': bb_coord[1],
                                'x2': bb_coord[2],
                                'y2': bb_coord[3]
                            }
                            neighbours = Neighbour.find_neighbour(
                                cad, words, int(width * 0.2),
                                int(height * 0.01), width, height)
                            neighbours_text = [
                                it['text'].lower() for it in neighbours
                            ]
                            if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                                ship_to.append(cad)
                # The last line of the group is evaluated on its own; it is
                # scheduled for removal only when it matches the pattern.
                txt = lines[index_list[-1]]['text']
                x1 = [lines[index_list[-1]]['bbox'][0]]
                y1 = [lines[index_list[-1]]['bbox'][1]]
                x2 = [lines[index_list[-1]]['bbox'][2]]
                y2 = [lines[index_list[-1]]['bbox'][3]]
                bb_coord = [min(x1), min(y1), max(x2), max(y2)]
                if re.findall(regexp, txt):
                    rem_list.append(index_list[-1])
                    cad = {
                        'text': txt,
                        'x1': bb_coord[0],
                        'y1': bb_coord[1],
                        'x2': bb_coord[2],
                        'y2': bb_coord[3]
                    }
                    neighbours = Neighbour.find_neighbour(
                        cad, words, int(width * 0.2), int(height * 0.01),
                        width, height)
                    neighbours_text = [it['text'].lower() for it in neighbours]
                    if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                        ship_to.append(cad)
                # Drop the lines that were handled individually.
                for ind in rem_list:
                    index_list.remove(ind)
            else:
                # Key is not adjacent: evaluate the open group as a whole,
                # then start a new group from this key.
                if len(index_list) == 1:
                    # After a reset index_list is [dups[key]], a list
                    # holding one list, so it is flattened first.
                    index_list = flatten_list(index_list)
                text = ' '.join([lines[i]['text'] for i in index_list])
                text = re.sub(' +', ' ', text)
                coord = [
                    min(x1_list),
                    min(y1_list),
                    max(x2_list),
                    max(y2_list)
                ]
                if re.findall(regexp, text):
                    cad = {
                        'text': text,
                        'x1': coord[0],
                        'y1': coord[1],
                        'x2': coord[2],
                        'y2': coord[3]
                    }
                    neighbours = Neighbour.find_neighbour(
                        cad, words, int(width * 0.2), int(height * 0.01),
                        width, height)
                    neighbours_text = [it['text'].lower() for it in neighbours]
                    if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                        ship_to.append(cad)
                key_list = [key]
                index_list = [dups[key]]
                # Each iteration rebinds the lists, so only the last item
                # of dups[key] remains in them after the loop.
                for item in dups[key]:
                    x1_list = [lines[item]['bbox'][0]]
                    y1_list = [lines[item]['bbox'][1]]
                    x2_list = [lines[item]['bbox'][2]]
                    y2_list = [lines[item]['bbox'][3]]
    # The group still open when the loop ends is not evaluated here.
    return ship_to
def due_dates(all_text, all_words, height, width, words):
    """Collect date strings that have a word containing 'due' nearby.

    Dates are located in ``all_text`` with ``search_dates``, screened by
    ``_is_due_date_text`` and mapped back to token positions in
    ``all_words`` by counting the space-separated tokens that precede the
    match. Each accepted match is masked with ``*`` in the working copy of
    the text so an identical later match resolves to the next occurrence.

    Args:
        all_text: Page text; assumed to be the ``text`` values of
            ``all_words`` joined by single spaces - TODO confirm.
        all_words: Sequence of dicts with ``text``, ``left``, ``top``,
            ``width`` and ``height`` keys.
        height: Page height; 2% of it is used as the vertical search offset.
        width: Page width; 30% of it is used as the horizontal search offset.
        words: Word collection handed to ``Neighbour.find_neighbour``.

    Returns:
        A list of dicts with ``text``, ``x1``, ``y1``, ``x2``, ``y2`` keys.
        A date is appended once per neighbour whose lower-cased text
        contains ``due``.
    """
    all_dates = []
    indices = []

    # search_dates is presumably dateparser's, which returns None rather
    # than an empty list when no date is found; treat that as "no matches".
    matches = search_dates(all_text) or []
    for match in matches:
        text = match[0]
        if not _is_due_date_text(text):
            continue

        idx = all_text.find(text)
        if idx < 0:
            # The match text is not present verbatim; a -1 offset would
            # otherwise be used as a slice bound and give bogus positions.
            continue

        token_length = len(text.split(' '))
        index = 0 if idx == 0 else len(all_text[:idx].strip().split(' '))
        indices.append(list(range(index, index + token_length)))

        # Mask the matched span, keeping token lengths intact.
        replaced_text = ' '.join(['*' * len(i) for i in text.split(' ')])
        end = idx + len(text)
        all_text = all_text[:end].replace(text, replaced_text) + all_text[end:]

    for date_indices in indices:
        date = ''
        left, top, right, bottom = [], [], [], []
        for i in date_indices:
            date += ' ' + all_words[i]['text']
            left.append(all_words[i]['left'])
            top.append(all_words[i]['top'])
            right.append(all_words[i]['left'] + all_words[i]['width'])
            bottom.append(all_words[i]['top'] + all_words[i]['height'])

        # NOTE(review): the source layout was whitespace-collapsed; the
        # neighbour lookup is placed after the token loop, anchored on the
        # last token of the date - confirm against the original nesting.
        cad = {
            'text': all_words[i]['text'],
            'x1': all_words[i]['left'],
            'y1': all_words[i]['top'],
            'x2': all_words[i]['left'] + all_words[i]['width'],
            'y2': all_words[i]['top'] + all_words[i]['height']
        }
        neighbours = Neighbour.find_neighbour(cad, words, int(width * 0.3),
                                              int(height * 0.02), width,
                                              height)
        for item in neighbours:
            if 'due' in item['text'].lower():
                all_dates.append({
                    'text': date.strip(),
                    'x1': min(left),
                    'y1': min(top),
                    'x2': max(right),
                    'y2': max(bottom)
                })
    return all_dates


def _is_due_date_text(text):
    """Return True when a matched date string passes every due_dates filter."""
    if len(text.strip()) < 8 or len(text) > 15:
        return False
    if any(marker in text for marker in ('$', 'day', '%', '#', ':', '(')):
        return False
    if 'hour' in text.lower():
        return False
    # Strings longer than ten characters must contain an ASCII letter.
    if len(text) > 10 and not re.search('[a-zA-Z]', text):
        return False
    return not text.isdigit()