def get_pt(all_words, height, width, words):
    """Collect small integers (1..32) that have a terms-like keyword nearby.

    Judging by the keywords checked ('net', 'terms', 'due' + 'days') these are
    presumably payment-term day counts -- confirm against the caller.

    Args:
        all_words: iterable of OCR word dicts with 'text', 'left', 'top',
            'width' and 'height' keys.
        height: page height; 2% of it is passed to the neighbour search.
        width: page width; 30% of it is passed to the neighbour search.
        words: word collection handed unchanged to
            ``Neighbour.find_neighbour``.

    Returns:
        A list of ``{'text', 'x1', 'y1', 'x2', 'y2'}`` dicts, one per accepted
        word, in input order.
    """

    def _as_box(word):
        # Convert a left/top/width/height word into corner coordinates.
        return {
            'text': word['text'],
            'x1': word['left'],
            'y1': word['top'],
            'x2': word['left'] + word['width'],
            'y2': word['top'] + word['height'],
        }

    x_offset = int(width * 0.3)
    y_offset = int(height * 0.02)

    pt = []
    for word in all_words:
        text = word['text']
        # isdecimal() rather than isdigit(): isdigit() is also true for
        # characters such as superscript digits, which int() cannot parse.
        if not text.isdecimal():
            continue
        if not 0 < int(text) < 33:
            continue

        neighbours = Neighbour.find_neighbour(_as_box(word), words, x_offset,
                                              y_offset, width, height)
        # Trailing/inner dots are dropped so that e.g. "Net." matches "net".
        labels = {it['text'].lower().replace('.', '') for it in neighbours}

        has_keyword = ('net' in labels or 'terms' in labels
                       or ('due' in labels and 'days' in labels))
        if has_keyword:
            pt.append(_as_box(word))
    return pt
Пример #2
0
 def __init__(self, split_name='train'):
     """Populate the dataset tensors for ``split_name``, using a pickle cache.

     When ``config.OUTPUT_DIR / cached_data_<split_name>.pickle`` exists it is
     loaded as-is; otherwise the data is built from the XML annotations,
     candidates and OCR neighbours and then written to that cache file.
     Either way ``self.vocab`` and the six data attributes are set.
     """
     cache_file = config.OUTPUT_DIR / f"cached_data_{split_name}.pickle"

     if not cache_file.exists():
         # Cache miss: run the whole preprocessing pipeline.
         print("Preprocessed data not available")
         parsed, counts, mapping = xml_parser.get_data(config.XML_DIR, split_name)
         print("Class Mapping:", mapping)
         print("Classs counts:", counts)
         parsed = candidate.attach_candidate(parsed, config.CANDIDATE_DIR)
         parsed, self.vocab = Neighbour.attach_neighbour(
             parsed, config.OCR_DIR, config.IMAGE_DIR,
             vocab_size=config.VOCAB_SIZE)
         parsed = op.normalize_positions(parsed)
         tensors = preprocess.parse_input(parsed, mapping, config.NEIGHBOURS, self.vocab)
         (self.field_ids, self.candidate_cords, self.neighbours,
          self.neighbour_cords, self.mask, self.labels) = tensors
         payload = {'count': counts, "mapping": mapping, 'vocab': self.vocab, 'data': tensors}
         print("Saving Cache..")
         with open(cache_file, 'wb') as sink:
             pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)
         print("Done !!")
     else:
         # Cache hit: restore everything from the pickle.
         # NOTE: pickle.load executes arbitrary code from the file, so the
         # cache directory must only contain trusted files.
         print("Preprocessed data available, Loading data from cache...")
         with open(cache_file, "rb") as source:
             payload = pickle.load(source)
         counts = payload['count']
         mapping = payload['mapping']
         print("\nClass Mapping:", mapping)
         print("Classs counts:", counts)
         tensors = payload['data']
         self.vocab = payload['vocab']
         (self.field_ids, self.candidate_cords, self.neighbours,
          self.neighbour_cords, self.mask, self.labels) = tensors
def get_subtotal(all_words, height, width, words):
    """Collect amount-like words that have a 'subtotal' label nearby.

    A word is considered amount-like when it contains a digit and does not
    look like a code, date, percentage, bracketed value or long integer (see
    ``_looks_like_amount``).

    Args:
        all_words: iterable of OCR word dicts with 'text', 'left', 'top',
            'width' and 'height' keys.
        height: page height; 1% of it is passed to the neighbour search.
        width: page width; 50% of it is passed to the neighbour search.
        words: word collection handed unchanged to
            ``Neighbour.find_neighbour``.

    Returns:
        A list of ``{'text', 'x1', 'y1', 'x2', 'y2'}`` dicts, in input order.
    """

    def _looks_like_amount(text):
        # Empty OCR tokens used to raise IndexError on text[0]; reject them.
        if not text:
            return False
        if not any(ch.isdigit() for ch in text):
            return False
        # Pure integers with a leading zero or more than four digits are
        # rejected.
        if text.isdigit() and (text[0] == '0' or len(text) > 4):
            return False
        if len(text) > 15:
            return False
        if any(symbol in text for symbol in ('-', '/', '(', '%')):
            return False
        if re.search('[a-zA-Z]', text):
            return False
        # Three dot-separated parts ending in four characters, e.g.
        # "12.05.2020", are rejected (presumably dates).
        parts = text.split('.')
        if len(parts) == 3 and len(parts[2]) == 4:
            return False
        return True

    x_offset = int(width * 0.5)
    y_offset = int(height * 0.01)

    amounts = []
    for word in all_words:
        text = word['text']
        if not _looks_like_amount(text):
            continue

        box = {
            'text': text,
            'x1': word['left'],
            'y1': word['top'],
            'x2': word['left'] + word['width'],
            'y2': word['top'] + word['height'],
        }
        try:
            neighbours = Neighbour.find_neighbour(box, words, x_offset,
                                                  y_offset, width, height)
        except ValueError:
            # Kept from the original best-effort handling: a word whose
            # neighbour lookup fails with ValueError is skipped.
            continue

        if any(it['text'].lower() == 'subtotal' for it in neighbours):
            amounts.append({
                'text': text,
                'x1': word['left'],
                'y1': word['top'],
                'x2': word['left'] + word['width'],
                'y2': word['top'] + word['height'],
            })

    return amounts
def get_invoice_nums(all_words, height, width, words):
    """Collect words that look like invoice numbers.

    A word qualifies when it is at least two characters long, contains a
    digit, has at most one hyphen and consists only of ASCII letters, digits
    and hyphens. It is kept when the neighbour search returns nothing at all,
    or when a neighbour reads 'invoice' (ignoring '#' and ':') or is exactly
    '#'.

    Args:
        all_words: iterable of OCR word dicts with 'text', 'left', 'top',
            'width' and 'height' keys.
        height: page height; 2% of it is passed to the neighbour search.
        width: page width; 30% of it is passed to the neighbour search.
        words: word collection handed unchanged to
            ``Neighbour.find_neighbour``.

    Returns:
        A list of ``{'text', 'x1', 'y1', 'x2', 'y2'}`` dicts, in input order.
    """
    # Raw strings: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    has_digit = re.compile(r'\d')
    invoice_no = re.compile(r'^[0-9a-zA-Z-]+$')

    x_offset = int(width * 0.3)
    y_offset = int(height * 0.02)

    inv_nums = []
    for word in all_words:
        text = word['text']
        if len(text) < 2 or not has_digit.search(text):
            continue
        # More than one hyphen means three or more '-'-separated parts.
        if text.count('-') > 1:
            continue
        if not invoice_no.search(text):
            continue

        box = {
            'text': text,
            'x1': word['left'],
            'y1': word['top'],
            'x2': word['left'] + word['width'],
            'y2': word['top'] + word['height'],
        }
        neighbours = Neighbour.find_neighbour(box, words, x_offset, y_offset,
                                              width, height)

        labels = [it['text'].lower() for it in neighbours]
        cleaned = [lbl.replace('#', '').replace(':', '') for lbl in labels]

        # The "no neighbours" and "labelled" cases are mutually exclusive, so
        # a word is appended at most once.
        if not neighbours or 'invoice' in cleaned or '#' in labels:
            inv_nums.append({
                'text': text,
                'x1': word['left'],
                'y1': word['top'],
                'x2': word['left'] + word['width'],
                'y2': word['top'] + word['height'],
            })

    return inv_nums
def attach_neighbour_candidates(width, height, ocr_data, candidates):
    """Attach a 'neighbours' entry to every candidate.

    Args:
        width: page width; 10% of it is passed to the neighbour search.
        height: page height; 10% of it is passed to the neighbour search.
        ocr_data: dict of parallel lists with at least 'text', 'left', 'top',
            'width' and 'height'. NOTE: it is filtered in place -- entries
            whose text is the empty string are removed from every list.
        candidates: dict mapping a class name to a list of candidate dicts;
            each candidate is mutated to carry a 'neighbours' key.

    Returns:
        The same ``candidates`` object that was passed in.
    """
    # A set makes the per-element membership test O(1); the original list
    # lookup made the filtering quadratic in the number of OCR tokens.
    empty_positions = {
        i for i, txt in enumerate(ocr_data['text']) if txt == ""
    }
    for key in ocr_data:
        ocr_data[key] = [
            value for i, value in enumerate(ocr_data[key])
            if i not in empty_positions
        ]

    words = [
        {'text': txt, 'x1': x, 'y1': y, 'x2': x + w, 'y2': y + h}
        for txt, x, y, w, h in zip(ocr_data['text'], ocr_data['left'],
                                   ocr_data['top'], ocr_data['width'],
                                   ocr_data['height'])
    ]

    x_offset = int(width * 0.1)
    y_offset = int(height * 0.1)
    for class_candidates in candidates.values():
        for cad in class_candidates:
            cad['neighbours'] = Neighbour.find_neighbour(
                cad, words, x_offset, y_offset, width, height)
    return candidates
Пример #6
0
def attach_neighbour_candidates(width, height, ocr_data, candidates):
    """Attach a 'neighbours' entry to every candidate.

    ``ocr_data['Blocks']`` is scanned for items whose 'BlockType' is 'WORD';
    their bounding boxes are multiplied by the page width/height to obtain
    corner coordinates.

    Args:
        width: page width; also 30% of it is passed to the neighbour search.
        height: page height; also 2% of it is passed to the neighbour search.
        ocr_data: dict with a 'Blocks' list of block dicts.
        candidates: dict mapping a class name to a list of candidate dicts;
            each candidate is mutated to carry a 'neighbours' key.

    Returns:
        The same ``candidates`` object that was passed in.
    """
    words = []
    for block in ocr_data['Blocks']:
        if block['BlockType'] != 'WORD':
            continue
        text = block['Text']
        bbox = block['Geometry']['BoundingBox']
        left = bbox['Left'] * width
        top = bbox['Top'] * height
        words.append({
            'text': text,
            'x1': left,
            'y1': top,
            'x2': left + bbox['Width'] * width,
            'y2': bbox['Height'] * height + top,
        })

    x_reach = int(width * 0.3)
    y_reach = int(height * 0.02)
    for class_candidates in candidates.values():
        for cad in class_candidates:
            cad['neighbours'] = Neighbour.find_neighbour(
                cad, words, x_reach, y_reach, width, height)
    return candidates
def get_ship_to(lines, height, width, words):
    """Collect address-like text blocks that have a 'ship' label nearby.

    Lines are grouped by their left coordinate (``bbox[0]``); groups whose
    left coordinates differ by exactly 1 are merged. Text built from a group
    (or from a single line) becomes a candidate when it matches ``regexp``,
    and the candidate is kept when the neighbour search returns a word 'ship'
    and no word 'bill'.

    Args:
        lines: sequence of dicts with a 'text' string and a 'bbox' sequence
            read as [x1, y1, x2, y2].
        height: page height; used to normalise vertical gaps, and 1% of it is
            passed to the neighbour search.
        width: page width; 20% of it is passed to the neighbour search.
        words: word collection handed unchanged to
            ``Neighbour.find_neighbour``.

    Returns:
        A list of ``{'text', 'x1', 'y1', 'x2', 'y2'}`` candidate dicts.

    NOTE(review): the group started by the last (non-adjacent) key is never
    evaluated, because groups are only flushed when a later key arrives --
    confirm whether that is intended.
    """
    ship_to = []
    # Anything, a 1-4 digit number, anything, two capitals, five digits --
    # presumably "street number ... STATE ZIP"; TODO confirm.
    regexp = ".+ [0-9]{1,4} .+ [A-Z]{2} [0-9]{5}"
    # Left x coordinate of every line, in input order.
    coord_left = []
    for l in lines:
        coord_left.append(l['bbox'][0])

    # Map each distinct left coordinate to the indices of the lines using it.
    dups = defaultdict(list)
    for i, e in enumerate(coord_left):
        dups[e].append(i)

    # State of the group currently being accumulated: member line indices,
    # their bbox coordinates, and the left-coordinate keys merged so far.
    index_list = []
    x1_list = []
    y1_list = []
    x2_list = []
    y2_list = []
    key_list = []
    for key in sorted(dups):
        if not index_list:
            # First key (or an emptied group): seed the group with its lines.
            key_list.append(key)
            index_list.append(dups[key])
            # flatten_list is defined elsewhere; presumably it turns the
            # nested [[...]] into a flat list of indices.
            index_list = flatten_list(index_list)
            for item in index_list:
                x1_list.append(lines[item]['bbox'][0])
                y1_list.append(lines[item]['bbox'][1])
                x2_list.append(lines[item]['bbox'][2])
                y2_list.append(lines[item]['bbox'][3])
        else:
            # A left coordinate exactly 1 larger than the previous key is
            # merged into the current group.
            if key - key_list[-1] == 1:
                key_list.append(key)
                index_list.append(dups[key])
                index_list = flatten_list(index_list)
                index_list = sorted(index_list)
                # pop_ind: indices of lines joined into one multi-line text.
                # rem_list: lines evaluated on their own, removed afterwards.
                pop_ind = []
                rem_list = []
                for ind in range(1, len(index_list)):
                    # Vertical distance between consecutive lines of the
                    # group, as a fraction of the page height.
                    delta_y1 = (lines[index_list[ind]]['bbox'][1] - \
                               lines[index_list[ind - 1]]['bbox'][1])/height
                    if delta_y1 < 0.021:
                        # Close enough: treat both lines as one block.
                        pop_ind.append(index_list[ind - 1])
                        pop_ind.append(index_list[ind])
                        # NOTE(review): list(set(...)) deduplicates but does
                        # not guarantee order, so the join order below is not
                        # guaranteed to follow line order.
                        pop_ind = list(set(pop_ind))
                        text_0 = ' '.join([lines[i]['text'] for i in pop_ind])
                        text_0 = re.sub(' +', ' ', text_0)
                        # NOTE(review): only the previous line's bbox is
                        # added here, not the current line's.
                        x1_list.append(lines[index_list[ind - 1]]['bbox'][0])
                        y1_list.append(lines[index_list[ind - 1]]['bbox'][1])
                        x2_list.append(lines[index_list[ind - 1]]['bbox'][2])
                        y2_list.append(lines[index_list[ind - 1]]['bbox'][3])
                        # Bounding box over everything accumulated so far.
                        bb_coord_0 = [
                            min(x1_list),
                            min(y1_list),
                            max(x2_list),
                            max(y2_list)
                        ]
                        if re.findall(regexp, text_0):
                            cad = {
                                'text': text_0,
                                'x1': bb_coord_0[0],
                                'y1': bb_coord_0[1],
                                'x2': bb_coord_0[2],
                                'y2': bb_coord_0[3]
                            }
                            neighbours = Neighbour.find_neighbour(
                                cad, words, int(width * 0.2),
                                int(height * 0.01), width, height)

                            neighbours_text = [
                                it['text'].lower() for it in neighbours
                            ]

                            # Keep only blocks labelled 'ship' and not 'bill'.
                            if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                                ship_to.append(cad)
                    else:
                        # Gap too large: evaluate the previous line on its
                        # own and schedule it for removal from the group.
                        txt = lines[index_list[ind - 1]]['text']
                        x1 = [lines[index_list[ind - 1]]['bbox'][0]]
                        y1 = [lines[index_list[ind - 1]]['bbox'][1]]
                        x2 = [lines[index_list[ind - 1]]['bbox'][2]]
                        y2 = [lines[index_list[ind - 1]]['bbox'][3]]
                        bb_coord = [min(x1), min(y1), max(x2), max(y2)]
                        rem_list.append(index_list[ind - 1])
                        if re.findall(regexp, txt):
                            cad = {
                                'text': txt,
                                'x1': bb_coord[0],
                                'y1': bb_coord[1],
                                'x2': bb_coord[2],
                                'y2': bb_coord[3]
                            }
                            neighbours = Neighbour.find_neighbour(
                                cad, words, int(width * 0.2),
                                int(height * 0.01), width, height)

                            neighbours_text = [
                                it['text'].lower() for it in neighbours
                            ]

                            if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                                ship_to.append(cad)

                # The last line of the group is always evaluated on its own;
                # it is only removed from the group when it matches regexp.
                txt = lines[index_list[-1]]['text']
                x1 = [lines[index_list[-1]]['bbox'][0]]
                y1 = [lines[index_list[-1]]['bbox'][1]]
                x2 = [lines[index_list[-1]]['bbox'][2]]
                y2 = [lines[index_list[-1]]['bbox'][3]]
                bb_coord = [min(x1), min(y1), max(x2), max(y2)]
                if re.findall(regexp, txt):
                    rem_list.append(index_list[-1])
                    cad = {
                        'text': txt,
                        'x1': bb_coord[0],
                        'y1': bb_coord[1],
                        'x2': bb_coord[2],
                        'y2': bb_coord[3]
                    }
                    neighbours = Neighbour.find_neighbour(
                        cad, words, int(width * 0.2), int(height * 0.01),
                        width, height)

                    neighbours_text = [it['text'].lower() for it in neighbours]

                    if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                        ship_to.append(cad)
                # Drop the lines that were handled individually.
                for ind in rem_list:
                    index_list.remove(ind)
            else:
                # Non-adjacent left coordinate: flush the accumulated group
                # as one candidate, then start a new group at this key.
                # NOTE(review): index_list may be the nested [dups[key]] set
                # at the end of this branch, or an already flat list; the
                # result of flatten_list on the flat case depends on its
                # implementation -- confirm.
                if len(index_list) == 1:
                    index_list = flatten_list(index_list)
                text = ' '.join([lines[i]['text'] for i in index_list])
                text = re.sub(' +', ' ', text)
                coord = [
                    min(x1_list),
                    min(y1_list),
                    max(x2_list),
                    max(y2_list)
                ]
                if re.findall(regexp, text):
                    cad = {
                        'text': text,
                        'x1': coord[0],
                        'y1': coord[1],
                        'x2': coord[2],
                        'y2': coord[3]
                    }
                    neighbours = Neighbour.find_neighbour(
                        cad, words, int(width * 0.2), int(height * 0.01),
                        width, height)

                    neighbours_text = [it['text'].lower() for it in neighbours]

                    if 'ship' in neighbours_text and 'bill' not in neighbours_text:
                        ship_to.append(cad)
                key_list = [key]
                index_list = [dups[key]]
                # NOTE(review): the lists are rebound on every iteration, so
                # only the last line of dups[key] ends up in them -- confirm
                # whether all lines were meant to be collected.
                for item in dups[key]:
                    x1_list = [lines[item]['bbox'][0]]
                    y1_list = [lines[item]['bbox'][1]]
                    x2_list = [lines[item]['bbox'][2]]
                    y2_list = [lines[item]['bbox'][3]]

    return ship_to
def due_dates(all_text, all_words, height, width, words):
    """Collect date strings that have a word containing 'due' nearby.

    Date-like substrings of ``all_text`` are obtained from ``search_dates``,
    filtered by a set of textual heuristics, mapped back to token positions in
    ``all_words`` (by counting space-separated tokens before the match) and
    kept when the neighbour search around the last token of the date returns
    a word containing 'due'.

    Args:
        all_text: the page text; presumably the 'text' of ``all_words``
            joined with single spaces -- confirm against the caller.
        all_words: sequence of OCR word dicts with 'text', 'left', 'top',
            'width' and 'height' keys.
        height: page height; 2% of it is passed to the neighbour search.
        width: page width; 30% of it is passed to the neighbour search.
        words: word collection handed unchanged to
            ``Neighbour.find_neighbour``.

    Returns:
        A list of ``{'text', 'x1', 'y1', 'x2', 'y2'}`` dicts. A date is
        appended once per neighbouring word containing 'due', so it can
        appear more than once.
    """

    def _is_plausible_date(text):
        # Heuristics that discard amounts, durations, times, ids and overly
        # long or purely numeric matches.
        if len(text.strip()) < 8 or len(text) > 15:
            return False
        if any(marker in text for marker in ('$', 'day', '%', '#', ':', '(')):
            return False
        if 'hour' in text.lower():
            return False
        if len(text) > 10 and not re.search('[a-zA-Z]', text):
            return False
        if text.isdigit():
            return False
        return True

    all_dates = []
    indices = []
    # search_dates is presumably dateparser's, which returns None instead of
    # an empty list when nothing is found; guard so the loop does not fail.
    matches = search_dates(all_text) or []

    for match in matches:
        text = match[0]
        if not _is_plausible_date(text):
            continue

        idx = all_text.find(text)
        if idx == -1:
            # The match can no longer be located (e.g. it overlapped text that
            # was already masked); a -1 offset would yield bogus positions.
            continue

        token_length = len(text.split(' '))
        text_len = len(text)
        # Position of the first date token = number of tokens before it.
        index = 0 if idx == 0 else len(all_text[:idx].strip().split(' '))
        indices.append(list(range(index, index + token_length)))

        # Mask this occurrence with '*' (keeping token lengths) so that an
        # identical date later in the text is found at its own position.
        replaced_text = ' '.join(['*' * len(i) for i in text.split(' ')])
        all_text = all_text[:idx + text_len].replace(
            text, replaced_text) + all_text[idx + text_len:]

    x_offset = int(width * 0.3)
    y_offset = int(height * 0.02)

    for date_indices in indices:
        tokens = [all_words[i] for i in date_indices]
        date = ' '.join(tok['text'] for tok in tokens).strip()
        left = [tok['left'] for tok in tokens]
        top = [tok['top'] for tok in tokens]
        right = [tok['left'] + tok['width'] for tok in tokens]
        bottom = [tok['top'] + tok['height'] for tok in tokens]

        # The neighbour search is anchored on the last token of the date.
        last = tokens[-1]
        cad = {
            'text': last['text'],
            'x1': last['left'],
            'y1': last['top'],
            'x2': last['left'] + last['width'],
            'y2': last['top'] + last['height']
        }
        neighbours = Neighbour.find_neighbour(cad, words, x_offset, y_offset,
                                              width, height)

        for item in neighbours:
            if 'due' in item['text'].lower():
                all_dates.append({
                    'text': date,
                    'x1': min(left),
                    'y1': min(top),
                    'x2': max(right),
                    'y2': max(bottom)
                })

    return all_dates