def test_skipping_first_regression(self):
    # Regression case: two rows of costs for a two-symbol label, so the
    # expected alignment has no 0 entries, even though column 1 is never
    # the cheapest column.
    # NOTE(review): the third argument (0) is presumably the blank symbol
    # index -- confirm against force_align's signature.
    costs = np.asarray([
        [10.0, 10.0, 0.0],
        [0.0, 10.0, 10.0],
    ])
    expected_alignment = [1, 2]

    alignment = force_align(costs, [1, 2], 0)

    self.assertEqual(alignment, expected_alignment)
def test_trivial(self):
    # Smallest case: two rows, one symbol. Row 0 is cheapest in column 0
    # and row 1 is cheapest in column 1, which is also the expected
    # alignment.
    # NOTE(review): the third argument (0) is presumably the blank symbol
    # index -- confirm against force_align's signature.
    costs = np.asarray([
        [0.0, 10.0],
        [10.0, 0.0],
    ])
    expected_alignment = [0, 1]

    alignment = force_align(costs, [1], 0)

    self.assertEqual(alignment, expected_alignment)
def test_multi_symbol_regression(self):
    # Regression case: the label repeats symbol 2 and there are four rows.
    # The expected alignment puts a 0 before each occurrence of symbol 2.
    # NOTE(review): the third argument (0) is presumably the blank symbol
    # index -- confirm against force_align's signature.
    costs = np.asarray([
        [0.0, 10.0, 10.0],
        [10.0, 10.0, 0.0],
        [5.0, 10.0, 5.0],
        [10.0, 10.0, 0.0],
    ])
    expected_alignment = [0, 2, 0, 2]

    alignment = force_align(costs, [2, 2], 0)

    self.assertEqual(alignment, expected_alignment)
def test_single_symbol_multi_blank(self):
    # One symbol in six rows: three rows before it and two after it are
    # expected to align to 0.
    # NOTE(review): the third argument (0) is presumably the blank symbol
    # index -- confirm against force_align's signature.
    row_a = [0.0, 10.0, 0.0]   # zero cost in columns 0 and 2
    row_b = [10.0, 0.0, 10.0]  # zero cost in column 1 only
    costs = np.asarray([row_a] * 3 + [row_b] + [row_a] * 2)
    expected_alignment = [0, 0, 0, 1, 0, 0]

    alignment = force_align(costs, [1], 0)

    self.assertEqual(alignment, expected_alignment)
def to_altoxml_string(self):
    """Serialize this page layout to an ALTO (ns-v2) XML string.

    The document contains a ``Description`` header (measurement unit and
    OCR software info, stamped with today's date) and a ``Layout`` with a
    single ``Page``. Every region in ``self.regions`` becomes a
    ``TextBlock``; every line of a region with a non-empty transcription
    becomes a ``TextLine`` holding one ``String`` per whitespace-separated
    word and an ``SP`` element between consecutive words. Word and space
    boxes are derived from a forced alignment of the transcription against
    the line's logits. The page margins and ``PrintSpace`` box are computed
    from the union of the region bounding boxes.

    Reads ``self.id``, ``self.page_size`` (used as ``(height, width)``)
    and ``self.regions``.

    Returns:
        The pretty-printed XML document, UTF-8 encoded and decoded back
        to ``str``.

    Raises:
        KeyError: If a transcription contains a character that is not in
            the corresponding ``line.characters``.
    """
    nsmap = {
        "xlink": 'http://www.w3.org/1999/xlink',
        "xsi": 'http://www.w3.org/2001/XMLSchema-instance',
    }
    root = ET.Element("alto", nsmap=nsmap)
    root.set("xmlns", "http://www.loc.gov/standards/alto/ns-v2#")

    # ---- Description header -------------------------------------------
    description = ET.SubElement(root, "Description")
    measurement_unit = ET.SubElement(description, "MeasurementUnit")
    measurement_unit.text = "pixel"
    ocr_processing = ET.SubElement(description, "OCRProcessing")
    ocr_processing.set("ID", "IdOcr")
    ocr_processing_step = ET.SubElement(ocr_processing, "ocrProcessingStep")
    processing_date_time = ET.SubElement(ocr_processing_step,
                                         "processingDateTime")
    processing_date_time.text = datetime.today().strftime('%Y-%m-%d')
    processing_software = ET.SubElement(ocr_processing_step,
                                        "processingSoftware")
    processing_creator = ET.SubElement(processing_software, "softwareCreator")
    processing_creator.text = "Project PERO"
    software_name = ET.SubElement(processing_software, "softwareName")
    software_name.text = "PERO OCR"
    software_version = ET.SubElement(processing_software, "softwareVersion")
    software_version.text = "v0.1.0"

    # ---- Page skeleton ------------------------------------------------
    # Elements are created and attributes are set in a fixed order on
    # purpose: that order is what ends up in the serialized document.
    layout = ET.SubElement(root, "Layout")
    page = ET.SubElement(layout, "Page")
    page.set("ID", "id_" + self.id)
    page.set("PHYSICAL_IMG_NR", "1")
    page.set("HEIGHT", str(self.page_size[0]))
    page.set("WIDTH", str(self.page_size[1]))

    top_margin = ET.SubElement(page, "TopMargin")
    left_margin = ET.SubElement(page, "LeftMargin")
    right_margin = ET.SubElement(page, "RightMargin")
    bottom_margin = ET.SubElement(page, "BottomMargin")
    print_space = ET.SubElement(page, "PrintSpace")

    # Running bounding box of all regions. It starts as an empty box at
    # the far corner of the page, so a page without regions yields a
    # zero-sized print space located at (page height, page width).
    print_space_height = 0
    print_space_width = 0
    print_space_vpos = self.page_size[0]
    print_space_hpos = self.page_size[1]

    # Alignment frame indices are multiplied by this factor to obtain
    # horizontal positions.
    # NOTE(review): presumably the horizontal subsampling of the OCR
    # network -- confirm.
    frame_step = 4

    for block in self.regions:
        text_block = ET.SubElement(print_space, "TextBlock")
        text_block.set("ID", block.id)

        block_xs = block.polygon[:, 0]
        block_ys = block.polygon[:, 1]
        text_block_vpos = min(block_ys)
        text_block_hpos = min(block_xs)
        text_block_height = max(block_ys) - text_block_vpos
        text_block_width = max(block_xs) - text_block_hpos
        text_block.set("HEIGHT", str(text_block_height))
        text_block.set("WIDTH", str(text_block_width))
        text_block.set("VPOS", str(text_block_vpos))
        text_block.set("HPOS", str(text_block_hpos))

        # Grow the print space to cover this block. Height/width are
        # first turned into absolute bottom/right edges, the origin is
        # updated, and then they are converted back to extents.
        print_space_height = max([
            print_space_vpos + print_space_height,
            text_block_vpos + text_block_height,
        ])
        print_space_width = max([
            print_space_hpos + print_space_width,
            text_block_hpos + text_block_width,
        ])
        print_space_vpos = min([print_space_vpos, text_block_vpos])
        print_space_hpos = min([print_space_hpos, text_block_hpos])
        print_space_height = print_space_height - print_space_vpos
        print_space_width = print_space_width - print_space_hpos

        for line in block.lines:
            if not line.transcription:
                continue

            text_line = ET.SubElement(text_block, "TextLine")
            text_line_baseline = int(
                np.average(np.array(line.baseline)[:, 1]))
            text_line.set("BASELINE", str(text_line_baseline))

            line_polygon = np.array(line.polygon)
            line_xs = line_polygon[:, 0]
            line_ys = line_polygon[:, 1]
            text_line_vpos = min(line_ys)
            text_line_hpos = min(line_xs)
            text_line.set("VPOS", str(text_line_vpos))
            text_line.set("HPOS", str(text_line_hpos))
            text_line.set("HEIGHT", str(max(line_ys) - text_line_vpos))
            text_line.set("WIDTH", str(max(line_xs) - text_line_hpos))

            # Map every transcription character (spaces included) to its
            # index in the line's character set. The index one past the
            # last character is passed to the aligner and is the value
            # that marks "no character" frames in its output.
            blank = len(line.characters)
            char_to_num = {
                char: index for index, char in enumerate(line.characters)
            }
            label = [char_to_num[char] for char in line.transcription]

            logits = line.get_dense_logits()
            output = softmax(logits, axis=1)
            aligned = force_align(-np.log(output), label, blank)
            # Return value is not used; presumably refines `aligned` in
            # place -- confirm against narrow_label.
            narrow_label(aligned, logits, blank)

            crop_engine = EngineLineCropper(poly=2)
            # line_coords is indexed below as [:, column, 0] for x and
            # [:, column, 1] for y page coordinates.
            line_coords = crop_engine.get_crop_inputs(
                line.baseline, line.heights, 16)

            words = line.transcription.split()
            last_word_index = len(words) - 1

            # Number of non-blank frames that precede the current word.
            global_letter_counter = 0
            for w, word in enumerate(words):
                local_letter_counter = 0
                word_length = len(word)
                string_width = 0
                string_hpos = 0
                end_of_space = 0
                word_end_found = False
                runs_to_line_end = True

                for frame, symbol in enumerate(aligned):
                    if symbol == blank:
                        continue
                    if local_letter_counter > global_letter_counter:
                        if word_end_found:
                            # First character after the separator: the
                            # space ends here and the next word starts.
                            end_of_space = frame_step * frame
                            global_letter_counter = local_letter_counter
                            runs_to_line_end = False
                            break
                        if (local_letter_counter - global_letter_counter
                                == word_length):
                            # Character right after the word's last
                            # letter: the word ends here.
                            string_width = frame_step * frame - string_hpos
                            word_end_found = True
                    elif local_letter_counter - global_letter_counter == 0:
                        # First letter of the current word.
                        string_hpos = frame_step * frame
                    local_letter_counter += 1

                if runs_to_line_end:
                    # No following word was found, so the word extends to
                    # the end of the alignment.
                    string_width = frame_step * len(aligned) - string_hpos

                # Scale from alignment positions to columns of line_coords.
                lm_const = np.shape(line_coords)[1] / (
                    len(aligned) * frame_step)

                string_el = ET.SubElement(text_line, "String")
                string_el.set("CONTENT", word)
                string_hpos -= 1

                # string_hpos is -1 when the word starts at frame 0. A
                # negative start would make NumPy slice from the END of
                # the line (empty or wrong box), so clamp it at 0.
                word_start = max(int(string_hpos * lm_const), 0)
                word_stop = word_start + int(string_width * lm_const)
                all_x = line_coords[:, word_start:word_stop, 0]
                all_y = line_coords[:, word_start:word_stop, 1]

                string_el.set("HEIGHT",
                              str(int(np.max(all_y) - np.min(all_y))))
                string_el.set("WIDTH",
                              str(int(np.max(all_x) - np.min(all_x))))
                string_el.set("VPOS", str(int(np.min(all_y))))
                string_el.set("HPOS", str(int(np.min(all_x))))

                if w != last_word_index:
                    space = ET.SubElement(text_line, "SP")
                    word_end = string_hpos + string_width
                    space_start = int(word_end * lm_const)
                    space_stop = space_start + int(
                        (end_of_space - word_end) * lm_const)
                    all_x = line_coords[:, space_start:space_stop, 0]
                    all_y = line_coords[:, space_start:space_stop, 1]
                    space.set("WIDTH",
                              str(int(np.max(all_x) - np.min(all_x))))
                    space.set("VPOS", str(int(np.min(all_y))))
                    space.set("HPOS", str(int(np.min(all_x))))

    # ---- Margins and print space --------------------------------------
    print_space_bottom = print_space_vpos + print_space_height
    print_space_right = print_space_hpos + print_space_width

    top_margin.set("HEIGHT", "{}".format(print_space_vpos))
    top_margin.set("WIDTH", "{}".format(self.page_size[1]))
    top_margin.set("VPOS", "0")
    top_margin.set("HPOS", "0")

    left_margin.set("HEIGHT", "{}".format(self.page_size[0]))
    left_margin.set("WIDTH", "{}".format(print_space_hpos))
    left_margin.set("VPOS", "0")
    left_margin.set("HPOS", "0")

    right_margin.set("HEIGHT", "{}".format(self.page_size[0]))
    right_margin.set(
        "WIDTH", "{}".format(self.page_size[1] - print_space_right))
    right_margin.set("VPOS", "0")
    right_margin.set("HPOS", "{}".format(print_space_right))

    bottom_margin.set(
        "HEIGHT", "{}".format(self.page_size[0] - print_space_bottom))
    bottom_margin.set("WIDTH", "{}".format(self.page_size[1]))
    bottom_margin.set("VPOS", "{}".format(print_space_bottom))
    bottom_margin.set("HPOS", "0")

    print_space.set("HEIGHT", str(print_space_height))
    print_space.set("WIDTH", str(print_space_width))
    print_space.set("VPOS", str(print_space_vpos))
    print_space.set("HPOS", str(print_space_hpos))

    return ET.tostring(root, pretty_print=True,
                       encoding="utf-8").decode("utf-8")