def split(self, output_dir, **kwargs):
    """Cut the table image into one TIFF file per cell.

    Every box yielded by ``self.get_boxes(**kwargs)`` is cropped out of
    ``self.im``, passed through ``self.transform`` and ``threshold``, and
    saved in ``output_dir``. After each file is written, ``split_image``
    is sent with the source and cell filenames, their md5 sums, and the
    cell's grid position and pixel bounds.

    :param output_dir: directory that receives the per-cell images.
    :param kwargs: forwarded unchanged to ``self.get_boxes``.
    """
    for col, row, left, upper, right, lower in self.get_boxes(**kwargs):
        cell = self.im.crop((left, upper, right, lower))
        cell = self.transform(cell, col, row)
        # Increase contrast. Whiteish regions become absolute white
        cell = threshold(cell, 200)

        cell_name = cell_basename(self.filename, col, row) + ".tiff"
        cell_path = os.path.join(output_dir, cell_name)
        cell.save(cell_path)

        # The checksum is taken from the file as written to disk.
        payload = {
            "input_filename": self.filename,
            "input_md5": self.md5,
            "filename": cell_name,
            "md5": md5sum(cell_path),
            "column": col,
            "row": row,
            "left": left,
            "upper": upper,
            "right": right,
            "lower": lower,
        }
        split_image.send(self, **payload)
def ocr_image(self, input_filename, num_cols, num_rows, split_dir, ocr_dir, user):
    """Run tesseract on each existing cell image and publish its text.

    For every (col, row) pair in the ``num_cols`` x ``num_rows`` grid, the
    cell image ``<cell_basename>.tiff`` is looked up in ``split_dir``.
    Cells with no image file are skipped. For each image found, tesseract
    is invoked with ``<ocr_dir>/<cell_basename>`` as its output base, the
    resulting ``.txt`` file is read back, and ``image_text`` is sent with
    the image's filename, md5 sum, the recognised text and ``user``.

    :param input_filename: name of the original table image; used to
        derive each cell's basename.
    :param num_cols: number of grid columns to scan.
    :param num_rows: number of grid rows to scan.
    :param split_dir: directory holding the per-cell ``.tiff`` images.
    :param ocr_dir: directory that receives tesseract's ``.txt`` output.
    :param user: passed through unchanged in the ``image_text`` payload.
    :raises subprocess.CalledProcessError: if tesseract exits non-zero.
    """
    for col in range(num_cols):
        for row in range(num_rows):
            basename = cell_basename(input_filename, col, row)
            img_filename = basename + '.tiff'
            path = os.path.join(split_dir, img_filename)
            if not os.path.exists(path):
                # No image was produced for this grid position.
                continue

            outbase = os.path.join(ocr_dir, basename)
            img_md5 = md5sum(path)

            # Pass argv as a list: formatting a command string and
            # re-splitting it with shlex breaks on paths that contain
            # whitespace or quote characters.
            subprocess.check_call(["tesseract", path, outbase])

            # The recognised text is read from "<outbase>.txt".
            output_filename = outbase + '.txt'
            with open(output_filename, 'r') as f:
                text = f.read()

            image_text.send(self,
                            source_filename=img_filename,
                            source_md5=img_md5,
                            method='ocr',
                            text=text,
                            user=user)