Exemplo n.º 1
0
 def report_extracted(self, input_filename, output_dir, project):
     """Announce every ``.pgm`` file found for an input file.

     Globs for ``output_prefix(input_filename, output_dir) + "*.pgm"`` and
     calls ``extract_image.send`` once per matching file.

     Each call carries the base name and MD5 digest of the input file, the
     base name and MD5 digest of the matched file, the page number that
     ``self.page_num`` returns for the matched file's base name, and
     ``project`` unchanged.
     """
     source_name = os.path.basename(input_filename)
     source_md5 = md5sum(input_filename)
     pattern = output_prefix(input_filename, output_dir) + "*.pgm"
     for match in glob.glob(pattern):
         page_name = os.path.basename(match)
         extract_image.send(
             self,
             input_filename=source_name,
             input_md5=source_md5,
             filename=page_name,
             md5=md5sum(match),
             page=self.page_num(page_name),
             project=project
         )
Exemplo n.º 2
0
 def split(self, output_dir, **kwargs):
     """Split an image that contains a table into per-cell images.

     ``kwargs`` are forwarded to ``self.get_boxes``. Each box it yields is
     a ``(col, row, left, upper, right, lower)`` tuple. The matching area
     is cropped from ``self.im``, run through ``self.transform`` and
     ``threshold``, and saved in ``output_dir`` under the cell's base name
     plus ``.tiff``. ``split_image.send`` is then called with the file
     names, MD5 digests, grid position and box coordinates.
     """
     for col, row, left, upper, right, lower in self.get_boxes(**kwargs):
         cell = self.im.crop((left, upper, right, lower))
         cell = self.transform(cell, col, row)
         # Increase contrast: whitish regions become pure white.
         cell = threshold(cell, 200)

         cell_name = cell_basename(self.filename, col, row) + ".tiff"
         cell_path = os.path.join(output_dir, cell_name)
         cell.save(cell_path)

         # Digest is taken from the file on disk, after saving.
         cell_md5 = md5sum(cell_path)
         split_image.send(
             self,
             input_filename=self.filename,
             input_md5=self.md5,
             filename=cell_name,
             md5=cell_md5,
             column=col,
             row=row,
             left=left,
             upper=upper,
             right=right,
             lower=lower
         )
Exemplo n.º 3
0
 def ocr_image(self, input_filename, num_cols, num_rows, split_dir, ocr_dir,
         user):
     """Run tesseract on each existing cell image and publish its text.

     For every ``(col, row)`` pair in the ``num_cols`` x ``num_rows`` grid,
     the cell image ``<cell_basename>.tiff`` is looked up in ``split_dir``.
     Cells whose image file does not exist are skipped. For the others,
     ``tesseract <image> <ocr_dir>/<cell_basename>`` is executed, the
     resulting ``.txt`` file is read, and ``image_text.send`` is called
     with the image name, its MD5 digest, ``method='ocr'``, the text and
     ``user``.

     Raises:
         subprocess.CalledProcessError: if tesseract exits with a non-zero
             status.
     """
     for col in range(num_cols):
         for row in range(num_rows):
             basename = cell_basename(input_filename, col, row)
             img_filename = basename + '.tiff'
             path = os.path.join(split_dir, img_filename)
             outbase = os.path.join(ocr_dir, basename)
             if not os.path.exists(path):
                 continue

             img_md5 = md5sum(path)
             # Pass the arguments as a list instead of formatting a command
             # string and re-splitting it: paths containing spaces, quotes
             # or backslashes would otherwise be broken into the wrong
             # arguments.
             subprocess.check_call(["tesseract", path, outbase])

             output_filename = outbase + '.txt'
             with open(output_filename, 'r') as f:
                 text = f.read()
             # The file is closed before receivers are notified.
             image_text.send(self, source_filename=img_filename,
                 source_md5=img_md5, method='ocr',
                 text=text, user=user)
Exemplo n.º 4
0
 def __init__(self, filename):
     """Record the file's MD5 digest and base name, and open the image.

     ``filename`` is the path handed to ``md5sum`` and ``Image.open``; only
     its base name is kept in ``self.filename``.
     """
     checksum = md5sum(filename)
     name_only = os.path.basename(filename)
     self.md5 = checksum
     self.filename = name_only
     # The opened image is kept on the private ``_im`` attribute.
     self._im = Image.open(filename)