def tesseract_and_clean(x):
    """OCR ``x`` and tidy the beginning of the recognised text.

    The raw ``ocr.tesseract`` result loses one leading space, if present.
    When the remaining text is longer than 10 characters and its second
    character is a space, the first two characters (a stray letter plus the
    space) are dropped as well.  The result is then passed through
    ``ocr.clean_ocr_text`` with ``keep_newlines=True`` and returned.
    """
    recognised = ocr.tesseract(x)

    # Drop a single leading blank.
    has_leading_blank = len(recognised) > 0 and recognised.startswith(" ")
    if has_leading_blank:
        recognised = recognised[1:]

    # On longer text, drop a lone first character that is followed by a space.
    if len(recognised) > 10:
        if recognised[1] == " ":
            recognised = recognised[2:]

    return ocr.clean_ocr_text(recognised, keep_newlines=True)
def test_tesseract():
    """Check ``ocr.tesseract`` on the two sample images in every input form.

    Each image is fed as a file path, a cv2 image, a PIL image and a base64
    string (via the ``ocr.utils`` converters).  The English text sample is
    read with an empty ``config``; the numbers sample uses the default.
    """
    image_dir = os.path.join(os.path.dirname(__file__), 'images')

    def as_path(path):
        return path

    eng = os.path.join(image_dir, 'tesseract_eng_text.png')
    expected = 'A Python Approach to Character\nRecognition'
    for convert in (as_path, ocr.utils.to_cv2, ocr.utils.to_pil,
                    ocr.utils.to_base64):
        assert ocr.tesseract(convert(eng), config='') == expected

    numbers = os.path.join(image_dir, 'tesseract_numbers.jpg')
    expected = '619121'
    for convert in (as_path, ocr.utils.to_cv2, ocr.utils.to_pil,
                    ocr.utils.to_base64):
        assert ocr.tesseract(convert(numbers)) == expected
def get_text_starting_at(self, start_x, image):
    """Return the OCR text found in ``image`` to the right of ``start_x``.

    Args:
        start_x: x offset at which the cropped strip begins.
        image: object exposing ``size`` as ``(width, height)`` and a
            ``crop(box)`` method (presumably a PIL image).

    Returns:
        ``""`` when the image has no width or height, or when ``start_x``
        lies at or beyond the right edge.  Otherwise the result of
        ``ocr.tesseract`` on the strip from ``start_x`` to the right edge,
        with surrounding whitespace stripped when the result supports
        ``strip()``.
    """
    width, height = image.size[0], image.size[1]
    if width < 1 or height < 1 or start_x >= width:
        return ""

    cropimage = image.crop((start_x, 0, width, height))
    text = ocr.tesseract(cropimage)
    try:
        text = text.strip()
    except AttributeError:
        # A non-string OCR result (e.g. None) is handed back untouched; any
        # other failure is a real error and is no longer swallowed.
        pass
    return text
def generate_transition_list_from_zones(self,image,regionlist,column_bounds,left,middle):
    """Walk the zones of one column and collect the contests found in it.

    ``left`` is a sequence of ``(y, code)`` entries.  Each zone runs from its
    own ``y`` to the ``y`` of the following entry; the last zone is paired
    with the sentinel ``[0, 'X']``, so its end offset is 0.

    * Zones coded ``'B'`` or ``'G'`` (presumably solid black / gray
      halftone) that are taller than ``const.dpi/4`` are cropped, passed
      through ``Image.eval(crop, elim_halftone)`` and OCR'd.  The raw and
      cleaned text is only written to the debug log.
    * Zones coded ``'W'`` are handed to ``self.get_title_and_votes_from``
      together with ``regionlist``.

    Args:
        image: page image offering ``crop(box)``.
        regionlist: list of contest objects; each must expose ``choices``.
        column_bounds: ``(left_x, right_x)`` of the column being processed.
        left: sequence of ``(y, code)`` zone entries.
        middle: accepted for interface compatibility; not used here.

    Returns:
        A new list holding only the entries of ``regionlist`` that have at
        least one choice.
    """
    zone_count = len(left)
    for n, zone in enumerate(left):
        this_y, code = zone[0], zone[1]
        # The final zone has no successor; pair it with a dummy entry.
        next_zone = left[n + 1] if n + 1 < zone_count else [0, 'X']
        next_y = next_zone[0]

        if code == 'B' or code == 'G':
            self.log.debug("%s zone at %d to %d %s" % (code,
                                                       this_y,
                                                       next_y,
                                                       next_zone))
            # if it's a legitimate gray zone and the next zone is white,
            # that white zone is a voting area (or empty)
            if (next_y - this_y) > (const.dpi/4):
                crop = image.crop((column_bounds[0],
                                   this_y,
                                   column_bounds[1],
                                   next_y))
                crop = Image.eval(crop,elim_halftone)
                cjurisdiction = ocr.tesseract(crop)
                cjurisdiction = cjurisdiction.replace("\n","//").strip()
                self.log.debug("Jurisdiction %s" % (cjurisdiction,))
                cjurisdiction = ocr.clean_ocr_text(cjurisdiction)
                self.log.debug("Cleaned Jurisdiction %s" % (cjurisdiction,))

        if code == 'W':
            self.get_title_and_votes_from(image,
                                          regionlist,
                                          (column_bounds[0],
                                           this_y,
                                           column_bounds[1],
                                           next_y))
            self.log.debug("White zone at %d to %d %s" % (this_y,
                                                          next_y,
                                                          next_zone))

    # filter regionlist to contain only contests with choices
    regionlist = [x for x in regionlist if len(x.choices)>0]
    return regionlist
def upload():
    """Serve the upload page and ingest one uploaded document.

    Every request first empties the upload directory ``dir`` (only the
    ``.keep`` placeholder is left in place).

    * GET: render ``upload.html``.
    * POST: store the uploaded file in ``dir`` and run ``ocr.tesseract`` on
      it.  The OCR step returns a list of fields, read here as
      ``[author, number, code, address, date, title, summary]``:
        - an empty list flashes a warning and redirects back to the form;
        - if a ``Doc`` with the same number/code/title already exists, the
          form is re-rendered pointing at that document;
        - otherwise a new ``Doc`` is saved with the file contents, the
          module-level ``authors`` / ``addresses`` / ``titles`` lists are
          extended with unseen values, and the client is redirected to the
          ``show`` page of the new document.
      An upload without a filename yields an empty 204 response.
    """
    for f in os.listdir(dir):
        if f != '.keep':
            os.remove(os.path.join(dir, f))

    if request.method == "GET":
        return render_template('upload.html')
    elif request.method == "POST":
        uploaded_file = request.files['file']
        # The filename comes from the client: keep only its final path
        # component so it cannot point outside the upload directory.
        filename = os.path.basename(uploaded_file.filename or '')
        if filename == '':
            return ('', 204)

        # One path for saving, OCR and reading back.  The original mixed
        # os.path.join(dir, filename) with dir + filename, which disagree
        # when ``dir`` has no trailing separator.
        path = os.path.join(dir, filename)
        uploaded_file.save(path)
        values = ocr.tesseract(path)
        if values == []:
            flash(
                """Tài liệu này chưa tuân thủ quy định biểu mẫu văn bản pháp luật! Xin hãy chọn tài liệu khác!""",
                "warning")
            return redirect(url_for('upload'))

        existed_doc = Doc.objects(
            number=values[1],
            code=values[2],
            title=values[5],
        )
        if existed_doc:
            flash("Tài liệu đã tồn tại trên hệ thống!", "info")
            return render_template('upload.html',
                                   doc_id=existed_doc[0].id)

        new_doc = Doc(
            author=values[0],
            number=values[1],
            code=values[2],
            address=values[3],
            date=values[4],
            title=values[5],
            summary=values[6],
        )
        with open(path, 'rb') as stored:
            new_doc.file_upload = Binary(stored.read())
        new_doc.save()

        # Extend the in-memory lookup lists with values not seen before.
        if values[0] not in authors:
            authors.append(values[0])
        if values[3] not in addresses and values[3] != 'Không xác định':
            addresses.append(values[3])
        if values[5] not in titles:
            titles.append(values[5])
        return redirect(url_for('show', doc_id=new_doc.id))
def get_party_id(self):
    """OCR the party zone of this ballot side and record the result.

    The zone's upper-left corner is the upper-left landmark shifted by the
    party-zone offsets from ``const`` (inches scaled by ``self.dpi``).  Its
    width and height are taken from the *precinct* zone constants.  The
    recognised text is stored in ``BallotSide.party_cache`` under
    ``self.layout_id`` and returned.

    NOTE(review): this method only writes to the cache and never reads it.
    Any cache lookup has to happen in the caller -- confirm.
    """
    def to_pixels(inches):
        return int(inches * self.dpi)

    left = int(self.landmarks.ulc.x
               + to_pixels(const.party_zone_horiz_offset_inches))
    top = int(self.landmarks.ulc.y
              + to_pixels(const.party_zone_vert_offset_inches))
    box = (left,
           top,
           left + to_pixels(const.precinct_zone_width_inches),
           top + to_pixels(const.precinct_zone_height_inches))

    party = tesseract(self.image.crop(box))
    BallotSide.party_cache[self.layout_id] = party
    return party
def get_precinct_id(self, page):
    """Read the precinct id printed next to the fourth landmark of ``page``.

    The strip starts ``adj(1/2.)`` to the right of ``page.landmarks[3]``,
    extends to ``adj(2)`` right of it, and spans ``adj(1./6.)`` above the
    landmark.  It is cropped, saved to ``/tmp/precinct.jpg``, OCR'd and
    cleaned, with newlines rendered as ``//``.  Returns ``"?"`` when an
    ``IndexError`` is raised along the way (e.g. no fourth landmark).
    """
    sixth = adj(1. / 6.)
    half = adj(1 / 2.)
    try:
        anchor = page.landmarks[3]
        box = (anchor[0] + half,
               anchor[1] - sixth,
               anchor[0] + adj(2),
               anchor[1])
        strip = page.image.crop(box)
        strip.save("/tmp/precinct.jpg")
        cleaned = ocr.clean_ocr_text(ocr.tesseract(strip))
        return cleaned.strip().replace("\n", "//").strip()
    except IndexError:
        return "?"
def get_precinct_id(self, page):
    """OCR the precinct id located beside ``page.landmarks[3]``.

    A strip from ``adj(1/2.)`` to ``adj(2)`` right of the landmark and
    ``adj(1./6.)`` tall (ending at the landmark's y) is cropped, written to
    ``/tmp/precinct.jpg``, recognised and cleaned.  Newlines in the result
    become ``//``.  If an ``IndexError`` occurs the id is reported as
    ``"?"``.
    """
    sixth = adj(1./6.)
    half = adj(1/2.)
    precincttext = "?"
    try:
        left = page.landmarks[3][0] + half
        top = page.landmarks[3][1] - sixth
        right = page.landmarks[3][0] + adj(2)
        bottom = page.landmarks[3][1]
        precinct = page.image.crop((left, top, right, bottom))
        precinct.save("/tmp/precinct.jpg")
        recognised = ocr.clean_ocr_text(ocr.tesseract(precinct)).strip()
        precincttext = recognised.replace("\n","//").strip()
    except IndexError:
        precincttext = "?"
    return precincttext
def hart_build_contests(image, pot_hlines, vboxes, column_start, column_width, dpi=300,extensions=None):
    """Combine horizontal lines and vote boxes into contests with choices.

    For each vote box the last entry of ``pot_hlines`` lying above the box
    is looked up.  Whenever that line differs from the one used for the
    previous contest, a contest description zone ``(line_y, box_y)`` is
    opened.  Every zone is OCR'd into a ``Ballot.Contest``.  Each vote box
    is then attached, as a ``Ballot.Choice`` carrying the first
    ``/``-separated piece of the text right of the box, to the lowest
    contest zone that starts above it.  The result is logged at info level
    and returned.  ``extensions`` is accepted but not used.
    """
    regionlist = []
    zones = []
    last_contest = 0
    first_above = 0
    for vbox in vboxes:
        box_top = vbox[1]
        above = [hline for hline in pot_hlines if hline < box_top]
        if above:
            first_above = above[-1]
        if first_above != last_contest:
            last_contest = first_above
            zones.append((first_above, box_top))

    # One Contest per description zone, described by its OCR text.
    for zone_top, zone_bottom in zones:
        strip = image.crop((column_start,
                            zone_top,
                            column_start + column_width,
                            zone_bottom))
        description = ocr.clean_ocr_text(ocr.tesseract(strip))
        regionlist.append(Ballot.Contest(column_start,
                                         zone_top,
                                         column_start + column_width,
                                         zone_bottom,
                                         0,
                                         description))

    zones.reverse()
    for vbox in vboxes:
        # Zones are now bottom-up: the first one starting above the box
        # is the contest the box belongs to.
        for zone_top, _zone_bottom in zones:
            if not zone_top < vbox[1]:
                continue
            # Text area to the right of the vote box.
            strip = image.crop((vbox[0] + dpi/3 + dpi/30,
                                vbox[1] - dpi/100,
                                vbox[0]+column_width-(dpi/2),
                                vbox[1]+(dpi/2)))
            recognised = ocr.tesseract(strip)
            # take only first line of choice
            choice_text = ocr.clean_ocr_text(recognised).split("/")[0]
            owner = next((c for c in regionlist
                          if c.y == zone_top and c.x == column_start),
                         None)
            if owner is not None:
                owner.append(Ballot.Choice(vbox[0], vbox[1], choice_text))
            break

    logger = logging.getLogger('')
    for contest in regionlist:
        logger.info("%d %d %s" % (contest.x, contest.y, contest.description))
        for choice in contest.choices:
            logger.info(" %d %d %s" % (choice.x, choice.y, choice.description))
    return regionlist
def get_contests_and_votes_from(self,image,regionlist,croplist):
    """Split a mixed description/vote area into contests and choices.

    Contest descriptions are not indented away from the vote ovals, so the
    cropped area is cut into dark zones (text lines separated by white) and
    each zone is classified by two hints:

    * the gap between the oval and the choice text is almost white
      (mean of the first band above 244), and
    * the text right of the gap is short (fewer than 6 characters).

    Neither hint: the line is contest description and is accumulated.
    Both hints: the line is a vote opportunity; a ``Ballot.Contest`` is
    appended to ``regionlist`` first if none is open, then a
    ``Ballot.Choice`` is attached to the last contest.  One hint only: the
    text is accumulated if a contest is open, otherwise the ambiguity is
    logged.

    Returns the list of dark zones; contests are added to ``regionlist`` in
    place.
    """
    ov_off = adj(const.vote_target_horiz_offset_inches)
    ov_end = ov_off + adj(const.target_width_inches)
    txt_off = adj(const.candidate_text_horiz_offset_inches)
    pending_description = ""
    crop = image.crop(croplist)
    dark_zones = self.get_dark_zones(crop,dark_intensity=160)
    contest_open = False
    for dz in dark_zones:
        top, bottom = dz[0], dz[1]
        # Whole line, indented 1/10" on both sides to avoid the edges.
        line_strip = crop.crop((const.dpi/10,
                                top,
                                crop.size[0]-(const.dpi/10),
                                bottom))
        # Channel between the end of the oval and the start of the text.
        gap_stat = ImageStat.Stat(crop.crop((ov_end, top, txt_off, bottom)))
        # First inch of the choice text.
        label_strip = crop.crop((txt_off, top, txt_off + const.dpi, bottom))
        zone1text = ocr.clean_ocr_text(ocr.tesseract(line_strip))
        zone3text = ocr.clean_ocr_text(ocr.tesseract(label_strip))

        hints = (gap_stat.mean[0]>244, len(zone3text)<6)
        if not any(hints):
            contest_open = False
            pending_description += zone1text.replace("\n","/")
        elif all(hints):
            # create contest if none created, then add the vote op to it
            if not contest_open:
                contest_open = True
                self.log.debug("Creating contest %s" % (pending_description,))
                regionlist.append(Ballot.Contest(croplist[0],
                                                 croplist[1]+top,
                                                 croplist[2],
                                                 croplist[1]+bottom,
                                                 0,
                                                 pending_description))
                pending_description = ""
            self.log.debug("Adding choice %s" % (zone3text,))
            regionlist[-1].append(
                Ballot.Choice(croplist[0]+ov_off,
                              croplist[1]+top,
                              zone3text))
        elif contest_open:
            pending_description += zone1text.replace("\n","//")
        else:
            self.log.debug("Problem determining whether contest or choice")
            self.log.debug("Gap mean values %s" % (gap_stat.mean,))
            self.log.debug("Zone3 text %s" % (zone3text,))
            self.log.debug("Contest string: %s" % (pending_description,))
    return dark_zones
def get_title_and_votes_from(self,
                             image,
                             regionlist,
                             croplist,
                             last_title="NO TITLE"):
    """Extract one contest (title plus choices) from a white zone.

    The area given by ``croplist`` holds a title block at the top followed
    by vote lines with an oval in the oval column.  The crop is cut into
    dark zones and each zone is tagged:

    * ``"T"`` title line (before any oval was seen, no oval pattern),
    * ``"V"`` vote line (tall enough, ink starts at the oval column),
    * ``"W"`` short line starting at the text column (write-in material),
    * ``"-"`` anything else.

    Title lines are OCR'd and joined with ``/``.  The first vote line turns
    them into a ``Ballot.Contest`` (falling back to ``last_title`` when the
    title is shorter than 4 characters, truncated to 80) which is appended
    to ``regionlist``.  Every vote line adds a ``Ballot.Choice`` whose y is
    moved down to where the oval really starts.  A ``"W"`` line followed by
    another ``"W"``/``"-"`` line renames the latest choice to ``"Writein"``.

    Args:
        image: page image offering ``crop(box)``.
        regionlist: list that receives the new contest (modified in place).
        croplist: ``(x0, y0, x1, y1)`` of the zone on the page.
        last_title: title used when no usable title text was recognised.

    Returns:
        ``regionlist``, or ``[]`` when ``croplist`` has a zero right or
        bottom coordinate.
    """
    # Nothing to analyse in an empty rectangle; leave before any image work.
    if croplist[2] == 0 or croplist[3] == 0:
        return []

    ov_off = adj(const.vote_target_horiz_offset_inches)
    ov_ht = adj(const.target_height_inches)
    ov_wd = adj(const.target_width_inches)
    ov_end = ov_off + ov_wd
    txt_off = adj(const.candidate_text_horiz_offset_inches)

    def read_zone(zone):
        """OCR ``zone``; return cleaned text with newlines shown as '//'."""
        zonetext = ocr.clean_ocr_text(ocr.tesseract(zone))
        return zonetext.strip().replace("\n", "//").strip()

    crop = image.crop(croplist)
    dark_zones = self.get_dark_zones(crop)

    # Pass 1: tag every dark zone by where its ink starts and how tall it is.
    encountered_oval = False
    dzstyle = []
    for dz in dark_zones:
        # crop each dark strip, losing the area to the left of the possible
        # vote target and an equivalent area on the right
        dzcrop = crop.crop((ov_off, dz[0], crop.size[0] - ov_off, dz[1]))
        firstx = dzcrop.size[0]
        for y in range(dzcrop.size[1]):
            # Only columns left of the best hit so far can improve it.
            for x in range(firstx):
                if dzcrop.getpixel((x, y))[0] < 192:
                    firstx = x
                    break
        # Titles come first; ovals start at a defined offset, have a
        # minimum height and, if empty, match a dark/light pattern.
        tall_enough = (dz[1] - dz[0] >= int(ov_ht * .8))
        ov_pat = oval_pattern(dzcrop, ov_ht, ov_wd, txt_off - ov_off)
        if not encountered_oval and not ov_pat:
            dzstyle.append("T")
        elif tall_enough and firstx <= adj(0.02):
            dzstyle.append("V")
            encountered_oval = True
        elif firstx >= (txt_off - ov_off - adj(0.02)) and not tall_enough:
            dzstyle.append("W")
        else:
            dzstyle.append("-")

    # Pass 2: turn the tagged zones into a contest and its choices.
    contest_instance = None
    choice = None
    title_array = []
    contest_created = False
    for index, style in enumerate(dzstyle):
        zone_top, zone_bottom = dark_zones[index][0], dark_zones[index][1]
        if style == "T":
            titlezone = crop.crop((adj(0.1),
                                   zone_top,
                                   crop.size[0] - adj(0.1),
                                   zone_bottom))
            # NOTE(review): title_array is None here if a "W" zone came
            # before this title line -- confirm that cannot happen.
            title_array.append(read_zone(titlezone))
        elif style == "V":
            if title_array is not None:
                zonetext = "/".join(title_array)
                title_array = None
                if len(zonetext) < 4:
                    zonetext = last_title
                contest_instance = Ballot.Contest(croplist[0],
                                                  croplist[1],
                                                  croplist[2],
                                                  croplist[3],
                                                  0,
                                                  zonetext[:80])
                contest_created = True
                regionlist.append(contest_instance)
            if not contest_created:
                # Report and skip; the debugger breakpoint that used to
                # follow would hang unattended runs.
                print("WARNING: Choice but no contest.")
                continue
            choicezone = crop.crop((txt_off,
                                    zone_top,
                                    crop.size[0] - adj(0.1),
                                    zone_bottom))
            zonetext = read_zone(choicezone)

            # find the y at which the actual oval begins, which may be
            # lower than the dark zone start: look up to 0.2 inches below
            # for a run of rows darkening the oval region
            choice_y = zone_top
            contig = 0
            for adj_y in range(adj(0.2)):
                ovalcrop = crop.crop((ov_off,
                                      choice_y + adj_y,
                                      ov_end,
                                      choice_y + adj_y + 1))
                ovalstat = ImageStat.Stat(ovalcrop)
                if ovalstat.extrema[0][0] < 240:
                    contig += 1
                    if contig > adj(0.03):
                        choice_y += (adj_y - adj(0.03))
                        break
                else:
                    contig = 0

            choice = Ballot.Choice(croplist[0] + ov_off,
                                   croplist[1] + choice_y,
                                   zonetext)
            contest_instance.append(choice)
        elif (style == "W" and len(dzstyle) > (index + 1)
              and dzstyle[index + 1] in "W-"):
            if title_array is not None:
                title_array = None
            try:
                choice.description = "Writein"
            except AttributeError:
                # No choice has been created yet (choice is still None).
                pass
    return regionlist
def get_contests_and_votes_from(self, image, regionlist, croplist):
    """Separate contest descriptions from vote lines in a mixed area.

    The area ``croplist`` is cropped and divided into dark zones.  Because
    descriptions share the left margin with the ovals, each zone is judged
    by two signals: whether the channel between oval and choice text is
    nearly white (first-band mean above 244) and whether the text right of
    that channel is shorter than 6 characters.

    * no signal: description text, appended to the running contest string;
    * both signals: a vote line -- a ``Ballot.Contest`` is opened on
      ``regionlist`` if needed and a ``Ballot.Choice`` is added to the last
      contest;
    * exactly one signal: the text is appended if a contest is open,
      otherwise the undecided case is written to the debug log.

    ``regionlist`` is modified in place; the dark zones are returned.
    """
    ov_off = adj(const.vote_target_horiz_offset_inches)
    ov_end = ov_off + adj(const.target_width_inches)
    txt_off = adj(const.candidate_text_horiz_offset_inches)
    contest_string = ""
    crop = image.crop(croplist)
    dark_zones = self.get_dark_zones(crop, dark_intensity=160)
    contest_created = False
    for dz in dark_zones:
        # Full line, pulled in 1/10" from either edge.
        whole_line = crop.crop((const.dpi / 10, dz[0],
                                crop.size[0] - (const.dpi / 10), dz[1]))
        gap = crop.crop((ov_end, dz[0], txt_off, dz[1]))
        gap_stat = ImageStat.Stat(gap)
        label = crop.crop((txt_off, dz[0], txt_off + const.dpi, dz[1]))

        line_text = ocr.clean_ocr_text(ocr.tesseract(whole_line))
        label_text = ocr.clean_ocr_text(ocr.tesseract(label))

        signals = 0
        if gap_stat.mean[0] > 244:
            signals += 1
        if len(label_text) < 6:
            signals += 1

        if signals == 2:
            # create contest if none created, then add voteop to contest
            if not contest_created:
                contest_created = True
                self.log.debug("Creating contest %s" % (contest_string, ))
                new_contest = Ballot.Contest(croplist[0],
                                             croplist[1] + dz[0],
                                             croplist[2],
                                             croplist[1] + dz[1],
                                             0,
                                             contest_string)
                regionlist.append(new_contest)
                contest_string = ""
            self.log.debug("Adding choice %s" % (label_text, ))
            new_choice = Ballot.Choice(croplist[0] + ov_off,
                                       croplist[1] + dz[0],
                                       label_text)
            regionlist[-1].append(new_choice)
        elif signals == 0:
            contest_created = False
            contest_string += line_text.replace("\n", "/")
        else:
            if not contest_created:
                self.log.debug(
                    "Problem determining whether contest or choice")
                self.log.debug("Gap mean values %s" % (gap_stat.mean, ))
                self.log.debug("Zone3 text %s" % (label_text, ))
                self.log.debug("Contest string: %s" % (contest_string, ))
            else:
                contest_string += line_text.replace("\n", "//")
    return dark_zones
def target_and_text(targetlist, textlist, textimage, debug=False):
    """Associate each zone in textlist with the best zone in targetlist.

    Best is defined as follows:
    * text goes to the target that begins at or above the end of the text
      plus 1/8 the text's height, and is closest.

    Both lists hold entries whose first two items are read as begin and end
    offsets.  Targets are used as dictionary keys, so they must be hashable.

    Side effects visible in this function:
    * ``targetlist`` and ``textlist`` are sorted in place;
    * every matched text row gets its OCR text appended;
    * each OCR'd strip is saved as ``/tmp/cropped_image<N>.jpg``;
    * progress is printed to stdout, and with ``debug`` set the debugger
      is entered.

    NOTE(review): the visible body builds ``target_text`` and
    ``target_text_code`` but contains no ``return`` statement -- confirm
    against the full source whether a trailing return is missing here.
    """
    # target -> list of text rows assigned to it
    target_text = {}
    # target -> list of code-string lists, parallel to target_text
    target_text_code = {}
    try:
        # descending by begin offset, so the first acceptable target met in
        # the scan below is the closest one
        targetlist.sort(key=lambda a: -a[0])
    except KeyError:
        print "KeyError trying to sort targetlist"
        #pdb.set_trace()
        pass
    textlist.sort()
    try:
        # rows must be lists so the OCR text can be appended to them
        textlist = map(lambda a: list(a), textlist)
    except KeyError:
        print "KeyError trying to map textlist"
        #pdb.set_trace()
        pass
    if debug:
        print "Debug in target_and_text"
        print targetlist
        pdb.set_trace()
    # counter used to name the debug images written to /tmp
    img_num = 0
    for text in textlist:
        text_begin, text_end, eighth_text_height = text[0], text[1], (
            text[1] - text[0]) / 8
        # skip text not tall enough (less than 1/20 of const.dpi)
        # pdb.set_trace()
        if text_end < (text_begin + (const.dpi / 20)):
            continue
        print "Checking targets against text beginnning %d ending %d" % (
            text_begin, text_end)
        for target in targetlist:
            code_strings = []
            #print "Target",target
            #pdb.set_trace()
            # skip targets not tall enough
            if target[1] <= (target[0] + (const.dpi / 20)):
                continue
            print "Target begins at %d ends at %d" % (target[0], target[1])
            target_begin = target[0]
            if (text_end + eighth_text_height) >= target_begin:
                if text_end > text_begin:
                    # in addition to ocr of the text,
                    # we also want a text-code that will
                    # allow us to merge OCR misreads.
                    # for starters, let's try getting
                    # the distance between mostly dark
                    # and mostly light stripes on the
                    # text line
                    cropped_image = textimage.crop(
                        (0, text_begin, textimage.size[0], text_end))
                    cropped_image.save("/tmp/cropped_image%d.jpg" %
                                       (img_num, ))
                    img_num += 1
                    # pdb.set_trace()
                    text_ocr = tesseract(cropped_image)
                    text_ocr = clean_ocr_text(text_ocr.strip())
                    try:
                        code_string = text_code(cropped_image)
                    except Exception, e:
                        # the trailing comma keeps the error on the same
                        # output line as the message
                        print "Could not get code string, using '?'",
                        print e
                        code_string = "?"
                else:
                    text_ocr = ""
                    code_string = "0"
                # the row itself is extended with its OCR text
                text.append(text_ocr)
                code_strings.append(code_string)
                if target in target_text:
                    target_text[target].append(text)
                    target_text_code[target].append(code_strings)
                else:
                    target_text[target] = []
                    target_text[target].append(text)
                    target_text_code[target] = []
                    target_text_code[target].append(code_strings)
                # a text row is assigned to at most one target
                break
def get_title_and_votes_from(self,image,regionlist,croplist,last_title="NO TITLE"):
    """Build one contest, with its choices, from a title-plus-votes area.

    The rectangle ``croplist`` contains a contest title at the top and vote
    lines below it, each with an oval in the oval column and descriptive
    text to the right.  Its dark zones are tagged ``"T"`` (title line),
    ``"V"`` (vote line), ``"W"`` (short line starting at the text column)
    or ``"-"`` (other), then processed in order:

    * title lines are OCR'd and collected;
    * the first vote line creates a ``Ballot.Contest`` from the collected
      title (``last_title`` if it is shorter than 4 characters; at most 80
      characters are kept) and appends it to ``regionlist``;
    * each vote line adds a ``Ballot.Choice`` positioned where the oval
      actually begins;
    * a ``"W"`` line that is followed by a ``"W"`` or ``"-"`` line marks
      the most recent choice as ``"Writein"``.

    Args:
        image: page image offering ``crop(box)``.
        regionlist: list extended in place with the new contest.
        croplist: ``(x0, y0, x1, y1)`` rectangle on the page.
        last_title: fallback title when OCR yields no usable title.

    Returns:
        ``regionlist``; ``[]`` if ``croplist[2]`` or ``croplist[3]`` is 0.
    """
    # An empty rectangle needs no cropping or OCR at all.
    if croplist[2]==0 or croplist[3]==0:
        return []

    ov_off = adj(const.vote_target_horiz_offset_inches)
    ov_ht = adj(const.target_height_inches)
    ov_wd = adj(const.target_width_inches)
    ov_end = ov_off + ov_wd
    txt_off = adj(const.candidate_text_horiz_offset_inches)

    def ocr_text(zone):
        """Return the cleaned OCR text of ``zone``, newlines as '//'."""
        recognised = ocr.tesseract(zone)
        recognised = ocr.clean_ocr_text(recognised)
        return recognised.strip().replace("\n","//").strip()

    crop = image.crop(croplist)
    dark_zones = self.get_dark_zones(crop)

    # for each dark zone, determine the first dark x and tag the zone
    encountered_oval = False
    dzstyle = []
    for dz in dark_zones:
        # crop each dark strip, losing the area to the left of the
        # possible vote target and an equivalent area on the right
        dzcrop = crop.crop((ov_off, dz[0], crop.size[0]-ov_off, dz[1]))
        firstx = dzcrop.size[0]
        for y in range(dzcrop.size[1]):
            # A row can only lower firstx through a column left of it.
            for x in range(firstx):
                if dzcrop.getpixel((x,y))[0] < 192:
                    firstx = x
                    break
        # titles come first; ovals start at a defined offset, have a
        # minimum height and, if empty, match a dark/light pattern
        tall_enough = (dz[1]-dz[0] >= int(ov_ht * .8))
        ov_pat = oval_pattern(dzcrop,ov_ht,ov_wd,txt_off-ov_off)
        if not encountered_oval and not ov_pat:
            tag = "T"
        elif tall_enough and firstx <= adj(0.02):
            tag = "V"
            encountered_oval = True
        elif firstx >= (txt_off - ov_off - adj(0.02)) and not tall_enough:
            tag = "W"
        else:
            tag = "-"
        dzstyle.append(tag)

    contest_instance = None
    choice = None
    title_array = []
    contest_created = False
    for index,style in enumerate(dzstyle):
        if style=="T":
            titlezone = crop.crop((adj(0.1),
                                   dark_zones[index][0],
                                   crop.size[0]-adj(0.1),
                                   dark_zones[index][1]))
            # NOTE(review): title_array is None here if a "W" zone was
            # handled before this title line -- confirm that is impossible.
            title_array.append(ocr_text(titlezone))
        elif style=="V":
            if title_array is not None:
                zonetext = "/".join(title_array)
                title_array = None
                if len(zonetext) < 4:
                    zonetext = last_title
                contest_instance = Ballot.Contest(croplist[0],
                                                  croplist[1],
                                                  croplist[2],
                                                  croplist[3],
                                                  0,
                                                  zonetext[:80])
                contest_created = True
                regionlist.append(contest_instance)
            if not contest_created:
                # Warn and move on.  The interactive breakpoint that used
                # to sit here would stall any non-interactive run.
                print("WARNING: Choice but no contest.")
                continue
            choicezone = crop.crop((txt_off,
                                    dark_zones[index][0],
                                    crop.size[0]-adj(0.1),
                                    dark_zones[index][1]))
            zonetext = ocr_text(choicezone)

            # find the y at which the actual oval begins, which may be
            # lower than the dark_zone start
            choice_y = dark_zones[index][0]
            # Look up to 0.2 inches beneath beginning of dark zone
            # for an oval darkening the oval region
            contig = 0
            for adj_y in range(adj(0.2)):
                ovalcrop = crop.crop((ov_off,
                                      choice_y+adj_y,
                                      ov_end,
                                      choice_y+adj_y+1))
                ovalstat = ImageStat.Stat(ovalcrop)
                if ovalstat.extrema[0][0] < 240:
                    contig += 1
                    if contig > adj(0.03):
                        choice_y += (adj_y-adj(0.03))
                        break
                else:
                    contig = 0

            choice = Ballot.Choice(croplist[0]+ov_off,
                                   croplist[1]+choice_y,
                                   zonetext)
            contest_instance.append(choice)
        elif style=="W" and len(dzstyle)>(index+1) and dzstyle[index+1] in "W-":
            if title_array is not None:
                title_array = None
            try:
                choice.description = "Writein"
            except AttributeError:
                # choice is still None: no vote line has been seen yet.
                pass
    return regionlist
def hart_build_contests(image,
                        pot_hlines,
                        vboxes,
                        column_start,
                        column_width,
                        dpi=300,
                        extensions=None):
    """Derive contests and their choices from lines and vote boxes.

    A contest description zone spans from a horizontal line to the first
    vote box whose nearest line above differs from the previous contest's
    line.  The zone's OCR text becomes the description of a
    ``Ballot.Contest``.  Afterwards every vote box is matched to the lowest
    zone beginning above it; the text to the right of the box (up to the
    first ``/``) becomes a ``Ballot.Choice`` of the corresponding contest.
    All contests and choices are logged at info level.  ``extensions`` is
    accepted but not used.

    Returns the list of contests.
    """
    contests = []
    description_zones = []
    previous_line = 0
    line_above = 0
    for vbox in vboxes:
        for hline in pot_hlines:
            if hline < vbox[1]:
                line_above = hline
        if line_above == previous_line:
            continue
        previous_line = line_above
        description_zones.append((line_above, vbox[1]))

    for zone in description_zones:
        box = (column_start, zone[0], column_start + column_width, zone[1])
        zonetext = ocr.clean_ocr_text(ocr.tesseract(image.crop(box)))
        # create Contest, append to the result
        contests.append(
            Ballot.Contest(column_start, zone[0],
                           column_start + column_width, zone[1], 0,
                           zonetext))

    # Search bottom-up so the nearest zone above a box is met first.
    bottom_up = list(reversed(description_zones))
    for vbox in vboxes:
        for zone in bottom_up:
            if zone[0] >= vbox[1]:
                continue
            # crop area to right of vbox; get and clean text
            text_box = (
                vbox[0] + dpi / 3 + dpi / 30,
                vbox[1] - dpi / 100,
                vbox[0] + column_width - (dpi / 2),
                vbox[1] + (dpi / 2))
            choice_text = ocr.tesseract(image.crop(text_box))
            # take only first line of choice
            choice_text = ocr.clean_ocr_text(choice_text).split("/")[0]
            # attach to the Contest created for this zone, if any
            for candidate in contests:
                if candidate.y == zone[0] and candidate.x == column_start:
                    candidate.append(
                        Ballot.Choice(vbox[0], vbox[1], choice_text))
                    break
            break

    logger = logging.getLogger('')
    for contest in contests:
        logger.info("%d %d %s" % (contest.x, contest.y, contest.description))
        for choice in contest.choices:
            logger.info(" %d %d %s" %
                        (choice.x, choice.y, choice.description))
    return contests
def get_text_for_arrow_at(im, x, y, global_dpi):
    """Use tesseract to read the choice and contest text left of an arrow.

    Text belonging to different arrows is separated by horizontal lines.
    Starting near the arrow, the image is scanned upwards (up to 1/4 inch)
    and downwards (up to 1/3 inch) for a solid dark row; the text between
    those rows, in a rectangle 2.25 inches wide ending at the arrow, is the
    choice text.  The contest text sits above a batch of arrows behind a
    thicker line: the scan continues upwards (up to 3 inches) until
    ``int(global_dpi / 60.)`` consecutive dark rows are found, and the area
    between that line and the choice is OCR'd as the contest text.

    Both texts go through ``ocr.clean_ocr_text`` and have newlines replaced
    by spaces; commas are removed from the contest text only.

    Args:
        im: page image offering ``getpixel``, ``crop`` and ``size``.
        x, y: position of the arrow.
        global_dpi: resolution in dots per inch; must be an integer, as it
            is used for ``range()`` bounds.

    Returns:
        ``(choice_text, contest_text, contest_croplist)``.
    """
    # Floor division keeps the range() bounds integral on every Python
    # version; for the integer dpi required here it equals the old "/".
    scan_width = global_dpi // 4

    fortieth = int(global_dpi / 40.)
    topline = int(y - fortieth)
    bottomline = int(y + int(global_dpi * .22))
    startx = int(x - global_dpi)
    starty = int(y + int(global_dpi * .07))

    def row_is_solid(row):
        """True if the scan strip on ``row`` holds only dark pixels."""
        return all(im.getpixel((startx + xadj, row))[0] <= 128
                   for xadj in range(scan_width))

    # Nearest separator line above the arrow ...
    for up in range(global_dpi // 4):
        if row_is_solid(starty - up):
            topline = starty - up + 1
            break
    # ... and below it.
    for down in range(global_dpi // 3):
        if row_is_solid(starty + down):
            bottomline = starty + down - 1
            break

    # add one to accommodate rough top line
    topline += 1

    # back up to the beginning of the column, 2.25 inches left of the
    # arrow, then clamp the rectangle to the image
    crop_x = int(round(x - (global_dpi * 2.25)))
    if crop_x < 0:
        crop_x = 0
    if topline < 0:
        topline = 0
    if bottomline <= topline:
        bottomline = topline + 1
    if bottomline >= im.size[1]:
        bottomline = im.size[1] - 1
    if x <= crop_x:
        x = crop_x + 1
    if x >= im.size[0]:
        x = im.size[0] - 1

    crop = im.crop((int(crop_x), int(topline), int(x), int(bottomline)))
    text = ocr.tesseract(crop)  # XXX self.extensions once in class
    text = ocr.clean_ocr_text(text)  # XXX self.extensions once in class

    choice_topline = int(topline)
    # now repeat the process but going up until thicker black;
    # that will be the top of the contest
    contig = 0
    for up in range(global_dpi * 3):
        if row_is_solid(topline - up):
            contig += 1
            if contig >= int(global_dpi / 60.):
                topline = topline - up + 1
                break
        else:
            contig = 0

    contest_croplist = (int(crop_x), int(topline), int(x),
                        int(choice_topline))
    crop = im.crop(contest_croplist)
    contest_text = ocr.tesseract(crop)  # XXX self.extensions once in class
    contest_text = ocr.clean_ocr_text(
        contest_text)  # XXX self.extensions once in class

    text = text.replace("\n", " ").strip()
    contest_text = contest_text.replace("\n", " ").replace(",", "").strip()
    return text, contest_text, contest_croplist
def get_text_for_arrow_at(im,x,y,global_dpi):
    """Read the choice text and contest text belonging to one arrow.

    Horizontal lines separate the text of neighbouring arrows.  From a
    point just below ``y`` the image is searched for a fully dark row, up
    to 1/4 inch upwards and 1/3 inch downwards; tesseract receives the
    rectangle between those rows, 2.25 inches wide and ending at the arrow.
    The contest text lies above a batch of arrows and is separated by a
    thicker line, found by scanning up to 3 inches further up for
    ``int(global_dpi/60.)`` consecutive dark rows.

    Both texts are cleaned with ``ocr.clean_ocr_text`` and have newlines
    turned into spaces.  Only the contest text has its commas removed.

    Args:
        im: page image offering ``getpixel``, ``crop`` and ``size``.
        x, y: position of the arrow.
        global_dpi: integer resolution in dots per inch (it bounds the
            ``range()`` scans).

    Returns:
        Choice text, contest text, and the crop rectangle of the contest
        text.
    """
    fortieth = int(global_dpi/40.)
    topline = int(y - fortieth)
    bottomline = int(y + int(global_dpi * .22))
    startx = int(x - global_dpi)
    starty = int(y + int(global_dpi * .07))

    def dark_across(row):
        """Report whether every sampled pixel of ``row`` is dark."""
        # "//" keeps the bound an int under true division as well; with
        # the integer dpi required here it matches the former "/".
        for xadj in range(global_dpi//4):
            if im.getpixel((startx+xadj, row))[0] > 128:
                return False
        return True

    # separator line above the arrow
    for up in range(global_dpi//4):
        if dark_across(starty-up):
            topline = starty-up+1
            break
    # separator line below the arrow
    for down in range(global_dpi//3):
        if dark_across(starty+down):
            bottomline = starty+down-1
            break

    # add one to accommodate rough top line
    topline += 1

    # need to back up to beginning of column, now using 2.25 inches
    crop_x = max(int(round(x - (global_dpi*2.25))), 0)
    # keep the rectangle inside the image and at least one pixel in size
    if topline < 0:
        topline = 0
    if bottomline <= topline:
        bottomline = topline + 1
    if bottomline >= im.size[1]:
        bottomline = im.size[1]-1
    if x <= crop_x:
        x = crop_x + 1
    if x >= im.size[0]:
        x = im.size[0] - 1

    crop = im.crop((int(crop_x),
                    int(topline),
                    int(x),
                    int(bottomline)))
    text = ocr.tesseract(crop) # XXX self.extensions once in class
    text = ocr.clean_ocr_text(text) # XXX self.extensions once in class

    choice_topline = int(topline)
    # now repeat process but going up until thicker black;
    # that will be the top of the contest
    contig = 0
    for up in range(global_dpi*3):
        if not dark_across(topline-up):
            contig = 0
            continue
        contig = contig + 1
        if contig >= int(global_dpi/60.):
            topline = topline-up+1
            break

    contest_croplist = (int(crop_x),
                        int(topline),
                        int(x),
                        int(choice_topline))
    crop = im.crop(contest_croplist)
    contest_text = ocr.tesseract(crop) # XXX self.extensions once in class
    contest_text = ocr.clean_ocr_text(contest_text) # XXX self.extensions once in class

    text = text.replace("\n"," ").strip()
    contest_text = contest_text.replace("\n"," ").replace(",","").strip()
    return text, contest_text, contest_croplist