예제 #1
0
def tesseract_and_clean(x):
    """Run tesseract on ``x`` and return the trimmed, cleaned text.

    A single leading space is dropped.  Then, if more than 10 characters
    remain and the second character is a space, the first two characters
    are dropped as well.  The result is passed through
    ``ocr.clean_ocr_text`` with newlines kept.
    """
    raw = ocr.tesseract(x)
    # drop one leading space, if present
    if raw[:1] == " ":
        raw = raw[1:]
    # on longer text, drop a leading single character and the space after it
    has_stray_prefix = len(raw) > 10 and raw[1] == " "
    trimmed = raw[2:] if has_stray_prefix else raw
    return ocr.clean_ocr_text(trimmed, keep_newlines=True)
예제 #2
0
def test_tesseract():
    """Check ocr.tesseract on the sample images in every accepted input form.

    Each image is supplied as a file path and as the cv2, PIL and base64
    conversions of that path; all four must give the same text.
    """
    image_dir = os.path.join(os.path.dirname(__file__), 'images')
    # identity comes first: the plain path is itself a valid input
    converters = (
        lambda path: path,
        ocr.utils.to_cv2,
        ocr.utils.to_pil,
        ocr.utils.to_base64,
    )

    eng = os.path.join(image_dir, 'tesseract_eng_text.png')
    for convert in converters:
        found = ocr.tesseract(convert(eng), config='')
        assert found == 'A Python Approach to Character\nRecognition'

    numbers = os.path.join(image_dir, 'tesseract_numbers.jpg')
    for convert in converters:
        found = ocr.tesseract(convert(numbers))
        assert found == '619121'
예제 #3
0
파일: decompose.py 프로젝트: wrishel/tevs
 def get_text_starting_at(self, start_x, image):
     """Return the OCR text of ``image`` to the right of ``start_x``.

     Returns "" without running OCR when the image width or height is
     below 1, or when ``start_x`` is at or past the right edge.  Otherwise
     the image is cropped to (start_x, 0, width, height), handed to
     ``ocr.tesseract`` and the result is returned with surrounding
     whitespace stripped.  A result that has no ``strip`` method is
     returned unchanged.
     """
     width, height = image.size[0], image.size[1]
     if width < 1 or height < 1 or start_x >= width:
         return ""
     text = ocr.tesseract(image.crop((start_x, 0, width, height)))
     try:
         return text.strip()
     except AttributeError:
         # Only a non-string OCR result (e.g. None) is tolerated here; a
         # bare ``except`` would also hide KeyboardInterrupt/SystemExit.
         return text
예제 #4
0
    def generate_transition_list_from_zones(self,image,regionlist,column_bounds,left,middle):
        """Walk the left-edge zones of a column and return its contests.

        Each entry of ``left`` is read as (start_y, zone_code).  A zone
        runs from its own start_y to the start_y of the entry that follows
        it; the last entry has no follower and is given an end of 0.

        * 'B' and 'G' zones taller than const.dpi/4 are cropped from
          ``image`` between column_bounds[0] and column_bounds[1], passed
          through Image.eval with elim_halftone, OCR'd, and the raw and
          cleaned text are logged as the jurisdiction.
        * 'W' zones are handed, together with ``regionlist``, to
          self.get_title_and_votes_from.

        ``middle`` is accepted but not used.

        Returns a new list holding only the entries of ``regionlist`` that
        have at least one choice.
        """
        zone_count = len(left)
        for n, zone in enumerate(left):
            this_y = zone[0]
            zone_code = zone[1]
            # The final zone has no successor; the [0,'X'] placeholder
            # gives it next_y == 0.
            if n + 1 < zone_count:
                next_zone = left[n + 1]
            else:
                next_zone = [0,'X']
            next_y = next_zone[0]
            if zone_code == 'B' or zone_code == 'G':
                self.log.debug("%s zone at %d to %d %s" % (zone_code,
                                                           this_y,
                                                           next_y,
                                                           next_zone))
                # only a black/gray zone of reasonable height is read as
                # a jurisdiction banner
                if (next_y - this_y) > (const.dpi/4):
                    crop = image.crop((column_bounds[0],
                                       this_y,
                                       column_bounds[1],
                                       next_y))
                    crop = Image.eval(crop,elim_halftone)
                    cjurisdiction = ocr.tesseract(crop)
                    cjurisdiction = cjurisdiction.replace("\n","//").strip()
                    self.log.debug( "Jurisdiction %s" % (cjurisdiction,))
                    cjurisdiction = ocr.clean_ocr_text(cjurisdiction)
                    self.log.debug( "Cleaned Jurisdiction %s" % (cjurisdiction,))
            if zone_code == 'W':
                # a white zone is a voting area (or empty)
                self.get_title_and_votes_from(image,regionlist,
                                         (column_bounds[0],
                                          this_y,
                                          column_bounds[1],
                                          next_y))
                self.log.debug( "White zone at %d to %d %s" % (this_y,next_y,next_zone))
        # keep only contests that ended up with choices
        return [x for x in regionlist if len(x.choices)>0]
예제 #5
0
파일: ess1_ballot.py 프로젝트: wrishel/tevs
    def generate_transition_list_from_zones(self,image,regionlist,column_bounds,left,middle):
        """Scan a column's left-edge zone list and return contests with choices.

        ``left`` is a sequence of entries indexed as (start_y, zone_code).
        Every entry is paired with the entry after it, whose start_y is
        used as the zone's end; the last entry is paired with the
        placeholder [0,'X'], i.e. an end of 0.

        Zone codes handled:

        * 'B' / 'G': when the zone is taller than const.dpi/4, the column
          strip is cropped, passed through Image.eval with elim_halftone,
          OCR'd and logged as the jurisdiction (raw, then cleaned).
        * 'W': the strip bounds and ``regionlist`` are passed to
          self.get_title_and_votes_from.

        ``middle`` is part of the signature but is not read.

        Returns a new list with the entries of ``regionlist`` whose
        ``choices`` is non-empty.
        """
        # successor of each zone; the last one gets the [0,'X'] placeholder
        followers = list(left[1:])
        followers.append([0,'X'])
        for zone, next_zone in zip(left, followers):
            this_y, zone_code = zone[0], zone[1]
            next_y = next_zone[0]
            strip = (column_bounds[0], this_y, column_bounds[1], next_y)
            if zone_code in ('B', 'G'):
                self.log.debug("%s zone at %d to %d %s" % (zone_code,
                                                           this_y,
                                                           next_y,
                                                           next_zone))
                # a legitimate gray/black zone must have some height
                # before it is read as a jurisdiction banner
                if (next_y - this_y) > (const.dpi/4):
                    crop = Image.eval(image.crop(strip),elim_halftone)
                    cjurisdiction = ocr.tesseract(crop)
                    cjurisdiction = cjurisdiction.replace("\n","//").strip()
                    self.log.debug( "Jurisdiction %s" % (cjurisdiction,))
                    cjurisdiction = ocr.clean_ocr_text(cjurisdiction)
                    self.log.debug( "Cleaned Jurisdiction %s" % (cjurisdiction,))
            if zone_code == 'W':
                # white zone: a voting area (or empty)
                self.get_title_and_votes_from(image,regionlist,strip)
                self.log.debug( "White zone at %d to %d %s" % (this_y,next_y,next_zone))
        # filter regionlist to contain only contests with choices
        return [x for x in regionlist if len(x.choices)>0]
예제 #6
0
def upload():
    """Serve the upload form and ingest an uploaded document.

    Every request first empties the upload directory (``dir``), keeping
    only the ``.keep`` placeholder.

    GET renders ``upload.html``.

    POST saves the uploaded file into the upload directory and runs
    ``ocr.tesseract`` on it:

    * no usable filename -> empty 204 response;
    * OCR returns ``[]`` -> flash a warning and redirect back to the form;
    * a ``Doc`` with the same number, code and title already exists ->
      flash a notice and re-render the form with that document's id;
    * otherwise a new ``Doc`` is stored together with the file bytes, the
      module-level ``authors`` / ``addresses`` / ``titles`` lists are
      extended, and the client is redirected to the ``show`` view.

    The OCR result is read positionally: 0 author, 1 number, 2 code,
    3 address, 4 date, 5 title, 6 summary.

    Any other request method falls through and returns ``None``.
    """
    for stale in os.listdir(dir):
        if stale != '.keep':
            os.remove(os.path.join(dir, stale))
    if request.method == "GET":
        return render_template('upload.html')
    elif request.method == "POST":
        uploaded_file = request.files['file']
        # The name is supplied by the client: keep only its last path
        # component so it cannot point outside the upload directory.
        filename = os.path.basename(uploaded_file.filename or '')
        if filename in ('', '.', '..'):
            return ('', 204)
        # Use one path for saving, OCR and re-reading; building it as
        # ``dir + filename`` is only right when ``dir`` ends with a
        # path separator.
        saved_path = os.path.join(dir, filename)
        uploaded_file.save(saved_path)
        values = ocr.tesseract(saved_path)
        if values == []:
            flash(
                """Tài liệu này chưa tuân thủ quy định biểu mẫu văn bản pháp luật!
                                    Xin hãy chọn tài liệu khác!""", "warning")
            return redirect(url_for('upload'))
        existed_doc = Doc.objects(
            number=values[1],
            code=values[2],
            title=values[5],
        )
        if existed_doc:
            flash("Tài liệu đã tồn tại trên hệ thống!", "info")
            return render_template('upload.html',
                                   doc_id=existed_doc[0].id)
        new_doc = Doc(
            author=values[0],
            number=values[1],
            code=values[2],
            address=values[3],
            date=values[4],
            title=values[5],
            summary=values[6],
        )
        with open(saved_path, 'rb') as stored:
            new_doc.file_upload = Binary(stored.read())
        new_doc.save()
        # keep the module-level lookup lists in step with the new document
        if values[0] not in authors:
            authors.append(values[0])
        if values[3] not in addresses and values[3] != 'Không xác định':
            addresses.append(values[3])
        if values[5] not in titles:
            titles.append(values[5])
        return redirect(url_for('show', doc_id=new_doc.id))
예제 #7
0
    def get_party_id(self):
        """OCR the party zone of this side and remember the result.

        The zone's upper-left corner is the upper-left landmark shifted by
        the party-zone offsets from ``const``; its width and height come
        from the precinct-zone constants.  All inch values are scaled by
        ``self.dpi``.  OCR runs on every call; the text is stored in
        ``BallotSide.party_cache`` under ``self.layout_id`` and returned.
        """
        def to_dots(inches):
            return int(inches * self.dpi)

        ulc = self.landmarks.ulc
        left = int(ulc.x + to_dots(const.party_zone_horiz_offset_inches))
        top = int(ulc.y + to_dots(const.party_zone_vert_offset_inches))
        right = left + to_dots(const.precinct_zone_width_inches)
        bottom = top + to_dots(const.precinct_zone_height_inches)

        party = tesseract(self.image.crop((left, top, right, bottom)))
        BallotSide.party_cache[self.layout_id] = party
        return party
예제 #8
0
 def get_precinct_id(self, page):
     """OCR the precinct id from the strip beside ``page.landmarks[3]``.

     The strip runs from adj(1/2.) to adj(2) right of the landmark's x and
     from adj(1/6.) above the landmark's y down to that y (adj presumably
     converts inches to pixels -- confirm against its definition).  The
     strip is also written to /tmp/precinct.jpg.

     Returns the cleaned OCR text, stripped, with newlines replaced by
     "//"; returns "?" if an IndexError is raised along the way.
     """
     above = adj(1. / 6.)
     right_shift = adj(1 / 2.)
     try:
         ballot_image = page.image
         mark = page.landmarks[3]
         box = (mark[0] + right_shift, mark[1] - above,
                mark[0] + adj(2), mark[1])
         strip = ballot_image.crop(box)
         strip.save("/tmp/precinct.jpg")
         cleaned = ocr.clean_ocr_text(ocr.tesseract(strip))
         result = cleaned.strip().replace("\n", "//").strip()
     except IndexError:
         result = "?"
     return result
예제 #9
0
 def get_precinct_id(self, page):
     """Read the precinct id by OCR'ing a strip next to landmark 3.

     Crop box, relative to ``page.landmarks[3]`` (x, y):
     left x + adj(1/2.), top y - adj(1/6.), right x + adj(2), bottom y.
     The cropped strip is saved to /tmp/precinct.jpg before OCR.

     Returns the cleaned, stripped OCR text with "\\n" turned into "//",
     or "?" when an IndexError occurs.
     """
     sixth = adj(1./6.)
     half = adj(1/2.)
     try:
         source = page.image
         x0 = page.landmarks[3][0]
         y0 = page.landmarks[3][1]
         precinct = source.crop((x0 + half, y0 - sixth, x0 + adj(2), y0))
         precinct.save("/tmp/precinct.jpg")
         text = ocr.tesseract(precinct)
         text = ocr.clean_ocr_text(text)
         return text.strip().replace("\n","//").strip()
     except IndexError:
         return "?"
def hart_build_contests(image, pot_hlines, vboxes, column_start, column_width, dpi=300,extensions=None):
    """Combine horizontal lines and vote boxes into contests with choices.

    Pass 1: for every vote box (indexed as x, y) the last entry of
    ``pot_hlines`` that is smaller than the box's y becomes the "line
    above"; if none is, the line found for the previous box carries over.
    Whenever that line differs from the one of the previous contest, a
    description zone (line, box y) is recorded.

    Pass 2: each description zone is cropped across the column, OCR'd,
    cleaned and turned into a Ballot.Contest.

    Pass 3: each vote box is given to the most recently recorded zone
    whose top lies above the box.  The area right of the box is OCR'd and
    cleaned, the text before the first "/" is kept, and a Ballot.Choice is
    appended to the contest whose y equals the zone top and whose x equals
    ``column_start``.

    All contests and choices are logged on the root logger.
    ``extensions`` is not used.  Returns the list of contests.
    """
    # pass 1: find the description zones
    description_zones = []
    previous_line = 0
    line_above = 0
    for box in vboxes:
        box_top = box[1]
        lines_over_box = [line for line in pot_hlines if line < box_top]
        if lines_over_box:
            line_above = lines_over_box[-1]
        if line_above != previous_line:
            previous_line = line_above
            description_zones.append((line_above, box_top))

    # pass 2: one Contest per description zone
    regionlist = []
    for zone in description_zones:
        zone_top, zone_bottom = zone[0], zone[1]
        zone_image = image.crop((column_start,
                                 zone_top,
                                 column_start + column_width,
                                 zone_bottom))
        description = ocr.clean_ocr_text(ocr.tesseract(zone_image))
        regionlist.append(Ballot.Contest(column_start,
                                         zone_top,
                                         column_start + column_width,
                                         zone_bottom,
                                         0,
                                         description))

    # pass 3: hang every vote box on the nearest zone above it
    for box in vboxes:
        owner = next((zone for zone in reversed(description_zones)
                      if zone[0] < box[1]), None)
        if owner is None:
            continue
        # area to the right of the vote box
        label_area = image.crop((box[0] + dpi/3 + dpi/30,
                                 box[1] - dpi/100,
                                 box[0]+column_width-(dpi/2),
                                 box[1]+(dpi/2)))
        # keep only the text before the first "/"
        choice_text = ocr.clean_ocr_text(ocr.tesseract(label_area)).split("/")[0]
        matching = next((known for known in regionlist
                         if known.y == owner[0] and known.x == column_start),
                        None)
        if matching is not None:
            matching.append(Ballot.Choice(box[0], box[1], choice_text))

    root_log = logging.getLogger('')
    for contest in regionlist:
        root_log.info("%d %d %s" % (contest.x, contest.y, contest.description))
        for choice in contest.choices:
            root_log.info(" %d %d %s" % (choice.x, choice.y, choice.description))

    return regionlist
예제 #11
0
    def get_contests_and_votes_from(self,image,regionlist,croplist):
        """Split a cropped area into contest descriptions and vote lines.

        The area ``croplist`` of ``image`` is divided into dark zones by
        self.get_dark_zones (dark_intensity=160).  Each zone is judged by
        two signals:

        * the gap between the end of the vote target and the start of the
          candidate text is light (first-band mean above 244);
        * the cleaned OCR text of the strip that starts at the candidate
          text offset and is const.dpi wide has fewer than 6 characters.

        Both signals: the zone is a vote line.  A Ballot.Contest built
        from the description collected so far is appended to
        ``regionlist`` if none is open, then a Ballot.Choice is appended
        to the last entry of ``regionlist``.
        Neither signal: the zone is description text; it is collected and
        any open contest is closed.
        Exactly one signal: the text is collected if a contest is open,
        otherwise the situation is only logged.

        ``regionlist`` is modified in place; the return value is the list
        of dark zones.
        """
        ov_off = adj(const.vote_target_horiz_offset_inches)
        ov_end = ov_off + adj(const.target_width_inches)
        txt_off = adj(const.candidate_text_horiz_offset_inches)

        pending_description = ""
        area = image.crop(croplist)
        dark_zones = self.get_dark_zones(area,dark_intensity=160)
        contest_open = False
        for zone in dark_zones:
            top, bottom = zone[0], zone[1]
            # whole line, inset by 1/10" on both sides to avoid the edges
            full_strip = area.crop((const.dpi/10,
                                    top,
                                    area.size[0]-(const.dpi/10),
                                    bottom))
            # gap between the vote target and the candidate text
            gap_stat = ImageStat.Stat(area.crop((ov_end, top, txt_off, bottom)))
            # const.dpi wide strip starting at the candidate text offset
            label_strip = area.crop((txt_off, top, txt_off + const.dpi, bottom))

            full_text = ocr.clean_ocr_text(ocr.tesseract(full_strip))
            label_text = ocr.clean_ocr_text(ocr.tesseract(label_strip))

            gap_is_light = gap_stat.mean[0]>244
            label_is_short = len(label_text)<6

            if gap_is_light and label_is_short:
                # vote line: open a contest first if there is none
                if not contest_open:
                    contest_open = True
                    self.log.debug("Creating contest %s" % (pending_description,))
                    regionlist.append(Ballot.Contest(croplist[0],
                                                     croplist[1]+top,
                                                     croplist[2],
                                                     croplist[1]+bottom,
                                                     0,
                                                     pending_description))
                    pending_description = ""
                self.log.debug("Adding choice %s" % (label_text,))
                regionlist[-1].append(Ballot.Choice(croplist[0]+ov_off,
                                                    croplist[1]+top,
                                                    label_text))
            elif not (gap_is_light or label_is_short):
                # plain description text: ends any open contest
                contest_open = False
                pending_description += full_text.replace("\n","/")
            elif contest_open:
                # mixed signals inside a contest: keep the text
                pending_description += full_text.replace("\n","//")
            else:
                self.log.debug( "Problem determining whether contest or choice")
                self.log.debug("Gap mean values %s" % (gap_stat.mean,))
                self.log.debug("Zone3 text %s" % (label_text,))
                self.log.debug("Contest string: %s" % (pending_description,))
        return dark_zones
예제 #12
0
    def get_title_and_votes_from(self,
                                 image,
                                 regionlist,
                                 croplist,
                                 last_title="NO TITLE"):
        """Extract one contest (title plus choices) from a cropped area.

        The area ``croplist`` of ``image`` is divided into dark zones by
        self.get_dark_zones.  Each zone gets a one-letter style:

        * "T" (title): no vote zone has been seen yet and oval_pattern
          reports no oval;
        * "V" (vote): at least 80% of the target height tall and its first
          dark pixel lies within adj(0.02) of the vote target offset;
        * "W": not tall enough and its first dark pixel lies at or beyond
          the candidate text offset (less adj(0.02));
        * "-": anything else.

        Then, in zone order:

        * "T" zones are OCR'd and collected as title fragments;
        * the first "V" zone creates a Ballot.Contest from the fragments
          joined with "/" (``last_title`` is used when that is shorter
          than 4 characters; the title is cut to 80 characters) and
          appends it to ``regionlist``; every "V" zone adds a
          Ballot.Choice whose text is the OCR of the zone right of the
          candidate text offset;
        * a "W" zone followed by a "W" or "-" zone relabels the most
          recent choice as "Writein".

        A "V" zone met when no contest could be created prints a warning
        and is skipped.

        Returns ``regionlist`` (modified in place), or ``[]`` when
        croplist[2] or croplist[3] is 0.
        """
        ov_off = adj(const.vote_target_horiz_offset_inches)
        ov_ht = adj(const.target_height_inches)
        ov_wd = adj(const.target_width_inches)
        ov_end = ov_off + ov_wd
        txt_off = adj(const.candidate_text_horiz_offset_inches)

        crop = image.crop(croplist)
        if croplist[2] == 0 or croplist[3] == 0:
            return []

        dark_zones = self.get_dark_zones(crop)

        # classify each dark zone
        encountered_oval = False
        dzstyle = []
        for dz in dark_zones:
            # crop each dark strip
            # losing the area to the left of the possible vote target
            # and an equivalent area on the right
            dzcrop = crop.crop((ov_off, dz[0], crop.size[0] - ov_off, dz[1]))

            # x of the leftmost dark pixel in the strip (strip width when
            # there is none); only columns left of the best x found so
            # far can still lower it
            firstx = dzcrop.size[0]
            for y in range(dzcrop.size[1]):
                for x in range(firstx):
                    p0 = dzcrop.getpixel((x, y))
                    if p0[0] < 192:
                        firstx = x
                        break

            # It is hard to tell a filled oval from a title that begins
            # at about the same x offset as the ovals.  The decision
            # relies on titles coming first, on ovals starting at a
            # defined offset with a minimum height, and on the oval
            # pattern check.
            tall_enough = (dz[1] - dz[0] >= int(ov_ht * .8))

            ov_pat = oval_pattern(dzcrop, ov_ht, ov_wd, txt_off - ov_off)

            if not encountered_oval and not ov_pat:
                dzstyle.append("T")

            elif tall_enough and firstx <= adj(0.02):
                dzstyle.append("V")
                encountered_oval = True

            elif ((firstx >= (txt_off - ov_off - adj(0.02)))
                  and not tall_enough):
                dzstyle.append("W")
            else:
                dzstyle.append("-")

        contest_instance = None
        choice = None
        # title fragments; None once the title has been consumed/closed
        title_array = []
        contest_created = False
        for index, style in enumerate(dzstyle):
            if style == "T":
                titlezone = crop.crop(
                    (adj(0.1), dark_zones[index][0], crop.size[0] - adj(0.1),
                     dark_zones[index][1]))
                zonetext = ocr.tesseract(titlezone)
                zonetext = ocr.clean_ocr_text(zonetext)
                zonetext = zonetext.strip()
                zonetext = zonetext.replace("\n", "//").strip()
                title_array.append(zonetext)
            elif style == "V":
                if title_array is not None:
                    zonetext = "/".join(title_array)
                    title_array = None
                    if len(zonetext) < 4: zonetext = last_title
                    contest_instance = Ballot.Contest(croplist[0], croplist[1],
                                                      croplist[2], croplist[3],
                                                      0, zonetext[:80])
                    contest_created = True
                    regionlist.append(contest_instance)
                if not contest_created:
                    # Reached when an earlier "W" zone closed the title
                    # before any contest existed.  Report and skip the
                    # zone; dropping into the debugger here would stall
                    # unattended runs.
                    print("WARNING: Choice but no contest.")
                    continue
                choicezone = crop.crop(
                    (txt_off, dark_zones[index][0], crop.size[0] - adj(0.1),
                     dark_zones[index][1]))
                zonetext = ocr.tesseract(choicezone)
                zonetext = ocr.clean_ocr_text(zonetext)
                zonetext = zonetext.strip()
                zonetext = zonetext.replace("\n", "//").strip()

                # find the y at which the actual oval begins
                # which may be lower than the dark_zone start
                choice_y = dark_zones[index][0]

                # Look up to 0.2 inches beneath beginning of dark zone
                # for an oval darkening the oval region: more than
                # adj(0.03) consecutive rows with a pixel below 240
                contig = 0
                for adj_y in range(adj(0.2)):
                    ovalcrop = crop.crop((ov_off, choice_y + adj_y, ov_end,
                                          choice_y + adj_y + 1))
                    ovalstat = ImageStat.Stat(ovalcrop)
                    if ovalstat.extrema[0][0] < 240:
                        contig += 1
                        if contig > adj(0.03):
                            choice_y += (adj_y - adj(0.03))
                            break
                    else:
                        contig = 0

                choice = Ballot.Choice(croplist[0] + ov_off,
                                       croplist[1] + choice_y, zonetext)
                contest_instance.append(choice)
            elif style == "W" and len(dzstyle) > (
                    index + 1) and dzstyle[index + 1] in "W-":
                # a write-in line also ends title collection
                title_array = None

                try:
                    choice.description = "Writein"
                except AttributeError:
                    # no choice has been created yet (choice is None)
                    pass
        return regionlist
예제 #13
0
    def get_contests_and_votes_from(self, image, regionlist, croplist):
        """Sort the dark lines of a cropped area into contests and choices.

        ``image`` is cropped to ``croplist`` and cut into dark zones with
        self.get_dark_zones (dark_intensity=160).  For each zone two
        "vote line" signals are counted:

        1. the first-band mean of the strip between the end of the vote
           target and the candidate text offset is above 244;
        2. the cleaned OCR text of the const.dpi wide strip starting at
           the candidate text offset is shorter than 6 characters.

        No signal: description text; it is added to the running contest
        description (newlines become "/") and the current contest is
        considered closed.
        Two signals: vote line; if no contest is open, a Ballot.Contest
        carrying the running description is appended to ``regionlist``,
        then a Ballot.Choice is appended to the last ``regionlist`` entry.
        One signal: if a contest is open the text is added to the running
        description (newlines become "//"), otherwise details are logged.

        ``regionlist`` is extended in place.  Returns the dark zone list.
        """
        ov_off = adj(const.vote_target_horiz_offset_inches)
        ov_end = ov_off + adj(const.target_width_inches)
        txt_off = adj(const.candidate_text_horiz_offset_inches)

        description = ""
        crop = image.crop(croplist)
        dark_zones = self.get_dark_zones(crop, dark_intensity=160)
        in_contest = False
        for dz in dark_zones:
            y_start = dz[0]
            y_end = dz[1]

            # 1/10" inset on both sides keeps the page edges out
            line_box = (const.dpi / 10, y_start,
                        crop.size[0] - (const.dpi / 10), y_end)
            line_image = crop.crop(line_box)
            gap_image = crop.crop((ov_end, y_start, txt_off, y_end))
            zone2stat = ImageStat.Stat(gap_image)
            text_image = crop.crop((txt_off, y_start, txt_off + const.dpi,
                                    y_end))

            zone1text = ocr.clean_ocr_text(ocr.tesseract(line_image))
            zone3text = ocr.clean_ocr_text(ocr.tesseract(text_image))

            # count how many of the two tests say "this is a vote line"
            voteop_signals = 0
            if zone2stat.mean[0] > 244:
                voteop_signals += 1
            if len(zone3text) < 6:
                voteop_signals += 1

            if voteop_signals == 0:
                in_contest = False
                description += zone1text.replace("\n", "/")
                continue

            if voteop_signals == 2:
                if not in_contest:
                    in_contest = True
                    self.log.debug("Creating contest %s" % (description, ))
                    new_contest = Ballot.Contest(croplist[0],
                                                 croplist[1] + y_start,
                                                 croplist[2],
                                                 croplist[1] + y_end, 0,
                                                 description)
                    regionlist.append(new_contest)
                    description = ""
                self.log.debug("Adding choice %s" % (zone3text, ))
                new_choice = Ballot.Choice(croplist[0] + ov_off,
                                           croplist[1] + y_start, zone3text)
                regionlist[-1].append(new_choice)
                continue

            # exactly one signal: ambiguous line
            if in_contest:
                description += zone1text.replace("\n", "//")
                continue
            self.log.debug("Problem determining whether contest or choice")
            self.log.debug("Gap mean values %s" % (zone2stat.mean, ))
            self.log.debug("Zone3 text %s" % (zone3text, ))
            self.log.debug("Contest string: %s" % (description, ))
        return dark_zones
예제 #14
0
def target_and_text(targetlist, textlist, textimage, debug=False):
    """Associate each zone in textlist with the best zone in targetlist.

    Best is defined as follows:
    * text goes to the target that begins at or above
    the end of the text plus 1/8 the text's height
    and is closest

    Args:
        targetlist: list of target zones, each indexable as
            (begin, end, ...); sorted in place by descending begin.
            Targets are used as dictionary keys, so they must be hashable.
        textlist: list of text zones, each indexable as (begin, end, ...);
            sorted in place, then copied into lists so the OCR result can
            be appended to each zone.
        textimage: image exposing ``crop`` and ``size``; each text zone is
            cropped out of it at full width for OCR.
        debug: when true, print the target list and drop into pdb.

    Side effects:
        Writes every cropped text strip to /tmp/cropped_image<N>.jpg and
        prints progress messages to stdout.
    """
    target_text = {}
    target_text_code = {}

    try:
        targetlist.sort(key=lambda a: -a[0])
    except KeyError:
        print("KeyError trying to sort targetlist")
    textlist.sort()
    try:
        # Zones become lists so that the OCR text can be appended below.
        textlist = [list(a) for a in textlist]
    except KeyError:
        print("KeyError trying to map textlist")
    if debug:
        print("Debug in target_and_text")
        print(targetlist)
        pdb.set_trace()

    img_num = 0
    for text in textlist:
        text_begin, text_end = text[0], text[1]
        eighth_text_height = (text_end - text_begin) / 8
        # skip text not tall enough
        if text_end < (text_begin + (const.dpi / 20)):
            continue
        print("Checking targets against text beginnning %d ending %d" % (
            text_begin, text_end))
        for target in targetlist:
            code_strings = []
            # skip targets not tall enough
            if target[1] <= (target[0] + (const.dpi / 20)):
                continue
            print("Target begins at %d ends at %d" % (target[0], target[1]))
            target_begin = target[0]
            if (text_end + eighth_text_height) < target_begin:
                # This target starts too far below the text; try the next.
                continue
            if text_end > text_begin:
                # in addition to ocr of the text,
                # we also want a text-code that will
                # allow us to merge OCR misreads.
                # for starters, let's try getting
                # the distance between mostly dark
                # and mostly light stripes on the
                # text line
                cropped_image = textimage.crop(
                    (0, text_begin, textimage.size[0], text_end))
                cropped_image.save("/tmp/cropped_image%d.jpg" % (img_num, ))
                img_num += 1
                text_ocr = tesseract(cropped_image)
                text_ocr = clean_ocr_text(text_ocr.strip())
                try:
                    code_string = text_code(cropped_image)
                except Exception as e:
                    # Best effort: a missing code string must not stop
                    # the association of text with its target.
                    print("Could not get code string, using '?' %s" % (e, ))
                    code_string = "?"
            else:
                text_ocr = ""
                code_string = "0"
            text.append(text_ocr)
            code_strings.append(code_string)
            target_text.setdefault(target, []).append(text)
            target_text_code.setdefault(target, []).append(code_strings)
            # The first qualifying target wins; move on to the next text.
            break
    # NOTE(review): no return statement is visible here, so target_text and
    # target_text_code are discarded as the function stands -- confirm
    # whether a trailing return was lost.
예제 #15
0
    def get_title_and_votes_from(self, image, regionlist, croplist,
                                 last_title="NO TITLE"):
        """Given an area known to contain contest title and votes, return info.

        The cropped area will contain a title area at the top,
        followed by voting areas.  Voting areas will
        contain ovals in the oval column.  Descriptive text to the right of
        the ovals will be assigned to each oval based on being at or below
        the oval.

        Each dark zone found in the crop is first classified as
        "T" (title text), "V" (vote target line), "W" (write-in candidate
        line) or "-" (anything else); contests and choices are then built
        from that sequence.

        Args:
            image: page image exposing ``crop``.
            regionlist: list that receives each Ballot.Contest created here;
                it is mutated in place.
            croplist: crop box handed to ``image.crop``; its first two
                entries are also used as the x/y offsets of the contest.
            last_title: title used when the OCR'd title is shorter than
                four characters.

        Returns:
            regionlist, or an empty list when croplist[2] or croplist[3]
            is 0.
        """
        # Nothing to analyse in an empty rectangle; decide before cropping.
        if croplist[2] == 0 or croplist[3] == 0:
            return []

        ov_off = adj(const.vote_target_horiz_offset_inches)
        ov_ht = adj(const.target_height_inches)
        ov_wd = adj(const.target_width_inches)
        ov_end = ov_off + ov_wd
        txt_off = adj(const.candidate_text_horiz_offset_inches)

        crop = image.crop(croplist)
        dark_zones = self.get_dark_zones(crop)

        def read_text(box):
            """OCR one box of the crop and flatten it onto a single line."""
            zonetext = ocr.clean_ocr_text(ocr.tesseract(crop.crop(box)))
            return zonetext.strip().replace("\n", "//").strip()

        # Pass 1: classify every dark zone by where its ink starts.
        encountered_oval = False
        dzstyle = []
        for dz in dark_zones:
            # crop each dark strip
            # losing the area to the left of the possible vote target
            # and an equivalent area on the right
            dzcrop = crop.crop((ov_off,
                                dz[0],
                                crop.size[0] - ov_off,
                                dz[1]))

            # firstx is the leftmost dark pixel of the strip; a row only
            # needs scanning up to the best column found so far.
            firstx = dzcrop.size[0]
            for y in range(dzcrop.size[1]):
                for x in range(firstx):
                    if dzcrop.getpixel((x, y))[0] < 192:
                        firstx = x
                        break

            # it is hard to tell a filled oval from a title that begins
            # about the same x offset as ovals; titles come first, while
            # ovals start at a defined offset, have a minimum height
            # and, if empty, match a particular dark/light pattern
            tall_enough = (dz[1] - dz[0] >= int(ov_ht * .8))

            ov_pat = oval_pattern(dzcrop, ov_ht, ov_wd, txt_off - ov_off)

            if not encountered_oval and not ov_pat:
                dzstyle.append("T")
            elif tall_enough and firstx <= adj(0.02):
                dzstyle.append("V")
                encountered_oval = True
            elif ((firstx >= (txt_off - ov_off - adj(0.02)))
                  and not tall_enough):
                dzstyle.append("W")
            else:
                dzstyle.append("-")

        # Pass 2: turn the classified zones into a contest and its choices.
        contest_instance = None
        choice = None
        # title_array collects title lines until the first choice (or
        # write-in line) is met, after which it is None.
        title_array = []
        for index, style in enumerate(dzstyle):
            zone_top, zone_bottom = dark_zones[index][0], dark_zones[index][1]
            if style == "T":
                # NOTE(review): if a write-in zone has already reset
                # title_array to None, this append raises AttributeError.
                title_array.append(read_text((adj(0.1),
                                              zone_top,
                                              crop.size[0] - adj(0.1),
                                              zone_bottom)))
            elif style == "V":
                if title_array is not None:
                    # First choice line: the collected title is complete.
                    zonetext = "/".join(title_array)
                    title_array = None
                    if len(zonetext) < 4:
                        zonetext = last_title
                    contest_instance = Ballot.Contest(croplist[0],
                                                      croplist[1],
                                                      croplist[2],
                                                      croplist[3],
                                                      0,
                                                      zonetext[:80])
                    regionlist.append(contest_instance)
                if contest_instance is None:
                    print("WARNING: Choice but no contest.")
                    # NOTE(review): this breakpoint blocks unattended runs.
                    pdb.set_trace()
                    continue
                zonetext = read_text((txt_off,
                                      zone_top,
                                      crop.size[0] - adj(0.1),
                                      zone_bottom))

                # find the y at which the actual oval begins
                # which may be lower than the dark_zone start
                choice_y = zone_top

                # Look up to 0.2 inches beneath beginning of dark zone
                # for an oval darkening the oval region
                contig = 0
                for adj_y in range(adj(0.2)):
                    ovalcrop = crop.crop((ov_off,
                                          choice_y + adj_y,
                                          ov_end,
                                          choice_y + adj_y + 1))
                    ovalstat = ImageStat.Stat(ovalcrop)
                    if ovalstat.extrema[0][0] < 240:
                        contig += 1
                        if contig > adj(0.03):
                            # Back up to the first row of the dark run.
                            choice_y += (adj_y - adj(0.03))
                            break
                    else:
                        contig = 0

                choice = Ballot.Choice(croplist[0] + ov_off,
                                       croplist[1] + choice_y,
                                       zonetext)
                contest_instance.append(choice)
            elif (style == "W" and len(dzstyle) > (index + 1)
                  and dzstyle[index + 1] in "W-"):
                title_array = None
                # Relabel the most recent choice, if there is one.
                if choice is not None:
                    choice.description = "Writein"
        return regionlist
예제 #16
0
def hart_build_contests(image,
                        pot_hlines,
                        vboxes,
                        column_start,
                        column_width,
                        dpi=300,
                        extensions=None):
    """Combine horizontal lines and vote boxes into contests with choices.

    A contest description zone runs from a horizontal line down to the
    first vote box found beneath it.  Each zone is OCR'd into a
    Ballot.Contest, and every vote box is then attached, as a
    Ballot.Choice, to the lowest zone that starts above the box.

    Args:
        image: page image exposing ``crop``.
        pot_hlines: y offsets of candidate horizontal lines.
        vboxes: vote boxes, each indexable as (x, y, ...).
        column_start: x offset of the column being processed.
        column_width: width of that column.
        dpi: resolution used to size the choice-text crop.
        extensions: not used by this function.

    Returns:
        The list of Ballot.Contest objects that were created.
    """
    column_end = column_start + column_width

    # Step 1: find the (line, first box below it) description zones.
    zones = []
    latest_line = 0
    previous_zone_top = 0
    for box in vboxes:
        box_top = box[1]
        for line_y in pot_hlines:
            if line_y < box_top:
                latest_line = line_y
        if latest_line == previous_zone_top:
            continue
        previous_zone_top = latest_line
        zones.append((latest_line, box_top))

    # Step 2: one Contest per zone, described by the zone's OCR text.
    contests = []
    for zone_top, zone_bottom in zones:
        zone_image = image.crop(
            (column_start, zone_top, column_end, zone_bottom))
        description = ocr.clean_ocr_text(ocr.tesseract(zone_image))
        contests.append(
            Ballot.Contest(column_start, zone_top, column_end, zone_bottom,
                           0, description))

    # Step 3: hand each vote box to the nearest zone starting above it.
    zones_bottom_up = zones[::-1]
    for box in vboxes:
        owner = next(
            (zone for zone in zones_bottom_up if zone[0] < box[1]), None)
        if owner is None:
            continue
        # The choice text sits in the area to the right of the vote box.
        left = box[0] + dpi / 3 + dpi / 30
        upper = box[1] - dpi / 100
        right = box[0] + column_width - (dpi / 2)
        lower = box[1] + (dpi / 2)
        raw_text = ocr.tesseract(image.crop((left, upper, right, lower)))
        # take only first line of choice
        choice_text = ocr.clean_ocr_text(raw_text).split("/")[0]

        for candidate in contests:
            if candidate.y == owner[0] and candidate.x == column_start:
                candidate.append(Ballot.Choice(box[0], box[1], choice_text))
                break

    log = logging.getLogger('')
    for entry in contests:
        log.info("%d %d %s" % (entry.x, entry.y, entry.description))
        for option in entry.choices:
            log.info(" %d %d %s" % (option.x, option.y, option.description))

    return contests
예제 #17
0
def get_text_for_arrow_at(im, x, y, global_dpi):
    """Use tesseract to retrieve text corresponding to left of arrow.

    Text associated with different arrows is separated by horizontal
    lines.  Find the y offsets of those lines and pass text between
    those offsets to tesseract, sending it a rectangle 2.25" wide
    to the left of the arrow.

    The contest text is above a batch of arrows, and is separated from
    choice text by a thicker line.

    Both texts are run through ocr.clean_ocr_text and flattened onto one
    line; commas are deleted from the contest text only.

    Args:
        im: image exposing ``getpixel``, ``crop`` and ``size``; pixels must
            be indexable, channel 0 is compared against 128.
        x: x pixel offset of the arrow (presumably its centre -- TODO
            confirm against the caller).
        y: y pixel offset of the arrow.
        global_dpi: integer resolution; every scan distance derives from it.

    Returns:
        Choice text, contest text, and crop rectangle for contest text.
    """
    img_width, img_height = im.size[0], im.size[1]
    startx = int(x - global_dpi)
    starty = int(y + int(global_dpi * .07))
    # Floor division keeps range() arguments integral on Python 2 and 3.
    scan_width = global_dpi // 4

    def is_solid_line(row):
        """Report whether the sampled quarter inch of `row` is all dark."""
        # Rows or columns outside the image cannot be part of a line;
        # reading them would raise or wrap around to the opposite edge.
        if row < 0 or row >= img_height:
            return False
        for col in range(startx, startx + scan_width):
            if col < 0 or col >= img_width:
                return False
            if im.getpixel((col, row))[0] > 128:
                return False
        return True

    # Defaults apply when no separator line is found nearby.
    topline = int(y - int(global_dpi / 40.))
    bottomline = int(y + int(global_dpi * .22))

    for up in range(scan_width):
        if is_solid_line(starty - up):
            topline = starty - up + 1
            break

    for down in range(global_dpi // 3):
        if is_solid_line(starty + down):
            bottomline = starty + down - 1
            break

    # add one to accommodate rough top line
    topline += 1
    # need to back up to beginning of column, now using 2.25 inches
    crop_x = max(0, int(round(x - (global_dpi * 2.25))))

    # Keep the choice rectangle non-empty and inside the image.
    if topline < 0:
        topline = 0
    if bottomline <= topline:
        bottomline = topline + 1
    if bottomline >= img_height:
        bottomline = img_height - 1
    if x <= crop_x:
        x = crop_x + 1
    if x >= img_width:
        x = img_width - 1

    crop = im.crop((int(crop_x), int(topline), int(x), int(bottomline)))
    text = ocr.tesseract(crop)  # XXX self.extensions once in class
    text = ocr.clean_ocr_text(text)  # XXX self.extensions once in class
    choice_topline = int(topline)

    # now repeat process but going up until thicker black;
    # that will be the top of the contest
    thick_rows = int(global_dpi / 60.)
    contig = 0
    for up in range(global_dpi * 3):
        row = topline - up
        if row < 0:
            # Ran off the top of the image without meeting a thick line.
            break
        if is_solid_line(row):
            contig += 1
            if contig >= thick_rows:
                topline = row + 1
                break
        else:
            contig = 0

    contest_croplist = (int(crop_x), int(topline), int(x), int(choice_topline))
    crop = im.crop(contest_croplist)
    contest_text = ocr.tesseract(crop)  # XXX self.extensions once in class
    contest_text = ocr.clean_ocr_text(
        contest_text)  # XXX self.extensions once in class
    text = text.replace("\n", " ").strip()
    contest_text = contest_text.replace("\n", " ").replace(",", "").strip()

    return text, contest_text, contest_croplist
예제 #18
0
def get_text_for_arrow_at(im, x, y, global_dpi):
    """Use tesseract to retrieve text corresponding to left of arrow.

    Text associated with different arrows is separated by horizontal
    lines.  Find the y offsets of those lines and pass text between
    those offsets to tesseract, sending it a rectangle 2.25" wide
    to the left of the arrow.

    The contest text is above a batch of arrows, and is separated from
    choice text by a thicker line.

    Both texts are run through ocr.clean_ocr_text and flattened onto one
    line; commas are deleted from the contest text only.

    Args:
        im: image exposing ``getpixel``, ``crop`` and ``size``; pixels must
            be indexable, channel 0 is compared against 128.
        x: x pixel offset of the arrow (presumably its centre -- TODO
            confirm against the caller).
        y: y pixel offset of the arrow.
        global_dpi: integer resolution; every scan distance derives from it.

    Returns:
        Choice text, contest text, and crop rectangle for contest text.
    """
    page_width, page_height = im.size[0], im.size[1]
    startx = int(x - global_dpi)
    starty = int(y + int(global_dpi * .07))
    # Floor division keeps range() arguments integral on Python 2 and 3.
    sample_width = global_dpi // 4

    def row_is_dark(row):
        """Report whether the sampled quarter inch of `row` is all dark."""
        # Positions outside the image cannot belong to a separator line;
        # reading them would raise or wrap around to the opposite edge.
        if not 0 <= row < page_height:
            return False
        for col in range(startx, startx + sample_width):
            if not 0 <= col < page_width:
                return False
            if im.getpixel((col, row))[0] > 128:
                return False
        return True

    # Defaults apply when no separator line is found nearby.
    topline = int(y - int(global_dpi / 40.))
    bottomline = int(y + int(global_dpi * .22))

    for up in range(sample_width):
        if row_is_dark(starty - up):
            topline = starty - up + 1
            break

    for down in range(global_dpi // 3):
        if row_is_dark(starty + down):
            bottomline = starty + down - 1
            break

    # add one to accommodate rough top line
    topline += 1
    # need to back up to beginning of column, now using 2.25 inches
    crop_x = max(0, int(round(x - (global_dpi * 2.25))))

    # Keep the choice rectangle non-empty and inside the image.
    if topline < 0:
        topline = 0
    if bottomline <= topline:
        bottomline = topline + 1
    if bottomline >= page_height:
        bottomline = page_height - 1
    if x <= crop_x:
        x = crop_x + 1
    if x >= page_width:
        x = page_width - 1

    crop = im.crop((int(crop_x),
                    int(topline),
                    int(x),
                    int(bottomline)))
    text = ocr.tesseract(crop)  # XXX self.extensions once in class
    text = ocr.clean_ocr_text(text)  # XXX self.extensions once in class
    choice_topline = int(topline)

    # now repeat process but going up until thicker black;
    # that will be the top of the contest
    rows_needed = int(global_dpi / 60.)
    contig = 0
    for up in range(global_dpi * 3):
        row = topline - up
        if row < 0:
            # Ran off the top of the image without meeting a thick line.
            break
        if row_is_dark(row):
            contig += 1
            if contig >= rows_needed:
                topline = row + 1
                break
        else:
            contig = 0

    contest_croplist = (int(crop_x),
                        int(topline),
                        int(x),
                        int(choice_topline))
    crop = im.crop(contest_croplist)
    contest_text = ocr.tesseract(crop)  # XXX self.extensions once in class
    contest_text = ocr.clean_ocr_text(contest_text)  # XXX self.extensions once in class
    text = text.replace("\n", " ").strip()
    contest_text = contest_text.replace("\n", " ").replace(",", "").strip()

    return text, contest_text, contest_croplist