def setRightEdge(self, modertm):
    """Classify where this line's right edge (``self.x1``) falls.

    ``self.rightedge`` is set to:

      1 -- ``x1`` matches one of the values in ``modertm``
      2 -- no such match, but ``x1`` matches this line's ``self.rtmargin``
      3 -- neither of the above

    All comparisons use ``misc.isEqual`` with the ``Xequalness`` tolerance.

    modertm -- iterable of x positions to compare ``self.x1`` against
    """
    for candidate in modertm:
        if misc.isEqual(self.x1, candidate, Xequalness):
            self.rightedge = 1
            break
    else:
        # No entry of modertm matched: fall back to the line's own margin.
        on_own_margin = misc.isEqual(self.x1, self.rtmargin, Xequalness)
        self.rightedge = 2 if on_own_margin else 3
def calcIndent(self, i, MOLS):
    """Return how far line ``i`` is indented relative to the following line.

    The left edges (``getx0()``) of the previous, current and next lines are
    compared.  Line ``i`` counts as indented when its left edge is greater
    than ``(1 + edgemargin)`` times the next line's left edge AND at least
    one of the following holds:

      * the line's linespace is unknown (``getLS() == "?"``),
      * the previous and next left edges are equal within ``Xequalness``,
      * the line's linespace is greater than ``MOLS * 10``.

    i    -- index of the line in ``self.lines``
    MOLS -- reference linespace; only used for the ``MOLS * 10`` threshold

    Returns ``abs(nextEdge - currentEdge)`` when the line is indented,
    otherwise 0.
    """
    lines = self.lines
    currentEdge = lines[i].getx0()
    # The first line has no predecessor: its own edge stands in for it.
    prevEdge = lines[i - 1].getx0() if i != 0 else currentEdge
    # The last line of the page has no successor: reuse the previous edge.
    nextEdge = lines[i + 1].getx0() if i != len(lines) - 1 else prevEdge

    # If any edge is negative, shift all three by the same bias so that the
    # smallest becomes zero.  Each edge must be shifted from its OWN value;
    # deriving currentEdge/nextEdge from the already-shifted prevEdge would
    # discard their real positions.
    lowest = min(prevEdge, currentEdge, nextEdge)
    if lowest < 0:
        bias = abs(lowest)
        prevEdge = prevEdge + bias
        currentEdge = currentEdge + bias
        nextEdge = nextEdge + bias

    # The "?" test comes first so an unknown linespace is never compared
    # numerically against MOLS.
    linespace = lines[i].getLS()
    context_allows_indent = (
        linespace == "?"
        or misc.isEqual(prevEdge, nextEdge, Xequalness)
        or linespace > MOLS * 10
    )
    if context_allows_indent and currentEdge > ((1 + edgemargin) * nextEdge):
        return abs(nextEdge - currentEdge)  # the line is indented
    return 0  # otherwise not
def calcMargins(self, MOLS, MOLEN):
    """Work out the page's right margins and hand them to every line.

    The right edge and y position of each sufficiently long line are
    collected, compressed by ``self.findFreq`` into
    ``(f, x-av, y-av, y-min, y-max)`` records, ordered by average x and
    trimmed to at most ``MaxColumns`` records (the ones with the largest x).
    The result is stored in ``self.rtMargins``; the x values alone are then
    passed to ``setRTmargin`` on every line.

    MOLS  -- not used by this method
    MOLEN -- reference line length; lines shorter than ``MOLEN/MaxColumns``
             (beyond the ``Xequalness`` tolerance) are ignored

    Returns None.  When no margin records are found the lines are left
    untouched.
    """
    # Only consider a line if it is wide enough to possibly be a whole line,
    # using the approximation that there will be at most MaxColumns columns
    # on one page.
    minLineLen = MOLEN/MaxColumns
    x1list = []
    for line in self.lines:
        length = line.getlength()
        if length > minLineLen or misc.isEqual(length, minLineLen, Xequalness):
            x1list.append((line.getx1(), line.gety()))

    # Compress the (x, y) pairs down to frequency records.
    self.rtMargins = self.findFreq(x1list)

    # Order by average x.  A key function is used rather than a truncating
    # cmp function: it orders records exactly by x and is accepted by
    # list.sort on both Python 2.4+ and Python 3.
    self.rtMargins.sort(key=lambda record: record[1])

    # We assume there will be no more than MaxColumns (real) columns in the
    # page.  More can be apparent due to tables, non-text objects, etc, and
    # this serves to eliminate some of these.
    if len(self.rtMargins) > MaxColumns:
        del self.rtMargins[:-MaxColumns]

    if not self.rtMargins:
        # No columns were found, so they can't be applied.  Lines with no
        # right margin set will be handled in Page.predict_para.
        return

    # setRTmargin only needs the x value of each record.
    tmpRTmargins = [record[1] for record in self.rtMargins]
    for line in self.lines:
        line.setRTmargin(tmpRTmargins)
def findFreq( self, list ):
    """Compress a list of (x, y) pairs into frequency records.

    Consecutive entries whose x is equal (within ``Xequalness``) to the x of
    the first entry of the current run are grouped.  Each run becomes a
    record ``(f, x-av, y-av, y-min, y-max)`` where ``f`` is the run length
    and the averages are rounded to one decimal place.  Low-frequency records
    are then discarded and neighbouring records with equal x are merged via
    ``self.mergeRecords``.

    list -- sequence of (x, y) pairs (the name shadows the builtin)

    Returns the list of remaining records, or ``[]`` (after logging a
    warning through ``msg``) when the input is empty.
    """
    if len(list) == 0: # can't operate on nothing
        msg( "findFreq: WARNING: called with an empty input list!!" )
        return []
    output = [] # output list
    f = 0 # frequency of val
    val = list[0] # start with the first element
    valcum = (0,0) # cumulation of vals
    # NOTE: 'min' and 'max' below shadow the builtins inside this method.
    min = max = val[1] # minimum and maximum y values
    # since val's aren't strictly the same, we'll average the val's to get a
    # more accurate view of what val actually is
    for l in list:
        # each entry is compared against the FIRST entry of the current run
        # (val), not against the running average
        if misc.isEqual(val[0],l[0],Xequalness):
            f = f + 1 # we've found another val
            valcum = (valcum[0] + l[0], valcum[1] + l[1])
            if l[1] < min:
                min = l[1]
            elif l[1] > max:
                max = l[1]
        else:
            # the run has ended: emit its record
            output.append( (f, round(valcum[0]/f,1),round(valcum[1]/f,1), min, max ) )
            # reset
            val = l # next val
            f = 1 # and we've already found one
            valcum = l
            min = max = val[1]
    # emit the record for the final run
    output.append( (f, round(valcum[0]/f,1),round(valcum[1]/f,1), min, max) )
    output.sort() # sort on frequency (pri) and x (sec)
    # output now contains the list of consecutive frequencies.
    # I found the following few lines of code very difficult to comment,
    # so please excuse the bad explanation.
    #
    # 'output' contains a list of (consecutive frequency,value) pairs. That
    # means you can have several records that have the same value, but since
    # they weren't consecutive in the input list, they were not combined.
    #
    # for example, [(1,10),(2,8),(1,10),(1,9),(5,10)]
    #
    # in that example, 10 is the value with a sufficiently high frequency,
    # and so all values less than 10 must be deleted and all (x,10) records
    # must be combined.
    # this code finds the first record whose f is >= MinConsecFreq
    i = 0 # where to cut
    while i < len(output) and output[i][0] < MinConsecFreq:
        i = i + 1 # find first i where output[i] >= MinConsecFreq
    # if i points off the list, no eligible candidates were found.
    # When this happens, we take the largest frequency found, which will
    # be the last record in the list.
    if i == len(output):
        i = i - 1
    # this code backtracks to ensure that any occurrences of the val
    # associated with the i'th record also get included. Without this,
    # the frequency reported for the val[i] might be too small because
    # some small groups (less than MinConsecFreq) were discounted
    # because their f's were too small. Sort of. Try and make sense
    # of that.
    # In effect: of the records before index i, keep only those whose x
    # equals the x of record i; every deletion shifts record i down by one,
    # hence the matching decrement of i.
    j = 0
    while j < i:
        if misc.isEqual(output[j][1],output[i][1],Xequalness):
            j = j + 1
        else:
            del output[j]
            i = i - 1
    # output now contains only eligible frequencies which must now be combined
    # ie, [(1,10),(2,10),(1,11)] ==> [(3,10),(1,11)]
    # NOTE(review): the sort-by-x below reads as commented out here; confirm
    # whether it is meant to be active, since the merge loop that follows
    # only combines records that are adjacent in 'output'.
    # output.sort( lambda a,b: int(a[1] - b[1]) ) # sort on x value
    i = 0
    while i <= len(output) - 2: # go from 0 to second to last index
        if misc.isEqual( output[i][1], output[i+1][1],Xequalness ): #abs(output[i][1] - output[i+1][1]) <= 2:
            # merge the neighbour into record i and re-test the same index
            # against the new neighbour (i is deliberately not advanced)
            output[i] = self.mergeRecords( output[i], output[i+1], lambda a,b: round((a+b)/2) )
            del output[i+1]
        else:
            i = i + 1
    return output
def calcValues(self, MOLS, MOLEN, modeRightMargins, lastPageLastLine):
    """Compute the per-line feature values and flags for every line.

    For each line in ``self.lines`` this sets its right-edge class (via
    ``setRightEdge``), its local average linespace, and a set of attributes
    comparing the line's linespace and length against ``MOLS``, ``MOLEN``,
    the local average and its neighbouring lines.  Flags are only ever set
    to ``YES`` here; they are never cleared.

    MOLS             -- reference linespace the line's linespace is compared to
    MOLEN            -- reference line length the line lengths are compared to
    modeRightMargins -- passed unchanged to each line's ``setRightEdge``
    lastPageLastLine -- a line object or None; only consulted for lines whose
                        linespace is unknown ('?')

    NOTE(review): statement nesting was reconstructed from data flow (every
    statement using LS_CL arithmetic or LN_PL sits in the ``else`` branch);
    confirm against the original layout.
    """
    global Xequalness
    global Yequalness
    for i in range(len(self.lines)):
        self.lines[i].setRightEdge(modeRightMargins)
        # AVLS_CL -- local average linespace around the current line
        AVLS_CL=self.lines[i].localAverage=self.calcLocalAverage(i,MOLS)
        NL_AVLS=self.lines[i].getNumberLocalAverage() # NL_AVLS -- Number of lines in the local average calc
        LS_CL=self.lines[i].getLS() # LS_CL -- linespace of the current line
        # LS_NL -- linespace of the next line; the last line gets 10*MOLS
        if i < len(self.lines)-1:
            LS_NL=self.lines[i+1].getLS()
        else:
            LS_NL=10*MOLS
        if LS_CL == '?':
            # unknown linespace: only the last line of the previous page
            # (if given) contributes features
            self.lines[i].LS_CL_missing = YES
            if lastPageLastLine != None:
                if misc.isEqual(lastPageLastLine.getlength(),MOLEN,Xequalness):
                    self.lines[i].lastpagelastline_eq_MOLEN = YES
                if misc.isEqual(lastPageLastLine.getx1(),lastPageLastLine.getRTmargin(),Xequalness):
                    self.lines[i].RTedge_lastpagelastline_eq_RTmargin = YES
        else:
            # We are only interested in the absolute space between lines,
            # and no longer direction, and it is important that we make sure,
            # we do not have any negative values in the linespaces from now on.
            LS_CL=abs(LS_CL)
            # NOTE(review): for i == 0 the index i-1 is -1, i.e. the LAST line
            # of self.lines; presumably the first line always has LS '?' so
            # this branch is not reached for it -- confirm.
            LN_PL=self.lines[i-1].getlength() # LN_PL -- Length of the previous line
            LN_CL=self.lines[i].getlength() # LN_CL -- Length of the current line
            self.lines[i].diffLS_AVLS = LS_CL-AVLS_CL
            self.lines[i].diffModeLineSpace = LS_CL-MOLS
            self.lines[i].diffModeLength = LN_CL-MOLEN
            if self.calcIndent(i, MOLS) != 0:
                self.lines[i].isIndented = YES
            # absolute left-edge offsets to the previous / next line
            if i > 0:
                self.lines[i].lastindent = abs(self.lines[i].getx0()-self.lines[i-1].getx0())
            if i < len(self.lines)-1:
                self.lines[i].nextindent = abs(self.lines[i].getx0()-self.lines[i+1].getx0())
            # linespace comparisons (tolerance Yequalness)
            if misc.isEqual(LS_CL,MOLS,Yequalness):
                self.lines[i].LS_CL_eq_MOLS = YES
            if misc.isEqual(LS_CL,AVLS_CL,Yequalness):
                self.lines[i].LS_CL_eq_AVLS = YES
            if misc.isEqual(LS_CL,LS_NL,Yequalness):
                self.lines[i].LS_CL_eq_LS_NL = YES
            # exact (tolerance-free) comparisons against the local average
            if LS_CL == AVLS_CL:
                self.lines[i].LS_CL_exactly_AVLS = YES
            if LS_CL < AVLS_CL:
                self.lines[i].LS_CL_lt_AVLS = YES
            if LS_CL > AVLS_CL:
                self.lines[i].LS_CL_gt_AVLS = YES
            if LS_CL > MOLS * 10:
                self.lines[i].isColumnbreak = YES
            if NL_AVLS == 1:
                self.lines[i].NL_AVLS_eq_1 = YES
            if misc.isEqual(LS_NL,MOLS,Yequalness):
                self.lines[i].LS_NL_eq_MOLS = YES
            # line-length comparisons (tolerance Xequalness)
            if misc.isEqual(LN_PL,MOLEN,Xequalness):
                self.lines[i].LN_PL_eq_MOLEN = YES
            if misc.isEqual(LN_CL,MOLEN,Xequalness):
                self.lines[i].LN_CL_eq_MOLEN = YES
            # does the previous / current line end on its right margin?
            if misc.isEqual(self.lines[i-1].getx1(),self.lines[i-1].getRTmargin(),Xequalness):
                self.lines[i].RTedge_PL_eq_RTmargin = YES
            if misc.isEqual(self.lines[i ].getx1(),self.lines[i ].getRTmargin(),Xequalness):
                self.lines[i].RTedge_CL_eq_RTmargin = YES
def setRTmargin(self, rtMargins):
    """Pick this line's right margin from ``rtMargins`` and record the gap.

    The margins are scanned in order; the first one that the line's right
    edge (``getx1()``) equals (within ``Xequalness``) or lies to the left of
    is chosen.  If none qualifies, the last margin in the list is used.

    Sets ``self.rtmargin`` to the chosen margin and ``self.rtwhitespace`` to
    ``rtmargin - x1`` rounded to one decimal place.

    rtMargins -- sequence of margin x positions; when it is empty
                 ``self.rtmargin`` is not assigned here, so any previously
                 set value is used for the whitespace calculation
    """
    final_index = len(rtMargins) - 1
    for index, margin in enumerate(rtMargins):
        # The last margin is the catch-all and is accepted unconditionally.
        if (index == final_index
                or misc.isEqual(self.getx1(), margin, Xequalness)
                or self.getx1() < margin):
            self.rtmargin = margin
            break
    gap = self.rtmargin - self.x1
    self.rtwhitespace = round(gap, 1)