Пример #1
0
 def setRightEdge(self, modertm):
    """Classify this line's right edge (self.x1) and store it in self.rightedge.

    modertm -- iterable of candidate right-margin x values.

    Result stored in self.rightedge:
       1 -- x1 matches one of the values in modertm
       2 -- no match in modertm, but x1 matches this line's own self.rtmargin
       3 -- neither of the above
    All comparisons go through misc.isEqual with the Xequalness tolerance.
    When modertm is empty nothing is assigned at all.
    """
    for modeMargin in modertm:
       if misc.isEqual(self.x1, modeMargin, Xequalness):
          self.rightedge = 1
          return
       # Provisional classification; a later entry of modertm can still
       # override it with 1.
       matchesOwnMargin = misc.isEqual(self.x1, self.rtmargin, Xequalness)
       self.rightedge = 2 if matchesOwnMargin else 3
Пример #2
0
   def calcIndent(self, i, MOLS):
      """Return how far line i is indented relative to the next line, or 0.

      The left edge (getx0) of line i is compared with the left edges of its
      neighbours.  For the first line the "previous" edge is the line's own
      edge; for the last line the "next" edge is the previous edge.

      i    -- index into self.lines of the line to examine
      MOLS -- reference line spacing; a line space greater than MOLS*10 is
              one of the conditions that lets the line be tested for indent

      Returns abs(nextEdge - currentEdge) when the line is considered
      indented, otherwise 0.
      """
      lines = self.lines

      currentEdge = lines[i].getx0()
      if i != 0:
         prevEdge = lines[i-1].getx0()
      else:
         prevEdge = currentEdge

      if i == len(lines) - 1:
         nextEdge = prevEdge            # last line of the page
      else:
         nextEdge = lines[i+1].getx0()

      # If any edge is negative, shift all three by the same bias so that the
      # smallest becomes 0.  Each edge must be shifted from its OWN value:
      # deriving currentEdge/nextEdge from prevEdge would make them identical
      # and the indentation test below could never succeed.
      lowest = min(prevEdge, currentEdge, nextEdge)
      if lowest < 0:
         bias = abs(lowest)
         prevEdge = prevEdge + bias
         currentEdge = currentEdge + bias
         nextEdge = nextEdge + bias

      # The line is a candidate when its line space is unknown ("?"), when
      # the two neighbouring lines are aligned, or when the line space is
      # very large.  The "?" check must stay first so that the numeric
      # comparison is never attempted on the "?" marker.
      lineSpace = lines[i].getLS()
      isCandidate = (lineSpace == "?"
                     or misc.isEqual(prevEdge, nextEdge, Xequalness)
                     or lineSpace > MOLS*10)

      # ...and it is indented when its left edge lies beyond the next line's
      # left edge by more than the edgemargin fraction.
      if isCandidate and currentEdge > ((1 + edgemargin) * nextEdge):
         return abs(nextEdge - currentEdge)
      return 0
Пример #3
0
   def calcMargins(self, MOLS, MOLEN):
      """Determine the page's right margins and assign one to every line.

      MOLS  -- not used by this method (kept for interface compatibility)
      MOLEN -- reference line length; only lines at least MOLEN/MaxColumns
               long (within the Xequalness tolerance) are considered

      The (x1, y) pairs of the qualifying lines are condensed by findFreq
      into records of the form (f, x-av, y-av, y-min, y-max), which are
      stored in self.rtMargins ordered by x.  At most MaxColumns records
      (those with the largest x) are kept.  If any margin remains, every
      line in self.lines receives the list of margin x values through
      setRTmargin; otherwise the method returns without touching the lines.
      """
      # Only consider a line if it is wide enough to possibly be a whole
      # line, using the approximation that there will be at most MaxColumns
      # columns on one page.
      minLineLen = MOLEN/MaxColumns
      x1list = []
      for line in self.lines:
         lineLen = line.getlength()
         if lineLen > minLineLen or misc.isEqual(lineLen, minLineLen, Xequalness):
            x1list.append((line.getx1(), line.gety()))

      # Compress the list down to frequency records and make sure they are
      # ordered by their x value.  A key-based sort is used because a
      # comparator that truncates the difference to an int treats x values
      # less than 1 apart as equal and so does not give a reliable order.
      self.rtMargins = self.findFreq(x1list)
      self.rtMargins.sort(key=lambda record: record[1])

      # We assume there will be no more than MaxColumns (real) columns in the
      # page.  More can be apparent due to tables, non-text objects, etc, and
      # this serves to eliminate some of these.
      if len(self.rtMargins) > MaxColumns:
         del self.rtMargins[:-MaxColumns]

      if not self.rtMargins:
         return # no columns were found, so can't apply them
      # lines with no right margin set will be handled in Page.predict_para

      # setRTmargin only needs the x value of each margin, not the full record
      marginXs = [record[1] for record in self.rtMargins]

      # now each line must be set with its margin as appropriate
      for line in self.lines:
         line.setRTmargin(marginXs)
Пример #4
0
   def findFreq( self, list ):
      """Condense a sequence of (x, y) pairs into frequency records.

      list -- sequence of (x, y) pairs (the name shadows the builtin but is
              kept so that existing callers are unaffected)

      Returns a list of records ordered by x.  Before merging, each record is
      (f, x-av, y-av, y-min, y-max): the number of consecutive input pairs
      whose x matched (within Xequalness), their average x and y rounded to
      one decimal, and the smallest and largest y seen.  Records whose x
      values match are then combined with self.mergeRecords (defined
      elsewhere; the exact shape of a merged record depends on it).

      An empty input logs a warning through msg and returns [].
      """
      points = list
      if len(points) == 0:        # can't operate on nothing
         msg( "findFreq: WARNING: called with an empty input list!!" )
         return []

      # --- Step 1: collapse runs of consecutive pairs with matching x -------
      # Every pair is compared with the FIRST pair of the current run; the
      # x and y values of the run are averaged to get a more accurate value.
      output = []
      runStart = points[0]
      count = 0
      xSum = 0
      ySum = 0
      yLow = yHigh = runStart[1]

      for pt in points:
         if misc.isEqual(runStart[0], pt[0], Xequalness):
            count = count + 1
            xSum = xSum + pt[0]
            ySum = ySum + pt[1]
            if pt[1] < yLow:
               yLow = pt[1]
            elif pt[1] > yHigh:
               yHigh = pt[1]
         else:
            output.append( (count, round(xSum/count,1), round(ySum/count,1), yLow, yHigh) )
            # start a new run with this pair as its first member
            runStart = pt
            count = 1
            xSum = pt[0]
            ySum = pt[1]
            yLow = yHigh = pt[1]

      output.append( (count, round(xSum/count,1), round(ySum/count,1), yLow, yHigh) )
      output.sort()       # sort on frequency (pri) and x (sec)

      # --- Step 2: discard low-frequency records ---------------------------
      # output holds (consecutive frequency, value, ...) records, so the same
      # value can appear in several records that were not adjacent in the
      # input, e.g. (f, x) = [(1,10),(2,8),(1,10),(1,9),(5,10)].  There, 10 is
      # the value with a sufficiently high frequency: the records for 8 and 9
      # must go, and all records for 10 must be combined.

      # find the first record whose frequency is >= MinConsecFreq
      cut = 0
      while cut < len(output) and output[cut][0] < MinConsecFreq:
         cut = cut + 1

      # If no record is eligible, fall back to the largest frequency found,
      # which is the last record of the sorted list.
      if cut == len(output):
         cut = cut - 1

      # Among the records below the cut keep only those whose x matches the
      # x of the record at the cut, so that their counts are not lost when
      # the records are combined below.  (The reference record must stay
      # fixed while scanning; moving it after every kept record ends the scan
      # early and leaves unrelated low-frequency records in the result.)
      refX = output[cut][1]
      kept = [rec for rec in output[:cut] if misc.isEqual(rec[1], refX, Xequalness)]
      output = kept + output[cut:]

      # --- Step 3: combine records with matching x -------------------------
      # ie, (f, x) = [(1,10),(2,10),(1,11)] ==> [(3,10),(1,11)]
      # A key-based sort gives an exact ordering on x; a comparator that
      # truncates the difference to an int would treat values less than 1
      # apart as equal.
      output.sort( key=lambda rec: rec[1] )
      i = 0
      while i <= len(output) - 2:       # go from 0 to second to last index
         if misc.isEqual( output[i][1], output[i+1][1], Xequalness ):
            output[i] = self.mergeRecords( output[i], output[i+1], lambda a,b: round((a+b)/2) )
            del output[i+1]             # stay on i: the merged record may match the next one too
         else:
            i = i + 1

      return output
Пример #5
0
   def calcValues(self, MOLS, MOLEN, modeRightMargins, lastPageLastLine):
      """Compute the per-line feature attributes for every line of the page.

      MOLS             -- reference line spacing
      MOLEN            -- reference line length
      modeRightMargins -- passed on unchanged to each line's setRightEdge
      lastPageLastLine -- last line of the preceding page, or None

      For every line the right edge classification and the local average
      line space (attribute localAverage) are set.  A line whose line space
      is '?' only gets LS_CL_missing and, when lastPageLastLine is given, the
      two lastpagelastline flags.  Every other line gets the diff*
      attributes, the indent attributes and a set of YES flags describing
      how its line space, length and right edge relate to the reference
      values and to its neighbours.  Flags are only ever set to YES here;
      they are never cleared.
      """
      lastIndex = len(self.lines) - 1

      for i, cur in enumerate(self.lines):
         cur.setRightEdge(modeRightMargins)

         AVLS_CL = self.calcLocalAverage(i, MOLS)   # local average line space
         cur.localAverage = AVLS_CL
         NL_AVLS = cur.getNumberLocalAverage()      # number of lines in the local average calc
         LS_CL = cur.getLS()                        # line space of the current line
         if i < lastIndex:
            LS_NL = self.lines[i+1].getLS()         # line space of the next line
         else:
            LS_NL = 10*MOLS                         # no next line on this page

         if LS_CL == '?':
            # Unknown line space: only the previous page's last line can be
            # consulted, then move on to the next line.
            cur.LS_CL_missing = YES
            if lastPageLastLine != None:
               if misc.isEqual(lastPageLastLine.getlength(), MOLEN, Xequalness):
                  cur.lastpagelastline_eq_MOLEN = YES
               if misc.isEqual(lastPageLastLine.getx1(), lastPageLastLine.getRTmargin(), Xequalness):
                  cur.RTedge_lastpagelastline_eq_RTmargin = YES
            continue

         # We are only interested in the absolute space between lines, not
         # its direction, so no negative line space is used from here on.
         LS_CL = abs(LS_CL)

         # NOTE(review): for i == 0 index i-1 wraps around to the LAST line
         # of the page -- confirm whether the first line can reach this point.
         prev = self.lines[i-1]
         LN_PL = prev.getlength()                   # length of the previous line
         LN_CL = cur.getlength()                    # length of the current line

         cur.diffLS_AVLS = LS_CL - AVLS_CL
         cur.diffModeLineSpace = LS_CL - MOLS
         cur.diffModeLength = LN_CL - MOLEN

         if self.calcIndent(i, MOLS) != 0:
            cur.isIndented = YES
         if i > 0:
            cur.lastindent = abs(cur.getx0() - prev.getx0())
         if i < lastIndex:
            cur.nextindent = abs(cur.getx0() - self.lines[i+1].getx0())

         # (attribute name, condition) pairs; each attribute is set to YES
         # when its condition holds.
         flags = (
            ("LS_CL_eq_MOLS",         misc.isEqual(LS_CL, MOLS, Yequalness)),
            ("LS_CL_eq_AVLS",         misc.isEqual(LS_CL, AVLS_CL, Yequalness)),
            ("LS_CL_eq_LS_NL",        misc.isEqual(LS_CL, LS_NL, Yequalness)),
            ("LS_CL_exactly_AVLS",    LS_CL == AVLS_CL),
            ("LS_CL_lt_AVLS",         LS_CL < AVLS_CL),
            ("LS_CL_gt_AVLS",         LS_CL > AVLS_CL),
            ("isColumnbreak",         LS_CL > MOLS * 10),
            ("NL_AVLS_eq_1",          NL_AVLS == 1),
            ("LS_NL_eq_MOLS",         misc.isEqual(LS_NL, MOLS, Yequalness)),
            ("LN_PL_eq_MOLEN",        misc.isEqual(LN_PL, MOLEN, Xequalness)),
            ("LN_CL_eq_MOLEN",        misc.isEqual(LN_CL, MOLEN, Xequalness)),
            ("RTedge_PL_eq_RTmargin", misc.isEqual(prev.getx1(), prev.getRTmargin(), Xequalness)),
            ("RTedge_CL_eq_RTmargin", misc.isEqual(cur.getx1(), cur.getRTmargin(), Xequalness)),
         )
         for attrName, holds in flags:
            if holds:
               setattr(cur, attrName, YES)
Пример #6
0
 def setRTmargin(self, rtMargins):
    """Pick this line's right margin from rtMargins and record the gap to it.

    rtMargins -- sequence of candidate right-margin x values.

    The first candidate that the line's right edge (getx1) matches within
    Xequalness, or lies to the left of, becomes self.rtmargin; if none
    qualifies, the last candidate is used.  self.rtwhitespace is then set to
    rtmargin - x1, rounded to one decimal.
    """
    lastIdx = len(rtMargins) - 1
    for idx, margin in enumerate(rtMargins):
       # The last candidate is accepted unconditionally as a fallback.
       accept = (idx == lastIdx
                 or misc.isEqual(self.getx1(), margin, Xequalness)
                 or self.getx1() < margin)
       if accept:
          self.rtmargin = margin
          break
    self.rtwhitespace = round(self.rtmargin - self.x1, 1)