Exemplo n.º 1
0
 def domatch(self):
     """
     Find and count every occurrence of ``self.pattern`` in ``self.lines``.

     The pattern is first passed through ``normalizer.normalize_pattern`` and
     compiled (case-insensitively when ``self.icase`` is truthy). Each entry
     of ``self.lines`` is indexed as ``(pageno, lineno, content)``.

     Each match is stored as a tuple ``(pageno, lineno, location, context)``;
     all matches found in one line are collected in one list, and only lines
     with at least one match contribute a list to the result:

         [[(pageno, lineno, location, context), ...], ...]
         [[(1, 1, location1, ...match...)],
          [(1, 20, location2, ...match...), (1, 20, location2, ...match...)]]

     ``context`` is the slice of the line content around the match:
       * when ``self.context > 0`` it runs from
         ``self.startindex(match.start(), self.context)`` to
         ``match.end() + self.context``;
       * otherwise its bounds come from ``self.startposition`` and
         ``self.endposition``.

     ``location`` is ``self.matchlocation(lineno, toc_dictionary)`` when
     ``self.location`` is truthy, and the literal ``'All'`` otherwise.

     Side effect: ``self.count`` is set to the total number of matches.

     Returns:
         The list of per-line match lists described above (empty when
         nothing matches).
     """
     import re

     flags = re.I if self.icase else 0
     pattern = re.compile(normalizer.normalize_pattern(self.pattern), flags)

     # The filtered table of contents depends only on self.file,
     # self.dictionary and self.lines, none of which change while matching,
     # so it is built lazily (only if a match needs it) and at most once
     # instead of once per match.
     toc_ready = False
     toc_dictionary = None

     count = 0
     results = []
     for line in self.lines:
         # line: (pageno, lineno, content)
         pageno = line[0]
         lineno = line[1]
         linecontent = line[2]

         # finditer alone is enough: it yields nothing for a non-matching
         # line, so a separate pattern.search() pre-check would only scan
         # the line a second time.
         res = []
         for match in pattern.finditer(linecontent):
             count += 1

             # context boundaries
             if self.context > 0:
                 s = self.startindex(match.start(), self.context)
                 # May exceed len(linecontent); slicing clamps it.
                 e = match.end() + self.context
             else:
                 s = self.startposition(match.start(), linecontent)
                 e = self.endposition(match.end(), linecontent)

             # location
             if self.location:
                 if not toc_ready:
                     if self.dictionary:
                         toc = TOC(self.file, path=self.dictionary)
                     else:
                         toc = TOC(self.file)
                     toc_dictionary = toc.gettoc_filter_by_dictionary(self.lines)
                     toc_ready = True
                 location = self.matchlocation(lineno, toc_dictionary)
             else:
                 location = 'All'

             res.append((pageno, lineno, location, linecontent[s:e]))

         # Only lines that actually matched contribute an entry.
         if res:
             results.append(res)

     self.count = count
     return results