Exemplo n.º 1
0
 def domatch(self):
     """
     Find every occurrence of ``self.pattern`` in ``self.lines``.

     The pattern is first passed through ``normalizer.normalize_pattern``
     and compiled (case-insensitively when ``self.icase`` is true). Each
     element of ``self.lines`` is read as ``(pageno, lineno, content)``.

     For every match a tuple ``(pageno, lineno, location, context)`` is
     produced, where:

     * ``context`` is the slice of the line around the match. When
       ``self.context > 0`` the slice bounds come from ``self.startindex``
       and ``match.end() + self.context``; otherwise they come from
       ``self.startposition`` / ``self.endposition``.
     * ``location`` is ``self.matchlocation(lineno, toc_dictionary)`` when
       ``self.location`` is true, and the literal ``'All'`` otherwise.

     Returns a list with one inner list per line that has at least one
     match, e.g.::

         [[(1, 1, location1, '...match...')],
          [(1, 20, location2, '...match...'), (1, 20, location3, '...match...')]]

     Side effect: ``self.count`` is set to the total number of matches.
     """
     import re
     flags = re.I if self.icase else 0
     regex = re.compile(normalizer.normalize_pattern(self.pattern), flags)

     # The TOC lookup table depends only on self.file, self.dictionary and
     # self.lines, none of which change while matching. Build it lazily on
     # the first match that needs it and reuse it afterwards, instead of
     # rebuilding it for every single match.
     # NOTE(review): assumes self.matchlocation does not mutate the table
     # it is given -- confirm against its implementation.
     toc_dictionary = None

     count = 0
     results = []
     for line in self.lines:
         # line: (pageno, lineno, content)
         pageno = line[0]
         lineno = line[1]
         linecontent = line[2]

         res = []
         # finditer yields nothing for a non-matching line, so a separate
         # search() beforehand would only scan the line a second time.
         for match in regex.finditer(linecontent):
             count += 1
             # context window around the match
             if self.context > 0:
                 s = self.startindex(match.start(), self.context)
                 e = match.end() + self.context
             else:
                 s = self.startposition(match.start(), linecontent)
                 e = self.endposition(match.end(), linecontent)
             # location of the match within the table of contents
             if self.location:
                 if toc_dictionary is None:
                     if self.dictionary:
                         toc = TOC(self.file, path=self.dictionary)
                     else:
                         toc = TOC(self.file)
                     toc_dictionary = toc.gettoc_filter_by_dictionary(self.lines)
                 location = self.matchlocation(lineno, toc_dictionary)
             else:
                 location = 'All'
             res.append((pageno, lineno, location, linecontent[s:e]))
         if res:
             results.append(res)
     self.count = count
     return results
Exemplo n.º 2
0
def main(argv):
    """
    Command-line entry point: build a TOC for each input file and write it out.

    ``argv`` is the full argument vector; ``argv[0]`` is used as the program
    name in the usage text and ``argv[1:]`` is parsed with ``getopt``.

    For every positional argument a ``TOC`` object is created with the
    selected ``minlen``, ``maxlen``, ``threshold`` and ``path``. The TOC is
    built with ``buildtoc_dict()`` when ``-d/--dictionary`` is given and with
    ``buildtoc()`` otherwise, and the result of ``toc.formats(...)`` is
    written to the output file (``-o``) or to ``sys.stdout``.

    Returns 100 after printing the usage text (unknown option, no input
    file, a non-numeric value for ``-i``/``-m``/``-t``, or ``-h``), and
    ``None`` after normal completion.
    """
    import getopt

    def usage():
        # Print the help text and return the exit status used for usage errors.
        print('Usage: %s [Option] File ...\n'
              'Options:\n'
              '    -o, --output OUTFILE \n'
              '        Specify the output file. \n'
              '    -i, --minlen Num \n'
              '        Minimum length passed to the TOC builder (default: 4). \n'
              '    -m, --maxlen Num \n'
              '        Maximum length passed to the TOC builder (default: 25). \n'
              '    -f, --file-prefix \n'
              '        Prefix each line of output with input file. \n'
              '    -n, --line-number \n'
              '        Prefix each line of output with 1-based line number within its txt file. \n'
              '    -t, --threshold FLOAT \n'
              '        The probability of a TOC item can be computed using p=frequency/total. \n'
              '        The TOC items with probability p < FLOAT will be ignored in output. \n'
              '        FLOAT should be between 0 and 1. \n'
              '    -p, --path PATH \n'
              '        Specify the TOC dictionary directory. \n'
              '    -d, --dictionary \n'
              '        Use TOC dictionary as filter in constructing TOC. \n'
              '    -h, --help \n'
              '        Print usage information. \n' % argv[0])
        return 100

    try:
        # 'dictionary' is a plain flag, matching the short option 'd' and the
        # usage text. Declaring it as 'dictionary=' made --dictionary swallow
        # the next command-line token as its value.
        (opts, args) = getopt.getopt(argv[1:], 'o:i:m:fnt:p:dh',
                                     ['output=', 'minlen=', 'maxlen=', 'file-prefix',
                                      'line-number', 'threshold=', 'path=', 'dictionary', 'help'])
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # option defaults
    outfile = None
    minlen = 4
    maxlen = 25
    pfile = False
    pnumber = False
    threshold = 0.05
    path = '../docs/json'
    dictionary = False

    try:
        for (k, v) in opts:
            if k in ('-o', '--output'):
                outfile = v
            elif k in ('-i', '--minlen'):
                minlen = int(v)
            elif k in ('-m', '--maxlen'):
                maxlen = int(v)
            elif k in ('-f', '--file-prefix'):
                pfile = True
            elif k in ('-n', '--line-number'):
                pnumber = True
            elif k in ('-t', '--threshold'):
                threshold = float(v)
            elif k in ('-p', '--path'):
                path = v
            elif k in ('-d', '--dictionary'):
                dictionary = True
            elif k in ('-h', '--help'):
                return usage()
    except ValueError:
        # A non-numeric value for -i/-m/-t is a usage error, not a crash.
        return usage()

    if outfile:
        f = open(outfile, 'w')
    else:
        f = sys.stdout

    try:
        # extract the TOC of each input file
        for pdffile in args:
            toc = TOC(pdffile, minlen=minlen, maxlen=maxlen,
                      threshold=threshold, path=path)

            if dictionary:
                toc_toc = toc.buildtoc_dict()
            else:
                toc_toc = toc.buildtoc()

            # output
            f.write('\n{0}\n{1}'.format(pdffile, ''.join(toc.formats(toc_toc, pfile, pnumber))))
    finally:
        # Close only a file we opened ourselves; sys.stdout belongs to the
        # interpreter and must stay usable after main() returns.
        if outfile:
            f.close()

    return
Exemplo n.º 3
0
def main(argv):
    """
    Command-line entry point: build a TOC for each input file and write it out.

    ``argv`` is the full argument vector; ``argv[0]`` is used as the program
    name in the usage text and ``argv[1:]`` is parsed with ``getopt``.

    For every positional argument a ``TOC`` object is created with the
    selected ``minlen``, ``maxlen``, ``threshold`` and ``path``. The TOC is
    built with ``buildtoc_dict()`` when ``-d/--dictionary`` is given and with
    ``buildtoc()`` otherwise, and the result of ``toc.formats(...)`` is
    written to the output file (``-o``) or to ``sys.stdout``.

    Returns 100 after printing the usage text (unknown option, no input
    file, a non-numeric value for ``-i``/``-m``/``-t``, or ``-h``), and
    ``None`` after normal completion.
    """
    import getopt

    def usage():
        # Print the help text and return the exit status used for usage errors.
        print(
            "Usage: %s [Option] File ...\n"
            "Options:\n"
            "    -o, --output OUTFILE \n"
            "        Specify the output file. \n"
            "    -i, --minlen Num \n"
            "        Minimum length passed to the TOC builder (default: 4). \n"
            "    -m, --maxlen Num \n"
            "        Maximum length passed to the TOC builder (default: 25). \n"
            "    -f, --file-prefix \n"
            "        Prefix each line of output with input file. \n"
            "    -n, --line-number \n"
            "        Prefix each line of output with 1-based line number within its txt file. \n"
            "    -t, --threshold FLOAT \n"
            "        The probability of a TOC item can be computed using p=frequency/total. \n"
            "        The TOC items with probability p < FLOAT will be ignored in output. \n"
            "        FLOAT should be between 0 and 1. \n"
            "    -p, --path PATH \n"
            "        Specify the TOC dictionary directory. \n"
            "    -d, --dictionary \n"
            "        Use TOC dictionary as filter in constructing TOC. \n"
            "    -h, --help \n"
            "        Print usage information. \n" % argv[0]
        )
        return 100

    try:
        # "dictionary" is a plain flag, matching the short option "d" and the
        # usage text. Declaring it as "dictionary=" made --dictionary swallow
        # the next command-line token as its value.
        (opts, args) = getopt.getopt(
            argv[1:],
            "o:i:m:fnt:p:dh",
            [
                "output=",
                "minlen=",
                "maxlen=",
                "file-prefix",
                "line-number",
                "threshold=",
                "path=",
                "dictionary",
                "help",
            ],
        )
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # option defaults
    outfile = None
    minlen = 4
    maxlen = 25
    pfile = False
    pnumber = False
    threshold = 0.05
    path = "../docs/json"
    dictionary = False

    try:
        for (k, v) in opts:
            if k in ("-o", "--output"):
                outfile = v
            elif k in ("-i", "--minlen"):
                minlen = int(v)
            elif k in ("-m", "--maxlen"):
                maxlen = int(v)
            elif k in ("-f", "--file-prefix"):
                pfile = True
            elif k in ("-n", "--line-number"):
                pnumber = True
            elif k in ("-t", "--threshold"):
                threshold = float(v)
            elif k in ("-p", "--path"):
                path = v
            elif k in ("-d", "--dictionary"):
                dictionary = True
            elif k in ("-h", "--help"):
                return usage()
    except ValueError:
        # A non-numeric value for -i/-m/-t is a usage error, not a crash.
        return usage()

    if outfile:
        f = open(outfile, "w")
    else:
        f = sys.stdout

    try:
        # extract the TOC of each input file
        for pdffile in args:
            toc = TOC(pdffile, minlen=minlen, maxlen=maxlen, threshold=threshold, path=path)

            if dictionary:
                toc_toc = toc.buildtoc_dict()
            else:
                toc_toc = toc.buildtoc()

            # output
            f.write("\n{0}\n{1}".format(pdffile, "".join(toc.formats(toc_toc, pfile, pnumber))))
    finally:
        # Close only a file we opened ourselves; sys.stdout belongs to the
        # interpreter and must stay usable after main() returns.
        if outfile:
            f.close()

    return