예제 #1
0
def main():
    """Dump the tag structure of every PDF in ``dirin`` into ``dirout``.

    The input names come from ``list_files(dirin)`` (defined elsewhere).
    Each input is run through a pdfminer ``TagExtractor`` and written next
    to its base name with a ``.tag`` extension.  Progress is printed to
    stdout.  Input file, output file and device are released even if a page
    fails to process.
    """
    dirout = '/Users/dariaulybina/Desktop/georgetown/global-economics/convert_pdfs/pdfminer_p2/tag_converted_docs/'
    dirin = '/Users/dariaulybina/Desktop/georgetown/global-economics/scrape_articles/pdfs_downloaded/'
    codec = 'utf-8'
    caching = True

    pdf_list = list_files(dirin)

    # Single-argument print(...) behaves the same with and without
    # ``print_function``.
    print(pdf_list)

    for fn in pdf_list:
        fname = os.path.join(dirin, fn)
        print(fname)
        # Swap only the real extension; str.replace would also rewrite a
        # '.pdf' that happens to occur in the middle of the name.
        outfile = os.path.join(dirout, os.path.splitext(fn)[0] + '.tag')
        with open(fname, 'rb') as fp, open(outfile, 'w') as outfp:
            rsrcmgr = PDFResourceManager(caching=caching)
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
            finally:
                # Close the device before the ``with`` block closes outfp.
                device.close()
        print('Document done')
    print('Finished all documents')
예제 #2
0
def getTransContent(fp, resultfp):
    """Extract the tag dump of a PDF and pass it to ``parseTrans``.

    Args:
        fp: Open binary file object for the PDF.  It is closed by this
            function once all pages have been processed (or on error).
        resultfp: Passed through unchanged as the second argument of
            ``parseTrans`` (defined elsewhere).

    The pages are rendered by a ``TagExtractor`` into an in-memory buffer;
    the buffer's full contents are then handed to ``parseTrans``.
    """
    outfp = cStringIO.StringIO()

    # pdfminer settings: all pages, no password, no extra rotation.
    password = ''
    pagenos = set()
    maxpages = 0
    rotation = 0
    codec = 'utf-8'
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    device = TagExtractor(rsrcmgr, outfp, codec=codec)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        try:
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Same contract as before (fp is consumed), but now also
            # honoured when a page raises.
            fp.close()

        parseTrans(outfp.getvalue(), resultfp)
    finally:
        device.close()
예제 #3
0
    def run(self):
        """Convert the PDFs named in ``self._args`` with the configured device.

        When this module runs as a script (``__name__ == '__main__'``) the
        output goes to ``self._outfile`` (or stdout when unset) and, if
        ``self._outtype`` is unset, the type is inferred from the output
        file's extension.  Otherwise the output is collected in memory and
        returned.

        Returns:
            The collected output string when not run as a script; the result
            of ``usage()`` (defined elsewhere) for an unknown output type;
            otherwise None.
        """
        as_script = __name__ == '__main__'
        if not self._outtype:
            self._outtype = 'text'
            if as_script and self._outfile:
                if self._outfile.endswith(('.htm', '.html')):
                    self._outtype = 'html'
                elif self._outfile.endswith('.xml'):
                    self._outtype = 'xml'
                elif self._outfile.endswith('.tag'):
                    self._outtype = 'tag'
        # Validate before opening anything so a bad type neither truncates
        # an existing output file nor leaks a handle.
        if self._outtype not in ('text', 'xml', 'html', 'tag'):
            return usage()

        rsrcmgr = PDFResourceManager(caching=self._caching)
        if as_script:
            outfp = open(self._outfile, 'w') if self._outfile else sys.stdout
        else:
            from cStringIO import StringIO
            outfp = StringIO()
        try:
            if self._outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams, imagewriter=self._imagewriter)
            elif self._outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams, imagewriter=self._imagewriter)
            elif self._outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=self._codec, scale=self._scale, layoutmode=self._layoutmode, laparams=self._laparams, imagewriter=self._imagewriter)
            else:
                device = TagExtractor(rsrcmgr, outfp, codec=self._codec)
            try:
                for fname in self._args:
                    with open(fname, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, self._pagenos, maxpages=self._maxpages, password=self._password, caching=self._caching, check_extractable=True):
                            interpreter.process_page(page)
            finally:
                device.close()
            if not as_script:
                # Read the buffer only after device.close(), as before.
                return outfp.getvalue()
        finally:
            # Only close what this method opened; leave sys.stdout usable.
            if as_script and outfp is not sys.stdout:
                outfp.close()
예제 #4
0
def main(argv):
    """Command-line driver: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector with the program name first
            (``sys.argv`` style).

    Returns:
        100 when the usage message was printed (unknown option, no input
        file, or unknown ``-t`` value); otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is recorded as a flag and applied after the loop.  Setting
    # laparams to None immediately made a later '-A', '-V', '-M', ... fail
    # with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Close only a file opened here; sys.stdout stays usable.
        if outfp is not sys.stdout:
            outfp.close()
    return
예제 #5
0
def main(argv=None):
    """Parse command-line options and convert PDF input to text/HTML/XML/tags.

    Args:
        argv: Argument list without the program name; ``None`` lets
            argparse read ``sys.argv[1:]``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the ``-o`` file name; anything else falls back to plain
    text.  Each input stream is handed to ``process_pdf`` (defined
    elsewhere) and closed afterwards.  Returns None.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file',
                        nargs='*',
                        type=argparse.FileType('rb'),
                        # The default must be a list of streams: a bare
                        # sys.stdin made the loop below iterate over its
                        # *lines*.  Prefer the binary buffer when one exists.
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C',
                        '--nocache',
                        dest='cache',
                        action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l',
                        metavar='level',
                        default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p',
                        metavar='page',
                        nargs='+',
                        default=[],
                        type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m',
                        metavar='maxpages',
                        default=0,
                        type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P',
                        metavar='password',
                        default='',
                        help='pdf password')
    parser.add_argument('-o',
                        metavar='outfile',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O',
                        metavar='directory',
                        type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t',
                        metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c',
                        metavar='codec',
                        default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n',
                         action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A',
                         action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V',
                         action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M',
                         metavar='char_margin',
                         type=float,
                         help='custom character margin')
    lagroup.add_argument('-L',
                         metavar='line_margin',
                         type=float,
                         help='custom line margin')
    lagroup.add_argument('-W',
                         metavar='word_margin',
                         type=float,
                         help='custom word margin')
    lagroup.add_argument('-F',
                         metavar='boxes_flow',
                         type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y',
                         metavar='layout_mode',
                         default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s',
                         metavar='scale',
                         default=1,
                         type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = LAParams()
    if args.n:
        laparams = None
    else:
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None, not truthiness, so that an explicit 0
        # (e.g. ``-F 0``) is applied instead of silently ignored.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        out_name = args.o.name
        if out_name.endswith(('.htm', '.html')):
            outtype = 'html'
        elif out_name.endswith('.xml'):
            outtype = 'xml'
        elif out_name.endswith('.tag'):
            outtype = 'tag'
    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  args.o,
                                  codec=args.c,
                                  laparams=laparams,
                                  imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   args.o,
                                   codec=args.c,
                                   scale=args.s,
                                   layoutmode=args.Y,
                                   laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr,
                                   args.o,
                                   codec=args.c,
                                   laparams=laparams,
                                   imagewriter=args.O)
        try:
            # -p is 1-based on the command line; process_pdf gets 0-based.
            pagenos = [i - 1 for i in args.p]
            for fp in args.file:
                try:
                    process_pdf(rsrcmgr,
                                device,
                                fp, pagenos,
                                maxpages=args.m,
                                password=args.P,
                                caching=args.cache,
                                check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
예제 #6
0
def main(argv):
    """Convert PDFs via pdfminer, then load three CV fields into the database.

    Args:
        argv: Full argument vector with the program name first
            (``sys.argv`` style).

    Steps:
        1. Parse the options and render every input PDF to the chosen
           output (file given by ``-o`` or stdout).
        2. Copy ``cv.txt`` to ``cv_new.txt`` without the lines that contain
           any of the section-heading words.
        3. Take the text after the first ``': '`` of every line that has
           one and insert the first three such values into ``entries``
           through the module-level ``mycursor`` / ``mydb``.

    Returns:
        100 when the usage message was printed; otherwise None.

    Raises:
        IndexError: fewer than three ``key: value`` lines were found.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop: turning laparams into None right away
    # made any later layout option crash with AttributeError.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close sys.stdout: the final print() below still needs it.
        if outfp is not sys.stdout:
            outfp.close()

    bad_words = [
        'Personal', 'Information', 'Projects', 'Internship', 'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    with open("cv_new.txt", "r") as cv_file:
        lines = cv_file.read().split('\n')

    # Lines without ': ' (including "", " " and form feeds) contribute
    # nothing, so no separate blank-line filtering pass is needed.
    details = []
    for line in lines:
        fields = line.split(': ')
        if len(fields) > 1:
            details.append(fields[1])

    if len(details) < 3:
        raise IndexError(
            "expected at least 3 'key: value' lines in cv_new.txt, found %d"
            % len(details))

    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
예제 #7
0
def main():
    """Convert one PDF to the hard-coded output file using pdfminer.

    The input is read from the name ``path``, which is not defined in this
    function and must be provided by the enclosing module.  The output type
    is inferred from the output file's extension (``.xml`` for the path
    configured below).  pdfminer debug flags are switched on.  Returns None.
    """
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    debug = 1
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    outfile = r'/Users/chenjunbiao/project/graduation_project/data/xmlout.xml'
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Bail out on an unknown type before the output file is created.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Close only a file opened here; leave sys.stdout alone.
        if outfp is not sys.stdout:
            outfp.close()
    return
예제 #8
0
파일: pdf2txt.py 프로젝트: white3/pdf2word
def extract_text_to_fp(inf, outfp,
                    _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                    output_type='text', codec='utf-8', laparams=None,
                    maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
                    layoutmode='normal', output_dir=None, strip_control=False,
                    debug=False, disable_caching=False, **other):
    """
    Parses text from inf-file and writes to outfp file-like object.
    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing None!

    Returns a list with one entry per processed page: after each page is
    rendered, everything written to outfp since the previous page is read
    back with seek()/read().  outfp must therefore support seek(), read()
    and tell() in addition to write().

    output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly.
    codec: Text decoding codec
    laparams: An LAParams object from pdfminer.layout.
        Default is None but may not layout correctly.
    maxpages: How many pages to stop parsing after
    page_numbers: zero-indexed page numbers to operate on.
    password: For encrypted PDFs, the password to decrypt.
    scale: Scale factor
    rotation: Rotation factor
    layoutmode: Default is 'normal', see pdfminer.converter.HTMLConverter
    output_dir: If given, creates an ImageWriter for extracted images.
    strip_control: Does what it says on the tin
    debug: Output more logging data
    disable_caching: Does what it says on the tin

    Raises ValueError if output_type is not one of the four supported values
    (previously this surfaced later as an unbound ``device`` variable).
    """
    if output_type not in ('text', 'xml', 'html', 'tag'):
        raise ValueError(
            "output_type must be one of 'text', 'xml', 'html', 'tag'; got %r"
            % (output_type,))

    if six.PY2 and sys.stdin.encoding:
        password = password.decode(sys.stdin.encoding)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    if output_type == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)

    # From here on stdout is replaced by its binary buffer on Python 3.  The
    # text converter above has already captured the original stream.
    if six.PY3 and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=strip_control)
    elif output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif output_type == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Collects the text extracted for each page.
    t_list = []
    seek_pos = 0  # offset in outfp where the not-yet-read output starts
    try:
        for page in PDFPage.get_pages(inf,
                                      page_numbers,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=not disable_caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
            # Jump back to the first position that has not been read yet.
            outfp.seek(seek_pos)
            # Store this page's output in the list.
            t_list.append(outfp.read())
            # Remember where read() stopped so the next page starts there.
            seek_pos = outfp.tell()
    finally:
        device.close()
    # The per-page list is returned in addition to the output in outfp.
    return t_list
예제 #9
0
def getPDFReferencesTitle():
    """Extract reference titles from a hard-coded PDF into ``title.txt``.

    Steps:
        1. Render ``srcFile`` with pdfminer into ``outfile`` (type inferred
           from the extension, i.e. plain text for the ``.txt`` path below).
        2. Read the text back, drop a leading UTF-8 BOM, and keep what
           follows the position returned by ``find_last(data, "REFERENCES")``
           (``find_last`` is defined elsewhere; presumably the last
           occurrence -- confirm, including its not-found result).
        3. Split that tail into entries at "newline, digits, one more
           character" and, for every entry containing ``(digits)``, write
           the text between that marker and the next ``.`` to ``title.txt``,
           one title per line.

    This handles the English-style layout ("REFERENCES" heading) only.
    """
    import sys
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter
    import re
    import codecs

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    srcFile = "E:\\论文\\SNP\\学位论文\\基于Relief和SVM-RFE的组合式SNP特征选择.pdf".decode(
        "utf8").encode("gbk")

    # output option
    outfile = "E:\\论文\\SNP\\学位论文\\基于Relief和SVM-RFE的组合式SNP特征选择.txt".decode(
        "utf8").encode("gbk")
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        try:
            with open(srcFile, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        outfp.close()

    with open(outfile, "r") as fp:
        data = fp.read()

    if data[:3] == codecs.BOM_UTF8:
        data = data[3:]

    # Everything after the "REFERENCES" heading is the reference list.
    marker = "REFERENCES"
    position = find_last(data, marker)
    refstr = data[position + len(marker):]

    # One entry per "newline, number, one more character" boundary.
    references = re.split(r"\n\d+.", refstr)

    # The title is taken to start right after a "(digits)" marker.
    marker_re = re.compile(r"\(\d+\)")
    with open("title.txt", "w") as fp:
        for ref in references:
            # Local name no longer shadows the builtin ``str``.
            pieces = marker_re.split(ref)
            if len(pieces) > 1:
                title = pieces[1].split(".")[0].replace("\n", "")
                fp.write(title.strip())
                fp.write("\n")
예제 #10
0
def ConvertPdf(pdfpath, outfp, opts=None):
    """Convert one PDF file and write the rendered output to ``outfp``.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Already-open, writable file-like object that receives the
            output. It is not closed here; the caller owns it.
        opts: Optional pdf2txt-style options, given either as an iterable
            of ``(flag, value)`` pairs (the shape ``getopt`` returns) or as
            a mapping of flag -> value. Recognised flags are
            ``-d -p -m -P -o -C -n -A -V -M -L -W -F -Y -O -R -t -c -s``.
            ``-o`` is accepted for command-line compatibility but ignored,
            because the destination is always ``outfp``.

    Returns:
        True once every selected page has been processed.

    Raises:
        ValueError: If the requested output type (``-t``) is not one of
            ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    # Normalise the options: iterating a dict directly would yield its keys
    # and unpack each two-character flag into ('-', 'x'), silently dropping
    # every option.
    if opts is None:
        opts = ()
    elif hasattr(opts, 'items'):
        opts = list(opts.items())

    # Flags that tweak layout analysis: attribute name + value converter.
    layout_flags = {
        '-A': ('all_texts', lambda _value: True),
        '-V': ('detect_vertical', lambda _value: True),
        '-M': ('char_margin', float),
        '-L': ('line_margin', float),
        '-W': ('word_margin', float),
        '-F': ('boxes_flow', float),
    }

    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for flag, value in opts:
        if flag == '-d':
            debug += 1
        elif flag == '-p':
            # Page numbers are 1-based on the command line, 0-based inside.
            pagenos.update(int(x) - 1 for x in value.split(','))
        elif flag == '-m':
            maxpages = int(value)
        elif flag == '-P':
            password = value
        elif flag == '-C':
            caching = False
        elif flag == '-n':
            laparams = None
        elif flag in layout_flags:
            # Layout tweaks are meaningless once -n disabled layout
            # analysis; skip them instead of crashing on None.
            if laparams is not None:
                attr, convert = layout_flags[flag]
                setattr(laparams, attr, convert(value))
        elif flag == '-Y':
            layoutmode = value
        elif flag == '-O':
            imagewriter = ImageWriter(value)
        elif flag == '-R':
            rotation = int(value)
        elif flag == '-t':
            outtype = value
        elif flag == '-c':
            codec = value
        elif flag == '-s':
            scale = float(value)

    if not outtype:
        outtype = 'txt'
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    # Honour -C: the resource manager must be told about the caching choice.
    rsrcmgr = PDFResourceManager(caching=caching)
    if outtype == 'txt':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    else:
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    try:
        # open() works on Python 2 and 3; the file() builtin is Python 2 only.
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        # Always close the device, even when a page fails to parse.
        device.close()

    return True
예제 #11
0
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name, the
            rest are getopt-style options followed by one or more input
            PDF paths.

    Returns:
        100 (after printing the usage text) when the arguments are invalid
        or the output type is unknown; None after a successful run.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
               ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
               ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
               ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
               ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # Flags that tweak layout analysis: attribute name + value converter.
    layout_flags = {
        '-A': ('all_texts', lambda _value: True),
        '-V': ('detect_vertical', lambda _value: True),
        '-M': ('char_margin', float),
        '-W': ('word_margin', float),
        '-L': ('line_margin', float),
        '-F': ('boxes_flow', float),
    }

    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    for (flag, value) in opts:
        if flag == '-d':
            debug += 1
        elif flag == '-P':
            password = value.encode('ascii')
        elif flag == '-o':
            outfile = value
        elif flag == '-t':
            outtype = value
        elif flag == '-O':
            imagewriter = ImageWriter(value)
        elif flag == '-c':
            encoding = value
        elif flag == '-s':
            scale = float(value)
        elif flag == '-R':
            rotation = int(value)
        elif flag == '-Y':
            layoutmode = value
        elif flag == '-p':
            # Page numbers are 1-based on the command line, 0-based inside.
            pagenos.update(int(x) - 1 for x in value.split(','))
        elif flag == '-m':
            maxpages = int(value)
        elif flag == '-S':
            stripcontrol = True
        elif flag == '-C':
            caching = False
        elif flag == '-n':
            laparams = None
        elif flag in layout_flags:
            # Layout tweaks are meaningless once -n disabled layout
            # analysis; skip them instead of crashing on None.
            if laparams is not None:
                attr, convert = layout_flags[flag]
                setattr(laparams, attr, convert(value))

    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if outfile:
        if sys.platform == 'win32':
            # NOTE(review): binary mode on Windows is kept from the original
            # code; confirm the converters really emit bytes there.
            outfp = open(outfile, 'wb')
        else:
            # Every other platform (not only 'linux') gets a text-mode file;
            # previously macOS etc. left outfp unbound.
            outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; leave sys.stdout usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
예제 #12
0
def convertPDF(outfile, pdfFile):
    """Convert ``pdfFile`` and write the result to ``outfile``.

    The output format is chosen from the extension of ``outfile``:
    ``.htm``/``.html`` -> HTML, ``.xml`` -> XML, ``.tag`` -> tag dump,
    anything else -> plain text.

    Args:
        outfile: Path of the output file. A falsy value sends the plain-text
            output to ``sys.stdout`` instead (which is left open).
        pdfFile: Path of the PDF file to read.

    Returns:
        None.
    """
    # NOTE(review): process_pdf and the ``outdir`` converter keyword are
    # presumably the older pdfminer API imported elsewhere in this file --
    # confirm against the installed pdfminer version.
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Infer the output type from the output file extension.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    # open() works on Python 2 and 3; the file() builtin is Python 2 only.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(pdfFile, 'rb') as fp:
                process_pdf(rsrcmgr,
                            device,
                            fp,
                            pagenos,
                            maxpages=maxpages,
                            password=password,
                            caching=caching,
                            check_extractable=True)
        finally:
            # Always close the device, even when the PDF fails to parse.
            device.close()
    finally:
        # Only close what this function opened; leave sys.stdout usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
예제 #13
0
def parsepdf_pdfminer_formal(path, outtype='txt', outfile=None):
    """Convert the PDF at ``path`` with pdfminer and write it to a file.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled.

    Args:
        path: Path of the PDF file to read.
        outtype: Output format: ``'txt'`` (``'text'`` and falsy values are
            treated as ``'txt'``), ``'xml'``, ``'html'`` or ``'tag'``.
        outfile: Path of the output file. Defaults to
            ``C:\\Users\\Administrator\\Desktop\\parseRes_demo.<outtype>``,
            the location that used to be hard-coded.

    Returns:
        None.

    Raises:
        ValueError: If ``outtype`` is not a supported format. This is
            checked before any file is created.
    """
    # Validate first so a bad format never creates/truncates the output
    # file. 'text' used to be the fallback name but the dispatch below only
    # knew 'txt', which left the device unbound.
    if not outtype or outtype == 'text':
        outtype = 'txt'
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError(f'unsupported outtype: {outtype!r}')
    if outfile is None:
        outfile = r'C:\Users\Administrator\Desktop\parseRes_demo.' + outtype

    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    with open(outfile, 'w', encoding=encoding) as outfp:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device before the output file it writes into.
            device.close()
    return
예제 #14
0
def _build_arg_parser():
    """Build the argparse parser describing every pdf2txt command-line option."""
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    add = parser.add_argument

    # --- generic input/output options ---
    add('-d', dest='debuglevel', action='count', default=0,
        help='Debug (repeat for more verbose debugging)')
    add('-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    add('-c', '--codec', dest='codec', action='store', type=str,
        default='utf-8',
        help='Specifies the output codec.')
    add('-t', '--type', dest='outtype', action='store', type=str,
        default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    add('-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    add('-P', '--password', dest='password', action='store', type=str,
        default='',
        help='Provides the user password to access PDF contents.')
    add('-o', '--output', dest='outfile', action='store', type=str,
        default=None,
        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    add('-C', '--no-caching', dest='caching', action='store_false',
        default=True,
        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')

    # --- layout analysis options ---
    add('-n', '--no-layout', dest='layout', action='store_false',
        default=True,
        help='Suppress layout analysis.')
    add('--show-pageno', dest='show_pageno', action='store_true',
        default=False,
        help='Show page numbers.')
    add('-A', '--analyze-all', dest='all_texts', action='store_true',
        default=False,
        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    add('-V', '--detect-vertical', dest='detect_vertical',
        action='store_true', default=False,
        help='Allows vertical writing detection.')
    add('-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    add('-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    add('-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    add('-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    add('-Y', '--layout-mode', dest='layoutmode', action='store', type=str,
        default='normal',
        choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')

    # --- converter options ---
    add('-O', '--image-writer', dest='imagewriter', action='store', type=str,
        default=None,
        help='imagewriter')
    add('-R', '--rotation', dest='rotation', action='store', type=int,
        default=0,
        help='rotation')
    add('-S', '--strip-control', dest='stripcontrol', action='store_true',
        default=False,
        help='stripcontrol')
    add('-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')

    # --- options for the `shape' output ---
    add('--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    add('--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    add('--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    add('--shear-limit', dest='shear_limit', action='store', default=0.1,
        type=float,
        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    add('--rotation-limit', dest='rotation_limit', action='store', default=2,
        type=float,
        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    add('--line-height-diff', dest='line_height_diff', action='store',
        type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    add('--heading-before', dest='heading_before', action='store', type=str,
        default='',
        help='String to put before each heading, e.g. <h1>')
    add('--heading-after', dest='heading_after', action='store', type=str,
        default='',
        help='String to put after each heading, e.g. </h1>')
    add('--box-separator', dest='box_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--block-separator', dest='block_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-separator', dest='indent_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-string', dest='indent_string', action='store', type=str,
        default=r'\t',
        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--indent-limit', dest='indent_limit', action='store', type=float,
        default=3,
        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    add('--page-separator', dest='page_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    add('--norm-whitespace', dest='norm_whitespace', action='store_true',
        default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    add('--print-stats', dest='print_stats', action='store_true',
        default=False,
        help='Instead of the text, output some simple statistics about the file.')
    add('--max-blocks', dest='max_blocks', action='store', default=0,
        type=int,
        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    add('--max-textlines', dest='max_textlines', action='store', default=0,
        type=int,
        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    add('--line-height-method', dest='line_height_method', action='store',
        type=str, default='bbox',
        choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')

    # --- positional: the input files ---
    add(dest='pdffile',
        help='List of PDF files to go through',
        default=None,
        nargs='+')
    return parser


def main(argv):
    """Command-line entry point: convert PDF files using argparse options.

    Side effects: stores the debug level in the module global ``debuglevel``
    and the parsed argparse namespace in the module global ``options``.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            remaining items are parsed as options and input PDF paths.

    Returns:
        None. Invalid arguments make argparse exit via ``SystemExit``.
    """
    global debuglevel, options

    # Retained only so the debug trace below still reports it; the parser
    # is always argparse-based.
    using_optparse = False

    parser = _build_arg_parser()
    # Parse the vector we were given rather than implicitly reading sys.argv.
    args, rest = parser.parse_known_args(argv[1:])

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    pagenos = set()
    if args.pagenos:
        # Page numbers are 1-based on the command line, 0-based inside.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    # -n/--no-layout disables layout analysis altogether.
    laparams = LAParams() if args.layout else None
    if laparams is not None:
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode

    imagewriter = ImageWriter(args.imagewriter) if args.imagewriter else None

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive with literal backslash escapes; decode them once.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        # open() works on Python 2 and 3; the file() builtin is Python 2 only.
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr,
                                        outfp,
                                        codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # Unreachable while `choices` guards -t; report through argparse
            # instead of calling a usage() helper this function never defined.
            parser.error('unsupported output type: %s' % outtype)
        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            # Always close the device, even when a PDF fails to parse.
            device.close()
    finally:
        # Only close what this function opened; leave sys.stdout usable.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    DEBUG(2, 'finished.')

    return
예제 #15
0
def main(argv):
    """Batch-convert PDFs named by file name or wildcard on the command line.

    Every positional argument is expanded with ``glob``. Each matching file is
    converted and written next to its source, with the extension replaced
    according to the output type (html -> .htm, tag -> .tag, text -> .txt,
    xml -> .xml).

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            rest are options followed by file names / wildcard specs.

    Returns:
        100 when the usage text is printed (unparseable options, no input
        argument, or an unknown ``-t`` value); otherwise None.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # Output type -> extension of the per-PDF output file.
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # NOTE: options are applied in command-line order. After -n, laparams is
    # None, so a later -A/-V/-M/-L/-W/-F raises AttributeError.
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Output is written per input file, so -o is never opened; it is only
    # consulted to infer the output type when -t was given as an empty string.
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types up front, before any output file is created.
    if outtype not in extensions:
        return usage()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    def make_device(outfp):
        # Build the converter for the (already validated) output type.
        if outtype == 'text':
            return TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                 imagewriter=imagewriter)
        if outtype == 'xml':
            return XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                imagewriter=imagewriter)
        if outtype == 'html':
            return HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                 layoutmode=layoutmode, laparams=laparams,
                                 imagewriter=imagewriter)
        return TagExtractor(rsrcmgr, outfp, codec=codec)

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not exactly 4 characters.
            out_path = os.path.splitext(pdf)[0] + '.' + extensions[outtype]
            print(out_path)
            with open(out_path, 'wb') as outfp:
                device = make_device(outfp)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages, password=password,
                                                      caching=caching, check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    # Close the device before outfp so anything it writes on
                    # close still reaches the file.
                    device.close()

        print('Done')
    return
예제 #16
0
def main(argv):
    """Convert the PDF files listed in ``args`` using fixed default settings.

    With the defaults below (``outfile`` and ``outtype`` both None) the output
    type resolves to 'text' and the result is written to ``sys.stdout``.

    NOTE(review): ``args`` and ``usage`` are not defined inside this function
    and ``argv`` is never read; both names must be supplied by the enclosing
    module, otherwise the calls below raise NameError -- confirm against the
    rest of the file.

    Args:
        argv: Argument vector (currently unused).

    Returns:
        None on success, or whatever ``usage()`` returns for an unsupported
        output type.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Default to text; otherwise infer the type from the output file name.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        # Never close the process's stdout: later writes to it would fail.
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            # Close the device even when a page fails to process.
            device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
예제 #17
0
def main(argv):
    """Command-line entry point: convert PDFs to text, HTML, XML or tags.

    All input files are rendered through a single device into one output
    stream: the ``-o`` file when given, otherwise ``sys.stdout``. When ``-t``
    is omitted the type is inferred from the ``-o`` file extension and
    defaults to 'text'.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 when the usage text is printed (unparseable options, no input
        file, or an unknown ``-t`` value); otherwise None.
    """
    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # NOTE: options are applied in command-line order. After -n, laparams is
    # None, so a later -A/-V/-M/-L/-W/-F raises AttributeError.
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output file so that a bad -t value does not
    # truncate an existing file or leak its handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        # stdout belongs to the process; it must not be closed here.
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   outdir=outdir,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr,
                                device,
                                fp,
                                pagenos,
                                maxpages=maxpages,
                                password=password,
                                caching=caching,
                                check_extractable=True)
        finally:
            # Close the device even when a file fails to process.
            device.close()
    finally:
        if close_outfp:
            outfp.close()
def get_paper_content(fname, pages=2, outdir="data"):
    """Render the leading pages of a PDF into a file under ``outdir``.

    The output path is ``outdir`` joined with the input's base name, with
    every ".pdf" occurrence removed and ".html" appended. Because that path
    always ends in ".html", the inferred output type is 'html'.

    Args:
        fname: Path of the PDF file to read.
        pages: Highest zero-based page index to process; pages with index
            0..``pages`` are rendered, i.e. at most ``pages + 1`` pages.
            NOTE(review): this may be an off-by-one relative to the parameter
            name -- behaviour is kept as-is; confirm the intended limit.
        outdir: Directory the output file is written to.

    Returns:
        None. If processing a page raises, ``fname`` is printed and the
        function returns normally; the output written so far is kept.
    """
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    basename = os.path.basename(fname)
    basename = basename.replace(".pdf", "")
    outfile = os.path.join(outdir, basename + ".html")
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile.endswith('.htm') or outfile.endswith('.html'):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'
    # Bail out before creating the output file for an unsupported type.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return

    with open(outfile, 'w') as outfp:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        try:
            # A missing/unreadable input still raises to the caller, as before.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.debug = True
                try:
                    for index, page in enumerate(
                            PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True)):
                        if index > pages:
                            break
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
                except Exception:
                    # Best effort: report the offending file and carry on.
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    print(fname)
        finally:
            # Always close the device (before outfp) so no handle is leaked
            # on the failure path.
            device.close()
    return