Example #1
    def __parse_word(self, pdf_fullpath):
        """
        单个解析PDF转成文字
        不支持文字则弹窗提示
        """

        fp = open(pdf_fullpath, 'rb')  # open in binary read mode
        # Create a pdf parser from the file object
        parser = PDFParser(fp)
        # Create a PDF document (an initial password could be supplied here)
        doc = PDFDocument(parser)
        # Connect the parser and the document object
        parser.set_document(doc)

        # Check whether the document allows text extraction; report an error if not
        if not doc.is_extractable:
            return {
                'errCode': 2001,
                'result': {
                    'message': 'PDF file is not text-extractable',
                    'pdf_fullpath': pdf_fullpath
                }
            }
        else:
            # Create a PDF resource manager to hold shared resources
            rsrcmgr = PDFResourceManager()
            # Create a PDF device (page aggregator) object
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Counters for pages, images, curves, figures and horizontal text boxes
            (num_page, num_image, num_curve, num_figure,
             num_TextBoxHorizontal) = (0, 0, 0, 0, 0)

            # Iterate over the page list, processing one page at a time
            for page in PDFPage.create_pages(doc):  # yields the pages
                num_page += 1  # one more page
                interpreter.process_page(page)
                # Receive the LTPage object for this page
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTImage):  # image object
                        num_image += 1
                    if isinstance(x, LTCurve):  # curve object
                        num_curve += 1
                    if isinstance(x, LTFigure):  # figure object
                        num_figure += 1
                    if isinstance(x, LTTextBoxHorizontal):  # text content
                        num_TextBoxHorizontal += 1  # one more horizontal text box
                        # Save the text content
                        file_without_suffix = os.path.basename(
                            pdf_fullpath[:str(pdf_fullpath).rfind('.')])
                        with open(self.filedir +
                                  '/%s.txt' % file_without_suffix,
                                  'a',
                                  encoding='utf-8') as f:  # path of the generated txt file
                            results = x.get_text()
                            f.write(results)
                            f.write('\n')
            if num_TextBoxHorizontal <= 0:
                return {
                    'errCode': 2001,
                    'result': {
                        'message': 'PDF file is not text-extractable',
                        'pdf_fullpath': pdf_fullpath
                    }
                }
            else:
                return {
                    'errCode': 0,
                    'result': {
                        'figures': num_figure,
                        'pages': num_page,
                        'images': num_image,
                        'curves': num_curve,
                        'horizontal_text_boxes': num_TextBoxHorizontal
                    }
                }
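
For comparison, pdfminer.six also ships a high-level helper that produces the same kind of plain-text dump without wiring up a parser, aggregator and interpreter by hand. A minimal sketch (assumes the pdfminer.six fork is installed; 'sample.pdf' is a placeholder path):

from pdfminer.high_level import extract_text

text = extract_text('sample.pdf')  # returns the whole document as one string
print(text)
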
Example #2
def parse():
    # Open the local pdf file in binary read ('rb') mode
    fn = open('半监督模糊聚类及其应用_杨昔阳.pdf', 'rb')

    # Create a pdf parser (legacy pdfminer API)
    parser = PDFParser(fn)

    # Create a PDF document
    doc = PDFDocument()

    # Connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)

    # Supply the password, e.g. doc.initialize("lianxipython");
    # if there is no password, pass an empty string
    doc.initialize("")

    # Check whether the document allows txt conversion; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager
    resource = PDFResourceManager()

    # Create a PDF layout-parameter object
    laparams = LAParams()

    # Create an aggregator, used to read the objects of the document
    device = PDFPageAggregator(resource, laparams=laparams)

    # Create an interpreter that decodes the document into a format Python can work with
    interpreter = PDFPageInterpreter(resource, device)

    # Iterate over the list of pages, one page at a time
    # (doc.get_pages() yields the pages)
    for page in doc.get_pages():
        # Parse a single page with the interpreter's process_page() method
        interpreter.process_page(page)
        # Fetch that page's content with the aggregator's get_result() method;
        # layout is an LTPage object holding the objects parsed from the page
        # (one get_result() call per processed page)
        layout = device.get_result()
        for out in layout:
            # Only objects that have a get_text() method carry the text we want
            if hasattr(out, "get_text"):
                print(out.get_text())
                with open('test.txt', 'a') as f:
                    f.write(out.get_text() + '\n')


if __name__ == '__main__':
    parse()
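
The per-object loop above can also be written with pdfminer.six's extract_pages helper, which hides the parser/aggregator/interpreter wiring. A minimal sketch (assumes pdfminer.six; the file name is reused from the example above):

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

for page_layout in extract_pages('半监督模糊聚类及其应用_杨昔阳.pdf'):
    for element in page_layout:
        if isinstance(element, LTTextContainer):  # any layout object that carries text
            print(element.get_text())
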
Example #3
lines=[line.rstrip('\n') for line in map_file]

#for each article

for line in lines:
	# copy filename from pdf document
	filename=line.replace('.pdf','.xml')	
	# open new xml file
	xfp = open('xml_data/' + filename, 'w')
	
	# open pdf file
	fp = open('fichedir/'+line, 'rb')
	
	# set pdfminer resources
	rsrcmgr = PDFResourceManager()
	laparams = LAParams()
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	pages = PDFPage.get_pages(fp)
	
	# counter
	current_page = 1
	xfp.write('<pages>')
	for page in pages:
		# output monitor
		if(current_page==1):
			print(current_row, ' ', filename)
		
		# markup page-level data
		xfp.write("<page>\n")
Example #4
def main(argv):
	for arg in argv[1:]:
		fd = open(arg, 'rb')
		parser = PDFParser(fd)
		document = PDFDocument(parser)
		if not document.is_extractable:
			print("Document not extractable.")
			return 1
	
		params = LAParams(char_margin=1)
		resMan = PDFResourceManager(caching=True)
		device = PDFPageAggregator(resMan, laparams=params)
		interpreter = PDFPageInterpreter(resMan, device)
		parser = x86ManParser("html", params)
	
		i = 1
		for page in PDFPage.get_pages(fd, set(), caching=True, check_extractable=True):
			print("Processing page %i" % i)
			interpreter.process_page(page)
			page = device.get_result()
			parser.process_page(page)
			i += 1
		parser.flush()
		fd.close()

		# RG: We get the parse in just one file: html/AAA.html
		# Looks like the original repo does not create all the separate pages for all the
		# instructions nor the index.html.
		# So it is just one big parse in a single file, named after the first instruction.
		# Later the individual pages were extracted from it again by searching for <h3>
		# and adding the closing tags.
		# An index.html was built the same way, with a hand-written style.css added.
		# NOTE_: we are getting 3 sorts of Description: <p>, <table> and <svg>.
		# On his website it is only <p>, so he certainly did some post-processing,
		# also adding <pre> and <h2> etc.
		# So this is only a rough parse to get the text and the tables out.
		# Given the issues (e.g. problems with footnotes in MOV) it is probably better
		# to work with the html that pdf2txt produces. Not as clean, but it causes
		# fewer problems.
		# Looking at the results of all the pdf2html conversions, it appears that this is
		# not so easy to do programmatically after all. zneak/felix did his best and generates
		# clean html, but it still contains many errors (see issues). pdf2txt produces a neat
		# layout, but tables often go wrong and need a lot of post-processing. It is comparable
		# to pdf2music: sometimes it works, but usually it does not look good and you are better
		# off doing everything by hand. A lot of work, but it gives the best result.
		# pdf2txt uses spans for tables, by the way. Ugly.
		# DONE: checked out pdftohtml from Xpdf. This produces the best looking pages. But also no real
		# tables. It uses a png file as background for the tables and then lays everything out with
		# absolutely positioned divs. For exact positioning that seems the way to go. But also slight
		# mistakes in the table layout. Faster (C++, Qt) and better than pdfminer.six.
		# But no real tables is faking it...
		# NOTE_: at autoclose we get a mismatch (th strong) when parsing the full vol2a.pdf.
		# Something goes wrong.
		# Figures are extracted as svg but often look warped (e.g. Figure 3-18 and 3-19 at HADDPS).
		# PDF parsing is like unscrambling scrambled eggs...
		# DONE: checked out pdf2htmlEX. It creates perfectly looking html 5 pages. It can be done!
		# It is fast and puts everything in one html page.
		# TODO_: check out https://github.com/fmalina/unilex-transcript which promises to create
		# clean (semantic) html from pdf2htmlEX output.

		# NOTE_: Conversion result always reports 0/0 because we never reach the code where
		# success and fail are incremented. They are effectively dead variables.
		print("Conversion result: %i/%i" % (parser.success, parser.success + parser.fail))
Example #5
def arc():
    destino = str(formato.get())
    if destino == "Arquivo do Word": destino = "docx"
    if destino == "Arquivo do Power-Point": destino = "ppt"
    if destino == "Arquivo do Excel": destino = "xlsx"
    if destino == "Arquivo de Texto": destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)

    if destino == "docx":
        if file_extension in ArqDOCX or file_extension.lower(
        ) == ".pdf" or file_extension.lower() == ".txt":
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)  # 16 = wdFormatDocumentDefault (.docx)
            doc.Close()
            word.Quit()

    elif destino.lower() == "pdf":
        if file_extension.lower() in ArqPPT:
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)  # 32 = ppSaveAsPDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)  # 0 = xlTypePDF
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower(
        ) == ".txt":
            word = win32.DispatchEx('Word.Application')
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)  # 17 = wdFormatPDF
            doc.Close()
            word.Quit()

    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower(
        ) in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)

    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            with open(diretorio, 'rb') as in_file:
                parser = PDFParser(in_file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr,
                                       output_string,
                                       laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)

    messagebox.showinfo(
        "Format converted",
        "File format converted successfully.\n\n" +
        file_extension[1:].upper() + " to " + destino.upper() +
        "\n\nSaved to: " + out_file + "." + destino)
    root.destroy()
Example #6
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if (robots.allowed(page, '*')):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page

            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file

                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith(('pdf')):  # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if pdf is found
                            response = urlopen(link)

                    except urllib.error.HTTPError as e:
                        # if 404 error, put 404 as text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)

                    else:
                        # otherwise save the pdf locally so pdfminer can parse it
                        file = open(pdf, 'wb')
                        file.write(response.read())
                        file.close()
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        file = open(pdf, 'rb')
                        parser = PDFParser(file)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # The interpreter processes each page stored in the PDFDocument object
                            interpreter.process_page(p)
                            # The device renders the layout from interpreter
                            layout = device.get_result()
                            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):
                                    txt += lt_obj.get_text()

                        # close the pdf file
                        file.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)

            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }

            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)

            return output
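
A usage sketch for the crawler above (the URL is a placeholder; it assumes requests, bs4, reppy's Robots, the pdfminer imports, and the Crawling_L_REST module are available):

result = parse_webpages(['http://example.com/index.html'])
if result:
    print(result['url'], result['timestamp'], len(result['docs']))
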
Example #7
def get_text_box(pdf_path):
    """
    :return: trả về list các box theo từng page ở dạng như thế này
    với region là danh sách tọa độ củ các block text
    còn media box là tọa độ size của từng page điểm x0=0, y0=0
    [
      { #page 1
        "region": [
            {
                "cordinate": [x0, y0, x1, y1]
                "text": "day la text"
             },
             {
                "cordinate": [x0, y0, x1, y1]
                "text": "day la text"
             },
             {
                "cordinate": [x0, y0, x1, y1]
                "text": "day la text"
             }
          ],
        "media_box":[
            x1, y1
          ]
      },
      { #page 2
        "region": [
                    {
                        "cordinate": [x0, y0, x1, y1]
                        "text": "day la text"
                     },
                     {
                        "cordinate": [x0, y0, x1, y1]
                        "text": "day la text"
                     },
                     {
                        "cordinate": [x0, y0, x1, y1]
                        "text": "day la text"
                     }
                  ],
                "media_box":[
                    x1, y1
                  ]
      }
    ]
    """
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    list_all_box = []
    # loop over all pages in the document
    for page in PDFPage.create_pages(document):
        list_item = {}
        interpreter.process_page(page)
        layout = device.get_result()
        media_textbox = (int(page.mediabox[2]), int(page.mediabox[3]))
        MEDIA_Y1 = int(page.mediabox[3])
        sub_box = parse_obj(layout._objs, MEDIA_Y1)
        list_item['region'] = sub_box
        list_item['media_box'] = media_textbox
        list_all_box.append(list_item)

    return list_all_box
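
parse_obj is called above but not shown. A plausible sketch consistent with the documented return shape (the implementation below is an assumption, including the y-flip against MEDIA_Y1 so coordinates are measured from the top of the page):

from pdfminer.layout import LTTextBoxHorizontal, LTFigure

def parse_obj(lt_objs, media_y1):
    # Hypothetical helper: collect each horizontal text box as a
    # {"cordinate": [...], "text": ...} dict, flipping y because PDF
    # coordinates grow upward from the bottom-left corner.
    boxes = []
    for obj in lt_objs:
        if isinstance(obj, LTTextBoxHorizontal):
            x0, y0, x1, y1 = obj.bbox
            boxes.append({
                'cordinate': [int(x0), int(media_y1 - y1), int(x1), int(media_y1 - y0)],
                'text': obj.get_text(),
            })
        elif isinstance(obj, LTFigure):
            boxes.extend(parse_obj(obj._objs, media_y1))  # recurse into figures
    return boxes
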
Example #8
def main(argv):

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d',
                        dest='debuglevel',
                        action='count',
                        default=0,
                        help='Debug (repeat for more verbose debugging)')

    parser.add_argument(
        '-p',
        '--pages',
        dest='pagenos',
        action='store',
        type=str,
        default='',
        help=
        'Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.'
    )

    parser.add_argument('-c',
                        '--codec',
                        dest='codec',
                        action='store',
                        type=str,
                        default='utf-8',
                        help='Specifies the output codec.')

    parser.add_argument(
        '-t',
        '--type',
        dest='outtype',
        action='store',
        type=str,
        default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag'
    )

    parser.add_argument(
        '-m',
        dest='maxpages',
        action='store',
        type=int,
        default=0,
        help=
        'Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.'
    )

    parser.add_argument(
        '-P',
        '--password',
        dest='password',
        action='store',
        type=str,
        default='',
        help='Provides the user password to access PDF contents.')

    parser.add_argument(
        '-o',
        '--output',
        dest='outfile',
        action='store',
        type=str,
        default=None,
        help=
        'Specifies the output file name. By default, it prints the extracted contents to stdout in text format.'
    )

    parser.add_argument(
        '-C',
        '--no-caching',
        dest='caching',
        action='store_false',
        default=True,
        help=
        'Suppress object caching. This will reduce the memory consumption but also slows down the process.'
    )

    parser.add_argument('-n',
                        '--no-layout',
                        dest='layout',
                        action='store_false',
                        default=True,
                        help='Suppress layout analysis.')

    parser.add_argument('--show-pageno',
                        dest='show_pageno',
                        action='store_true',
                        default=False,
                        help='Show page numbers.')

    parser.add_argument(
        '-A',
        '--analyze-all',
        dest='all_texts',
        action='store_true',
        default=False,
        help=
        'Forces to perform layout analysis for all the text strings, including text contained in figures.'
    )

    parser.add_argument('-V',
                        '--detect-vertical',
                        dest='detect_vertical',
                        action='store_true',
                        default=False,
                        help='Allows vertical writing detection.')

    parser.add_argument(
        '-M',
        dest='char_margin',
        action='store',
        type=float,
        default=2.0,
        help=
        'Two text chunks whose distance is closer than char_margin (M) are considered continuous and are grouped into one.'
    )

    parser.add_argument(
        '-L',
        dest='line_margin',
        action='store',
        type=float,
        default=0.5,
        help=
        'Two lines whose distance is closer than line_margin (L) are grouped into a text box, which is a rectangular area that contains a "cluster" of text portions.'
    )

    parser.add_argument(
        '-W',
        dest='word_margin',
        action='store',
        type=float,
        default=0.1,
        help=
        'It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.'
    )

    parser.add_argument(
        '-F',
        dest='boxes_flow',
        action='store',
        type=float,
        default=0.5,
        help=
        'Specifies how much the horizontal and vertical position of a text matters when determining the text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).'
    )

    parser.add_argument(
        '-Y',
        '--layout-mode',
        dest='layoutmode',
        action='store',
        type=str,
        default='normal',
        choices=['exact', 'normal', 'loose'],
        help=
        'Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.'
    )

    parser.add_argument('-O',
                        '--image-writer',
                        dest='imagewriter',
                        action='store',
                        type=str,
                        default=None,
                        help='imagewriter')

    parser.add_argument('-R',
                        '--rotation',
                        dest='rotation',
                        action='store',
                        type=int,
                        default=0,
                        help='rotation')

    parser.add_argument('-S',
                        '--strip-control',
                        dest='stripcontrol',
                        action='store_true',
                        default=False,
                        help='stripcontrol')

    parser.add_argument(
        '-s',
        dest='scale',
        action='store',
        type=float,
        default=1,
        help='Specifies the output scale. Can be used in HTML format only.')

    parser.add_argument(
        '--draw-lines',
        dest='draw_lines',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-boxes',
        dest='draw_boxes',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-blocks',
        dest='draw_blocks',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--shear-limit',
        dest='shear_limit',
        action='store',
        default=0.1,
        type=float,
        help=
        "If the text is sheared above this limit, reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--rotation-limit',
        dest='rotation_limit',
        action='store',
        default=2,
        type=float,
        help=
        "If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--line-height-diff',
        dest='line_height_diff',
        action='store',
        type=float,
        default=0.1,
        help=
        'Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).'
    )

    parser.add_argument('--heading-before',
                        dest='heading_before',
                        action='store',
                        type=str,
                        default='',
                        help='String to put before each heading, e.g. <h1>')

    parser.add_argument('--heading-after',
                        dest='heading_after',
                        action='store',
                        type=str,
                        default='',
                        help='String to put after each heading, e.g. </h1>')

    parser.add_argument(
        '--box-separator',
        dest='box_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--block-separator',
        dest='block_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-separator',
        dest='indent_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-string',
        dest='indent_string',
        action='store',
        type=str,
        default=r'\t',
        help=
        r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-limit',
        dest='indent_limit',
        action='store',
        type=float,
        default=3,
        help=
        'If the line is indented by more than this (approximately in characters), it will be separated from the previous one by --indent-separator.'
    )

    parser.add_argument(
        '--page-separator',
        dest='page_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--norm-whitespace',
        dest='norm_whitespace',
        action='store_true',
        default=False,
        help=
        'Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).'
    )

    parser.add_argument(
        '--print-stats',
        dest='print_stats',
        action='store_true',
        default=False,
        help=
        'Instead of the text, output some simple statistics about the file.')

    parser.add_argument(
        '--max-blocks',
        dest='max_blocks',
        action='store',
        default=0,
        type=int,
        help=
        'If there are more than this many blocks on a page, do not return any text. Use this to screen out abnormal files (run --print-stats first to find out the number of blocks in a "normal" file). 0 means no limit. 50 is maybe a good value.'
    )

    parser.add_argument(
        '--max-textlines',
        dest='max_textlines',
        action='store',
        default=0,
        type=int,
        help=
        'If there are more than this many textlines in any one block, do not return any text. Use this to screen out abnormal files (run --print-stats first to find out the number of textlines per block in a "normal" page). 0 means no limit. 18 is maybe a good value.'
    )

    parser.add_argument(
        '--line-height-method',
        dest='line_height_method',
        action='store',
        type=str,
        default='bbox',
        choices=['bbox', 'mean', 'median'],
        help=
        'Method to calculate the height of a line (relevant if a line contains characters of uneven height). bbox takes the bounding box (the rectangle encompassing the line), mean takes the arithmetic mean of the heights of all the characters, and median takes the median of those heights. Use mean or median if there are outlier characters, e.g. one big character at the beginning of a line.'
    )

    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None,
                        nargs='+')

    args, rest = parser.parse_known_args()

    global debuglevel
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))

    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode

    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)

    args.page_separator = unescape_string(args.page_separator)

    global options
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    if outtype == 'shape':
        device = ShapeTextConverter(rsrcmgr,
                                    outfp,
                                    codec=codec,
                                    laparams=laparams,
                                    showpageno=showpageno,
                                    imagewriter=imagewriter)
    elif outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in options.pdffile:
        DEBUG(2, 'processing', fname)
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()

    outfp.close()
    DEBUG(2, 'finished.')

    return
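
The command-line entry point is not shown here; an invocation in the usual pdf2txt.py style would be (an assumption):

if __name__ == '__main__':
    sys.exit(main(sys.argv))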
Example #9
def get_text_from_pdf(path, page_nums=None):
    r = []

    fp = open(path, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp, pagenos=page_nums)

    def parse_obj(lt_objs):
        # https://stackoverflow.com/questions/31819862/python-pdf-mining-get-position-of-text-on-every-line
        # loop over the object list

        for obj in lt_objs:
            if isinstance(obj, LTTextLine):
                x1, y1, x2, y2 = obj.bbox
                assert x1 < x2
                assert y1 < y2

                y1 = 1400 - y1
                y2 = 1400 - y2
                y1, y2 = y2, y1

                text = obj.get_text()
                width = obj.width
                height = obj.height

                text = text.replace('東久留米武蔵村山', '東久留米 武蔵村山')  # HACK!

                for line_i, line in enumerate(
                        text.split('\n')):  # CHECK WHETHER THIS IS NEEDED!
                    for word_j, word in enumerate(line.split()):
                        each_height = height / text.count('\n')
                        i_y1 = y1 + each_height * line_i
                        i_y2 = y2 + each_height * (line_i + 1)

                        each_width = width / len(line.split())
                        i_x1 = x1 + each_width * word_j
                        i_x2 = x2 + each_width * (word_j + 1)

                        r.append(
                            TextItem(text=word,
                                     x1=i_x1,
                                     y1=i_y1,
                                     x2=i_x2,
                                     y2=i_y2,
                                     width=each_width,
                                     height=each_height))

            # if it's a textbox, also recurse
            if isinstance(obj, LTTextBoxHorizontal):
                parse_obj(obj._objs)

            # if it's a container, recurse
            elif isinstance(obj, LTFigure):
                parse_obj(obj._objs)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()

        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                parse_obj(lobj)

    for xx in range(5):
        dists = []

        for x in range(len(r)):
            for y in range(len(r)):
                text_item_1 = r[x]
                text_item_2 = r[y]

                dists.append((abs(text_item_1.y1 - text_item_2.y1), x, y))

        merged = set()
        for dist, x, y in sorted(dists):
            text_item_1 = r[x]
            text_item_2 = r[y]

            text_1_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_1.text.strip())
            text_2_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_2.text.strip())

            if not dist:
                continue
            elif text_1_num != text_2_num:
                continue
            elif y in merged:
                continue
            merged.add(y)

            if dist <= 18:  # NOTE ME: This threshold may need to be tuned!!! =====================================
                r[y] = TextItem(text=text_item_2.text,
                                x1=text_item_2.x1,
                                y1=text_item_1.y1,
                                x2=text_item_2.x2,
                                y2=text_item_1.y1 + text_item_2.height,
                                width=text_item_2.width,
                                height=text_item_2.height)

    r.sort(key=lambda x: (x.y1, x.x1, x.x2, x.y2))
    #for i in r:
    #    print(i)
    return r
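
TextItem is not defined in this snippet; a minimal compatible definition (an assumption based on the fields used above) would be:

from collections import namedtuple

# Hypothetical record matching the constructor calls in get_text_from_pdf.
TextItem = namedtuple('TextItem',
                      ['text', 'x1', 'y1', 'x2', 'y2', 'width', 'height'])
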
Example #10
from io import BytesIO
from urllib.request import urlopen

# Legacy (pre-pdfminer.six, e.g. pdfminer3k) API: PDFDocument lives in
# pdfminer.pdfparser there.
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

# Open a PDF file.
# fp = open('./sample/sampl1.pdf', 'rb')

# PDFParser needs a seekable file object, so buffer the download first.
fp = BytesIO(
    urlopen("https://www.tencent.com/zh-cn/articles/8003261479985013.pdf").read())

parser = PDFParser(fp)

doc = PDFDocument()

parser.set_document(doc)
doc.set_parser(parser)

doc.initialize("")

resource = PDFResourceManager()
laparams = LAParams()

device = PDFPageAggregator(resource, laparams=laparams)

interpreter = PDFPageInterpreter(resource, device)

for page in doc.get_pages():
    interpreter.process_page(page)
    layout = device.get_result()

    for out in layout:
        if hasattr(out, "get_text"):
            print(out.get_text())
Example #11
def get_pdf_file_content(path_to_pdf):
    '''
    path_to_pdf: the parameter that gives access to the PDF file
    whose content we want to extract.
    '''
    '''
    PDFResourceManager is used to store shared resources such as fonts or images that 
    we might encounter in the files. 
    '''

    resource_manager = PDFResourceManager(caching=True)
    '''
    Create a string buffer that will contain the final text representation of the pdf.
    '''
    out_text = StringIO()
    '''
    UTF-8 is one of the most commonly used encodings, and Python often defaults to it.
    In our case we specify it explicitly in order to avoid some encoding errors.
    '''
    codec = 'utf-8'
    """
    LAParams is the object containing the Layout parameters with a certain default value. 
    """
    laParams = LAParams()
    '''
    Create a TextConverter object, taking:
    - resource_manager,
    - out_text,
    - layout parameters.
    '''
    text_converter = TextConverter(resource_manager,
                                   out_text,
                                   laparams=laParams)
    fp = open(path_to_pdf, 'rb')
    '''
    Create a PDF interpreter object taking:
    - resource_manager
    - text_converter
    '''
    interpreter = PDFPageInterpreter(resource_manager, text_converter)
    '''
    Process the content of each page of the original PDF file.
    '''
    for page in PDFPage.get_pages(fp,
                                  pagenos=set(),
                                  maxpages=0,
                                  password="",
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    '''
    Retrieve the entire contents of the “file” at any time 
    before the StringIO object’s close() method is called.
    '''
    text = out_text.getvalue()
    '''
    Close all the resources we previously opened.
    '''
    fp.close()
    text_converter.close()
    out_text.close()
    '''
    Return the final variable containing all the text of the PDF
    '''
    return text
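
A usage sketch (the path is a placeholder):

if __name__ == '__main__':
    print(get_pdf_file_content('sample.pdf'))
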
Example #12
parser = PDFParser(pd_file)

# print(parser)
# PDF document object (legacy pdfminer API: created empty, then wired to the parser)
document = PDFDocument()
parser.set_document(document)
document.set_parser(parser)

# Initialize with the document password (none here)
document.initialize()
if document.is_extractable:
    print(True)
else:
    raise PDFTextExtractionNotAllowed
# Stores the document's shared resources
src = PDFResourceManager()

# Device object
device = PDFPageAggregator(src, laparams=LAParams())

# Interpreter object
inter = PDFPageInterpreter(src, device)

pages = document.get_pages()

for page in pages:
    # print(page.contents)
    inter.process_page(page)
    layout = device.get_result()
    for x in layout:
        if hasattr(x, 'get_text'):  # the original snippet breaks off here
            print(x.get_text())
Example #13
def pdf_translate(pdf_path, fgmt, make_marian_conf=None, logger=None):
    page_split_tag = '\n\n<<PAGE_SPLIT_TAG>>\n\n'
    output_string = StringIO()
    with open(pdf_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr,
                               output_string,
                               laparams=LAParams(boxes_flow=0.3,
                                                 line_margin=1.0))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for idx, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            output_string.write(page_split_tag)
    pdf_text = output_string.getvalue()
    pdf_pages = pdf_text.split(page_split_tag)
    marian_processes = []
    if make_marian_conf:
        marian_processes = make_marian_process(
            make_marian_conf["marian_command"],
            make_marian_conf["marian_args_pdf_translator"],
            make_marian_conf["pdf_ports"])
    ret = []
    for pdf_idx, pdf_page in enumerate(pdf_pages[:-1]):
        retry_max = 3
        translated = None
        for i in range(retry_max):
            if logger:
                logger.info("translate page={}".format(pdf_idx))
            to_translate = pre_proc_text(pdf_page)
            translated = fgmt.translate_text(to_translate)
            if not fgmt.detected_marian_err:
                ret.append(translated)
                break
            else:
                translated = None
                close_marian_process(marian_processes)
                marian_processes = make_marian_process(
                    make_marian_conf["marian_command"],
                    make_marian_conf["marian_args_pdf_translator"],
                    make_marian_conf["pdf_ports"])

                fgmt.detected_marian_err = False
                if logger:
                    logger.info(fgmt.get_and_clear_logs())
                    logger.warning("recovery marian processes {}/{}".format(
                        i, retry_max - 1))
        if translated is None:
            ret.append(get_err_translated())
        marian_processes = ckeck_restart_marian_process(
            marian_processes,
            make_marian_conf["max_marian_memory"],
            make_marian_conf["marian_command"],
            make_marian_conf["marian_args_pdf_translator"],
            make_marian_conf["pdf_ports"],
            logger=logger)
        if logger:
            logger.info(fgmt.get_and_clear_logs())

    if make_marian_conf:
        close_marian_process(marian_processes)

    return ret
Example #14
    def get_context_data(self, **kwargs):
        context = {}
        document_obj = Document.objects.filter().order_by('-uploaded_at')[0]
        context['document_name'] = document_obj.name
        path = document_obj.document.path
        context['path'] = path
        doc_detail = DocumentDetail(document=document_obj)

        if path.endswith('.pdf'):
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=laparams)
            fp = open(path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            fp.close()
            device.close()
            retstr.close()
            #
            # with open("testtttttttttt.txt", "w") as text_file:
            #     text_file.write(text)

            text = text.lower()
            name = re.findall(r"(?:patient's name:|patient:)(.*)", text.lower())
            if not name:
                name = re.findall(r"re:(.*)", text.lower())
            address = self.__remove_null_from_list(
                re.findall(r"address:(.*)", text.lower()))
            dob = self.__remove_null_from_list(
                re.findall(r"dob:(.*),", text.lower()))
            if not dob:
                dob = re.findall(r"date of birth:\s*(.*)", text.lower())
            sex = re.findall(r"sex:(.*)", text.lower())
            injury = re.findall(r"consultation:(.+?)\.", text, re.DOTALL)
            if not injury:
                injury = self.__remove_null_from_list(
                    re.findall(r"injury:(.*)", text.lower()))
            date_of_surgery = re.findall(r"date of surgery:(.*)", text.lower())
            claim_no = re.findall(r"(claim #|claim#:)(.*)", text.lower())
            allergies = self.__remove_null_from_list(
                re.findall(r"allergies:(.*)", text.lower()))
            social_history = re.findall(
                r"(social history|family history|social  history :)(.*?)(?:(?:\r*\n){2})",
                text.lower(), re.DOTALL)
            if 'no past medical history on file' in text.lower():
                medical_history = ['no past medical history on file']
            else:
                medical_history = re.findall(r"medical history:(.+?)\.",
                                             text.lower(), re.DOTALL)
            impression_list = re.findall(r"(impression.*?)(?:(?:\r*\n){2})",
                                         text.lower(), re.DOTALL)
            impression = [
                i.replace('impression: ', '') for i in impression_list
            ]
            impression = list(filter(None, impression))
            doctor = re.findall(r"(doctor.|physician:)(.*)", text.lower())
            medicines = re.findall(r'mar action  action date  dose(.+?)iven',
                                   text.lower(), re.DOTALL)
            if medicines:
                medicine_list = list(filter(None, medicines[0].split('\n')))
                medicines_list = ''.join(medicine_list[3:])
            if not medicines:
                medicines = re.findall(
                    r'current  medications :(.+?)groves, steven j',
                    text.lower(), re.DOTALL)
                medicines_list = list(filter(None, medicines[0].split('\n')))
                del medicines_list[5:11]
                medicines_list = ''.join(medicines_list[1:])

            general = re.findall(r'general:(.*)', text.lower())[1]
            vital_signs = re.findall(
                r'last filed vital signs(.+?)vital\s*s~ig', text, re.DOTALL)
            if not vital_signs:
                vital_dict = {}
                vital_sign_string = ''
                vital_signss = re.findall(r'vital signs:(.+?)bilateral',
                                          text.lower(), re.DOTALL)
                vital_signss = list(filter(None, vital_signss[0].split('\n')))
                vital_dict['blood_pressure'] = vital_signss[-8].strip()
                vital_dict['pulse'] = vital_signss[-7].strip()
                vital_dict['temperature'] = vital_signss[-6].strip()
                vital_dict['respiratory_rate'] = vital_signss[-5].strip()
                vital_dict['Body Mass Index'] = vital_signss[-1].strip()
                vital_dict['Sp02'] = vital_signss[-2].strip()
                vital_dict['Weight'] = vital_signss[-3].strip()
                vital_dict['Height'] = vital_signss[-4].strip()
                for key, value in vital_dict.items():
                    vital_sign_string += key
                    vital_sign_string += " : " + value + ', '

            if vital_signs:
                vital_sign_string = ''
                vital_dict = {}
                vital_signs = vital_signs[0].split('\n')
                vital_dict['blood_pressure'] = vital_signs[4].strip().strip(
                    '·').strip()
                vital_dict['pulse'] = vital_signs[5].strip()
                vital_dict['temperature'] = vital_signs[6].strip()
                vital_dict['respiratory_rate'] = vital_signs[7].strip()
                vital_dict['oxygen_sat'] = vital_signs[8].strip()
                for key, value in vital_dict.items():
                    vital_sign_string += key
                    vital_sign_string += " : " + value + ', '
            context['name'] = ''
            context['address'] = ''
            context['dob'] = ''
            context['sex'] = ''
            context['date_of_surgery'] = ''
            context['doctor'] = ''
            if name:
                context['name'] = name[-1].replace('(cid:9)', '').strip()
                doc_detail.patient_name = context['name']
            if address:
                context['address'] = address[-1].replace('(cid:9)', '').strip()
                doc_detail.address = context['address']
            if dob:
                context['dob'] = dob[-1].replace('(cid:9)', '').strip()
                doc_detail.dob = context['dob']
            if sex:
                context['sex'] = sex[-1].replace('(cid:9)', '').strip()
                doc_detail.sex = context['sex']
            if date_of_surgery:
                context['date_of_surgery'] = date_of_surgery[-1].replace(
                    '(cid:9)', '').strip()
                doc_detail.date_of_surgery = context['date_of_surgery']
            if injury:
                context['injury'] = injury[0]
                doc_detail.injury = context['injury']
            if claim_no:
                context['claim_no'] = claim_no[0][1]
                doc_detail.claim_no = context['claim_no']
            if allergies:
                context['allergies'] = allergies[0]
                doc_detail.allergies = context['allergies']
            if social_history:
                context['social_history'] = social_history[0][1]
                doc_detail.social_history = context['social_history']
            if medical_history:
                context['medical_history'] = medical_history[0]
                doc_detail.medical_history = context['medical_history']
            if impression:
                context['impression'] = impression[0]
                doc_detail.impression = context['impression']
            if doctor:
                context['doctor'] = doctor[0][1]
                doc_detail.doctor = context['doctor']
            if vital_sign_string:
                context['vital_signs'] = vital_sign_string.strip().strip(',')
                doc_detail.vital_signs = context['vital_signs']
            if medicines_list:
                context['medicines'] = medicines_list
                doc_detail.medicines = context['medicines']
            if general:
                context['general'] = general
                doc_detail.general = context['general']
            doc_detail.save()
        return context
Example #15
File: pdfm.py  Project: xuehaoca/LeetCode
# encoding=utf8
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTTextBoxHorizontal
import re
import sys

fp = open(u'./jlpea-04-00214.pdf', 'rb')

parser = PDFParser(fp)

document = PDFDocument(parser)

rsrcmgr = PDFResourceManager(caching=False)

laparams = LAParams()

device = PDFPageAggregator(rsrcmgr, laparams=laparams)

interpreter = PDFPageInterpreter(rsrcmgr, device)

replace = re.compile(r'\s+')  # collapse whitespace; the original r's+' is missing the backslash

for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    for x in layout:
        if (isinstance(x, LTTextBoxHorizontal)):
            text = re.sub(replace, '', x.get_text())
Example #16
def parse_loske_pdf(pdf, is_ipp=True):
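    # NB: this snippet is legacy Python 2 code (print >> statements, ur'' string
    # literals, cStringIO, "except KeyError, e") built on the old pdfminer API.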
    stripcid_re = re.compile(u"\(cid:.*?\)", re.UNICODE)
    newline_heuristic_re = re.compile(u"Montag, den |Dienstag, den |Mittwoch" \
                                      u", den |Donnerstag, den |Freitag, den ",
                                      re.IGNORECASE | re.UNICODE)
    bnw_endheuristic_re = re.compile(u"B\.n\.W\.=Beilage.*", re.UNICODE)
    dow_beginheuristic_re = re.compile(u".*?Montag, den ",
                                       re.IGNORECASE | re.UNICODE)
    meal_detect_re = re.compile(u"(\d\.)(.*?)(\d).(\d\d)", re.UNICODE)
    #meal_detect_re = re.compile(u"(\d\.)(\D)", re.UNICODE)
    date_re = re.compile(u"(\d{1,2})\.(\d{1,2})\.(\d{1,4})(.*)", re.UNICODE)
    meal_props = re.compile(ur'\b[VKRS](?:\+[VKRS])*\b\s*', re.UNICODE)
    meal_numbers = re.compile(ur'([^/]|^)\s*\b[1-6](?:,[1-6])*\b([^/]|$)',
                              re.UNICODE)

    rsrcmgr = PDFResourceManager()
    outtxt = cStringIO.StringIO()
    device = TextConverter(rsrcmgr, outtxt)

    pdfp = PDFParser(cStringIO.StringIO(pdf))
    doc = PDFDocument()
    pdfp.set_document(doc)
    doc.set_parser(pdfp)
    doc.initialize("")

    if not doc.is_extractable:
        print >> sys.stderr, u"PDF Document not extractable"
        sys.exit(1)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for (pageno, page) in enumerate(doc.get_pages()):
        #print pageno
        interpreter.process_page(page)

    device.close()

    fulltext = outtxt.getvalue().decode('utf-8', 'replace')
    fulltext = stripcid_re.sub(u'', fulltext)
    fulltext = dow_beginheuristic_re.sub(u'', fulltext)
    fulltext = bnw_endheuristic_re.sub(u'', fulltext)
    fulltext = newline_heuristic_re.sub(u'\n', fulltext)

    lines = fulltext.split(u'\n')

    now = datetime.date(1, 1, 1)

    for line in lines:
        ret = date_re.search(line)
        if ret:
            day, month, year, meals = ret.groups()
            try:
                now = datetime.date(int(year), int(month), int(day))
            except ValueError:
                # some weird date in pdf (like 29.02.2013), skipping these
                # entries is the easiest solution
                continue
            #meals = meal_detect_re.sub(ur'\n\2(\3.\4 €)', meals).strip()
            meals = meal_detect_re.finditer(meals)
            for meal_match in meals:
                m = meal_match.group(2)
                m = meal_props.sub(u'', m)
                m = meal_numbers.sub(lambda x: x.group(1) + x.group(2), m)
                m = m.replace(u'*', u'')
                m = m.split()
                m.append(u'({0}.{1} €)'.format(meal_match.group(3),
                                               meal_match.group(4)))
                m = u' '.join(m)
                meal_type = TYPE_IPP if is_ipp else TYPE_FMI
                config["meals"].setdefault(now, []).append((meal_type, m))
Example #17
0
def Converting_Function(Path_To_TXTs, new_file):
    """
    :param Path_To_TXTs: path to PDFs or/and XML files
    :param new_file: the path to save the TXT format
    """
    files_short = np.array([
        f for f in os.listdir(Path_To_TXTs)
        if os.path.isfile(os.path.join(Path_To_TXTs, f))
    ])
    files = np.array([Path_To_TXTs + '/' + f for f in files_short])
    for file in files:
        if file.endswith('.pdf'):
            Not_Good = False
            Prob = False
            try:
                fp = open(file, 'rb')
                parser_pdf = PDFParser(fp)
                doc = PDFDocument(parser_pdf)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    device.get_result()
                rows = device.rows
                lines = [item[5] for item in rows]
                if average_len(lines) >= 20:
                    try:
                        text_all = convert_pdf_to_txt(file, pages=[0])
                        rows_pages = [item for item in rows if item[0] != 0]
                        words = [item[1] for item in rows_pages]
                        words_1 = [item for item in words if item <= 200]
                        words_2 = [item for item in words if item > 200]
                        first = most_common(words_1)
                        second = most_common(words_2)
                        pages = [item[0] for item in rows_pages]
                        pages = list(set(pages))
                        pages.sort()
                        for page in pages:
                            page_lines = [
                                line for line in rows_pages if line[0] == page
                            ]
                            text1 = ''
                            text2 = ''
                            text_middle = ''
                            for item in page_lines:
                                if item[1] <= (first + 20) and not (
                                        item[5].isdigit()
                                        and not item[5].endswith('.')):
                                    text1 = text1 + '\n' + item[5]
                                elif item[1] >= (
                                        second -
                                        20) and item[1] <= 500 and not (
                                            item[5].isdigit()
                                            and not item[5].endswith('.')):
                                    text2 = text2 + '\n' + item[5]
                                else:
                                    if not (item[5].isdigit()
                                            and not item[5].endswith('.')):
                                        text_middle = text_middle + '\n' + item[
                                            5]
                            if len(text1 + text2) > len(text_middle):
                                text_all = text_all + text1 + text_middle + text2
                            else:
                                Not_Good = True
                        if len(text_all) >= 1500 and not Not_Good:
                            text_all = text_all.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            #print('Article ', name, ' is successfully converted')
                        elif len(text_all) >= 1500 and Not_Good:
                            rawText = parser.from_file(file)
                            text = rawText['content']
                            text = os.linesep.join(
                                [s for s in text.splitlines() if s])
                            text_all = text.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            text_all = " ".join(text_all.split())
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            #print('Article ', name, ' is successfully converted')
                        else:
                            raw = parser.from_file(file)
                            text_all = raw['content']
                            text_all = "\n".join([
                                ll.rstrip() for ll in text_all.splitlines()
                                if ll.strip()
                            ])
                            if len(text_all) >= 1500:
                                text_all = text_all.replace(
                                    ' ac.',
                                    '~').replace(' a.c.',
                                                 '~').replace(' a.c', '~')
                                name = file.split('/')[-1][:-4]
                                path = new_file + '/' + name + '.txt'
                                with open(path, 'w', encoding='utf8') as f:
                                    f.write(text_all)
                                #print('Article ', name, ' is successfully converted')
                            else:
                                pass
                                #print('The PDF "' + file + '" contain less than 1500 characters !!!')
                    except Exception:
                        Prob = True
                elif average_len(lines) < 20 or Prob:
                    raw = parser.from_file(file)
                    text_all = raw['content']
                    text_all = "\n".join([
                        ll.rstrip() for ll in text_all.splitlines()
                        if ll.strip()
                    ])
                    if len(text_all) >= 1500:
                        text_all = text_all.replace(' ac.', '~').replace(
                            ' a.c.', '~').replace(' a.c', '~')
                        name = file.split('/')[-1][:-4]
                        path = new_file + '/' + name + '.txt'
                        with open(path, 'w', encoding='utf8') as f:
                            f.write(text_all)
                        #print('Article ', name, ' is successfully converted')
                    else:
                        pass
                        #print('The PDF "' + file + '" contain less than 1500 characters !!!')
            except Exception:
                Prob = True
            if Prob:
                raw = parser.from_file(file)
                text_all = raw['content']
                text_all = "\n".join([
                    ll.rstrip() for ll in text_all.splitlines() if ll.strip()
                ])
                if len(text_all) >= 1500:
                    text_all = text_all.replace(' ac.', '~').replace(
                        ' a.c.', '~').replace(' a.c', '~')
                    name = file.split('/')[-1][:-4]
                    path = new_file + '/' + name + '.txt'
                    with open(path, 'w', encoding='utf8') as f:
                        f.write(text_all)
                    #print('Article ', name, ' is successfully converted')
                else:
                    pass
                    #print('The PDF "' + file + '" contain less than 1500 characters !!!')
        elif file.endswith('.xml'):
            text_all = get_text_from_XML_without_saving(file)
            text_all = text_all.split('competing financial interest')[0]
            text_all = text_all.replace(' ac.',
                                        '~').replace(' a.c.',
                                                     '~').replace(' a.c', '~')
            name = file.split('/')[-1][:-4]
            path = new_file + '/' + name + '.txt'
            with open(path, 'w', encoding='utf8') as f:
                f.write(text_all)
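A hypothetical invocation (the paths are illustrative; the snippet also assumes PDFPageDetailedAggregator, average_len, most_common, convert_pdf_to_txt, get_text_from_XML_without_saving and a tika-style parser module are available in scope):

Converting_Function('/data/papers', '/data/papers_txt')

Every PDF or XML that yields at least 1500 characters of text ends up as a .txt file in the output directory.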
Example #18
0
def get_text(url, parse=True, laparams=laparams):
    url += 'v1.full.pdf'
    max_attempts = 4
    attempts = 0
    print(url)
    while attempts < max_attempts:
        r = requests.get(url)
        if r.status_code != 429:
            break
        # If rate limited, wait and try again (in seconds)
        time.sleep((2**attempts) + random.random())
        attempts = attempts + 1
    data = r.content

    try:
        f = io.BytesIO(data)

        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        codec = 'utf-8'
        device = XMLConverter(rsrcmgr, retstr, codec=codec,
                              laparams=laparams)  # , rect_colors=rect_colors)

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0  # is for all
        caching = True
        pagenos = set()

        for page in PDFPage.get_pages(f,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        device.close()
        pdf_data = retstr.getvalue()
        retstr.close()
    except Exception:
        return ('.raw.txt', data)
    try:
        if not parse:
            return ('.xml', pdf_data)
        else:

            # xmltest = convert_pdf_to_xml(pdf_data)
            root = ET.fromstring(pdf_data)

            temp = root.find('.//text')
            curr_font = temp.get('font')
            curr_size = float(temp.get('size'))
            text = ''

            rmargin = 70

            i = 0
            newline_pos = []
            for l in root.iterfind('.//textline'):
                for t in l.findall('./text'):
                    if (t.get('font') or t.get('size')) is None:
                        if t.text[0] == ' ':
                            text += ' '
                        else:
                            text += '<<NEWLINE>>'
                            newline_pos.append([])

                    else:
                        x0, y0, x1, y1 = [
                            float(z) for z in t.get('bbox').split(',')
                        ]
                        char_size = float(t.get('size', 0))
                        char_font = t.get('font', '')
                        if y0 > 750 or y0 < 75:
                            continue
                        if x0 < rmargin:
                            if re.search('[A-Za-z]+', t.text) is not None:
                                print('changing rmargin to ', str(x0 - 1))
                                rmargin = x0 - 1
                                text += t.text
                            continue

                        else:
                            if (char_size != curr_size) or (char_font !=
                                                            curr_font):
                                if char_size <= 8.0:
                                    continue
                                text += '<<NEWFONT>>' + t.text
                                curr_font = t.get('font')
                                curr_size = float(t.get('size'))
                            else:
                                text += t.text
            lines = text.split('<<NEWLINE>>')
            for l in lines[:5]:
                print(l)

            doc = lines[0]
            open_parens = False
            parens = []

            if len(re.findall(r'\(', doc)) > len(re.findall(r'\)', doc)):
                parens.append(True)
            else:
                parens.append(False)
            for i, t in enumerate(lines):

                if i == 0:
                    if re.search(r'^\s*[a-z(]', lines[1]) is None:
                        doc += '\n'
                    continue
                if len(t) < 1:
                    if not open_parens:
                        doc += '\n'
                    continue
                else:
                    o = len(re.findall(r'\(', t))
                    if open_parens:
                        o += 1
                    c = len(re.findall(r'\)', t))
                    open_parens = o > c

                    if not open_parens:
                        if t.startswith(' '):
                            t = re.sub(r'^ +', '<<PARAGRAPH>>', t)
                        if t.lstrip(' ').startswith('<<NEWFONT>>') and lines[
                                i - 1].rstrip(' ').endswith('.'):
                            t = re.sub(r'^<<NEWFONT>>', '<<PARAGRAPH>>',
                                       t.lstrip(' '))
                        if t.rstrip(' ').endswith('.'):
                            t += '<<PARAGRAPH>>'
                        if re.match(r'^\d{1,3}\.<<NEWFONT>>', t):
                            t = '<<PARAGRAPH>>' + t
                    doc += t
                    parens.append(open_parens)
            doc = re.sub(r'(?<=[^.])\n+', '', doc)
            doc = re.sub(r' {3,}', '<<PARAGRAPH>>', doc)
            print(doc[:50])

            parsed = []
            for _text in doc.split(r'<<PARAGRAPH>>'):
                _text = re.sub('(<<NEWLINE>>)+', '\n', _text)
                _text = re.sub(r'  ', r'\n', _text)
                _text = re.sub(
                    r'<<NEWFONT>>(?P<url>http[a-zA-Z0-9./+?_=:-]+)( <<NEWFONT>>)?',
                    r'\g<url>', _text)
                _text = re.sub(r'<<NEWFONT>> <<NEWFONT>>', r' ', _text)
                _text = re.sub(r'\(<<NEWFONT>>(.+)<<NEWFONT>>\)', r'(\g<1>)',
                               _text, flags=re.M)

                pattern = re.compile(
                    r'<<NEWFONT>>(((\W|\d)+)|([A-Za-z_-]{1,2}\n?))<<NEWFONT>>')

                _text = pattern.sub(r'\g<1>', _text)

                pat2 = re.compile(
                    r'<<NEWFONT>>([A-Za-z- :]+)<<NEWFONT>>([.:]?)')
                _text = pat2.sub(r'\g<1>\g<2>\n', _text)
                pat3 = re.compile(
                    r'<<NEWFONT>>([A-Za-z_-]{1,3} *\n?)<<NEWFONT>>')

                _text = pat3.sub(r'\g<1>', _text)

                _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>([a-z]+)',
                               r'\g<1> \g<2>', _text)
                _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>(\W*)\.?',
                               r'\g<1> \g<2>', _text)
                _text = re.sub(r'-\n', r'-', _text)
                _text = re.sub(r'\((.+)(?:\n)(.+)\)', r'(\g<1>\g<2>)', _text)
                _text = re.sub(r'\((.+)<<NEWFONT>>(.+)\)', r'(\g<1>\g<2>)',
                               _text, flags=re.M)

                if len(_text.strip(' \n')) > 0:
                    if len(re.findall(r'<<NEWFONT>>', _text)) == 1:
                        _text = re.sub(r'<<NEWFONT>>', '\n', _text)
                    parsed.append(_text)

            parsed2 = [parsed[0]]
            for i, p in enumerate(parsed):
                if i > 0:
                    if re.search(r'^\s*\n*[a-z]', p) is not None:
                        parsed2[i - 1] += p
                        p = ''
                    parsed2.append(p)
            parsed2 = '\n===================================\n'.join(
                [p for p in parsed2 if p != ''])

            print(parsed2[:50])
            return ('.txt', parsed2)
    except Exception:
        return ('.raw.xml', pdf_data)
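A hypothetical call; the URL stem is illustrative (get_text() appends 'v1.full.pdf' itself), and the module-level laparams default must exist:

ext, payload = get_text('https://www.biorxiv.org/content/10.1101/2020.01.01.000000')
mode = 'wb' if isinstance(payload, bytes) else 'w'
with open('paper' + ext, mode) as f:
    f.write(payload)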
Example #19
0
def read_pdf(filename):
    results = []

    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    print(u'Locating and parsing the data')
    content = ""

    try:
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print(level, title)
    except PDFNoOutlines:
        print(u'No outlines found')

    # create a PDF resource manager object to store shared resources
    rsrcmgr = PDFResourceManager()
    # set the layout analysis parameters
    laparams = LAParams()
    # create a PDF device object
    # device=PDFDevice(rsrcmgr)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    current_section = ""  # use for save tag
    last_section = ""  # use for save title
    current_char_size = 0
    current_fontname = ""
    debug_line = False
    debug_section = False
    debug_title = True
    # process each page
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # receive the LTPage object for this page
        layout = device.get_result()
        for listItem in layout:
            if isinstance(listItem, LTTextBox):
                # print("LTTextBox: %s" % listItem.get_text().encode('utf-8')+'\n')
                for textLine in listItem:
                    if isinstance(textLine, LTTextLine):
                        if debug_line:
                            if (len(textLine.get_text().strip())):
                                print("LTTextLine: %s" %
                                      textLine.get_text().encode('utf-8') +
                                      '\n')
                        for char in textLine:
                            if isinstance(char, LTChar):
                                #if(len(char.get_text().strip()) > 0):
                                #    print(" LTChar: %s size:%s font: %s " %  (char,char.size,char.fontname))

                                # new section ?
                                if ((abs(char.size - current_char_size) > 0.00001
                                     and len(current_section.strip()) > 0)
                                        or current_fontname != char.fontname):
                                    #print("    size chang from : %s  to: %s %s" % (char.size, current_char_size,char))
                                    try:
                                        if debug_section:
                                            print(u"current_section : %s" %
                                                  (current_section))
                                            print(u"last_section : %s" %
                                                  (last_section))
                                    except UnicodeEncodeError:
                                        clr.print_red_text(
                                            "UnicodeEncodeError")

                                    # print("section: %s : %f - %f" %  (current_section,current_char_size,char.size))
                                    # print(" %s - %s " %  (type(current_char_size),type(char.size)))
                                    current_section = current_section.strip()

                                    if current_section.startswith(
                                            "<KPOC-REQ"
                                    ) or current_section.startswith(
                                            "KPOC-REQ"):
                                        # this is req tag
                                        if current_section.startswith(
                                                "KPOC-REQ"):
                                            current_section = "<" + current_section
                                        if current_section.count(
                                                ".......") > 0:
                                            # index , new result
                                            result = {
                                                "Section": "",
                                                'Title': None,
                                                'ReqId': None,
                                                'Content': "",
                                            }

                                            ReqId_end = current_section.find(
                                                '>')
                                            Title_end = current_section.find(
                                                '............')
                                            ReqId = current_section[
                                                0:ReqId_end + 1]
                                            Title = current_section[
                                                ReqId_end + 1:Title_end]
                                            Section = last_section
                                            result["Title"] = Title.strip()
                                            result["ReqId"] = ReqId.strip()
                                            result["Section"] = Section.strip()
                                            results.append(
                                                result)  # save the result
                                            if debug_title:
                                                print(u"Text: %s" %
                                                      (current_section))
                                                print(u"Section: %s" %
                                                      (Section))
                                                print(u"ReqId: %s" % (ReqId))
                                                print(u"Title: %s" % (Title))
                                                print(u"\n")
                                        else:
                                            # not index , old result
                                            ReqId_end = current_section.find(
                                                '>')
                                            ReqId = current_section[
                                                0:ReqId_end + 1]
                                            for result in results:
                                                if result["ReqId"] == ReqId:
                                                    Content_Begin = current_section.find(
                                                        '> ') + 2
                                                    result[
                                                        "Content"] = current_section[
                                                            Content_Begin:]
                                                    #exit(0)

                                        result = None
                                        # clean result
                                        current_section = ""  # remove the title from next content
                                        if debug_section:
                                            print(
                                                u"current_section clear for ReqId"
                                            )
                                    else:
                                        # endif with current_section.startswith
                                        # after all , when a new section is found , we should clean this
                                        if (len(current_section.strip())):
                                            last_section = current_section  # section number maybe here
                                        current_section = ""
                                        if debug_section:
                                            print(
                                                u"current_section clear for Not Found ReqId"
                                            )
                                            #print(u"last_section : %s" % (last_section))

                                # same section
                                current_section += char.get_text()
                                current_char_size = char.size
                                current_fontname = char.fontname

    # save the last paragraph
    # NOTE: `content` is never appended to anywhere above, so the last
    # result's Content is set to an empty string here.
    if len(results) > 0:  # skip the first one
        content = content.strip()
        results[-1]["Content"] = content  # save the last one's content
    content = ""  # empty the content

    fp.close()

    print("count %d" % len(results))
    return results
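A hypothetical run of the requirements reader above (the filename is illustrative; the snippet also assumes a clr helper with print_red_text()):

for req in read_pdf('spec.pdf'):
    print("%s %s (%s)" % (req["ReqId"], req["Title"], req["Section"]))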
Example #20
0
def main(argv):
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
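The excerpt ends with a bare return; the conventional wiring for a pdf2txt-style entry point (not shown in the source) would be:

if __name__ == '__main__':
    sys.exit(main(sys.argv))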
Example #21
0
def convert_to_text(fname):
    pagenums = set()  # no page filter: extract every page

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()

    text_list = text.split('\n')
    txt = text_list[:3]
    text = ' '.join(text_list[3:])

    print("###################")
    print(txt)

    ## spliting word from string

    word_list = text.split(' ')
    string_input = ""
    flag = 0
    for word in word_list:
        # print("*********")
        # print(word)

        if (word.lower() == 'tran'):
            break
        else:
            if (word.lower() == 'customer' or word.lower() == 'scheme'
                    or word.lower() == 'currency' or word.lower() == 'for'):
                word = '\n' + word

            elif (word.lower() == 'statement'):
                word = '\n' + word
                flag = 1
            elif (word.lower() == 'account' and flag == 1):
                word = '\n' + word

        string_input += word + " "
    print("::::::::::::::::::::::")
    # print(string_input)

    file_name = fname.split('/')[-1]
    file_name = file_name.split('.')[0]
    # print(file_name)

    # write Content to .txt
    text_file = open(
        "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
        + file_name + ".txt", "w")
    text = re.sub("\s\s+", " ", text)

    text_file.write("%s" % text)
    text_file.close()
    file_name_main = "output_" + file_name + ".csv"
    csv_file = open(
        "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/" +
        file_name_main, "w")
    text = re.sub("\s\s+", " ", string_input)
    csv_file.write("%s" % string_input)
    csv_file.close()
    length_lines = len(string_input.split('\n'))
    # print("-----------",length_lines)
    convert_to_table(fname, string_input, txt)
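A hypothetical call (the input path is illustrative; note that the output .txt/.csv paths are hard-coded inside the function, and convert_to_table() must be defined elsewhere):

convert_to_text('/path/to/statement.pdf')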
Example #22
0
def parse():
    # open in binary-read mode
    fb = open(path, 'rb')
    # create a PDF parser
    parser = PDFParser(fb)
    # create a PDF document object
    doc = PDFDocument()

    # connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)

    # supply the initialization password;
    # if there is none, pass an empty string
    doc.initialize()
    obj = {}
    amount = 0
    # check whether the document allows text extraction; bail out if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # PDF resource manager for shared resources
        resource = PDFResourceManager()
        # layout analysis parameters
        laparam = LAParams()
        # page aggregator
        device = PDFPageAggregator(resource, laparams=laparam)
        # create the PDF interpreter
        interpreter = PDFPageInterpreter(resource, device)

        # iterate over the pages one at a time; doc.get_pages() yields the page list
        for index, page in enumerate(doc.get_pages()):
            # if index < 3:
            #     continue
            # if index == 4:
            #     break

            # let the page interpreter read the page
            interpreter.process_page(page)
            # fetch the aggregated content
            layout = device.get_result()

            for out in layout:
                if hasattr(out, "get_text"):
                    # print(out.get_text())
                    # strip the (cid:12)-style codes left by unrecognizable glyphs
                    t = re.sub(r'\(cid:[\d]*\)', '', out.get_text())
                    # strip special content such as digits and 's, 'm, 're, n't
                    tx = re.sub(r'(\d+|\'s|\'m|\'re|n\'t)', '', t)
                    # strip punctuation and collapse runs of spaces into one
                    txt = re.sub(
                        r'[\s+\?\.\!\/_,`:;\-$%^*\[\]\{\})(+\"\']+|[+——!,。?、‘’“”~@#¥%……&*():]+',
                        ' ', tx)
                    for word in txt.split():
                        # skip non-English words
                        if not is_english(word):
                            continue
                        # lowercase the word
                        w = word.lower()
                        amount += 1
                        obj[w] = obj.get(w, 0) + 1

    db = connect()
    # get a session cursor
    cursor = db.cursor()

    # create the table if it does not exist yet
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + tablename +
                   '(word varchar(255) NOT NULL, ' +
                   'count int NOT NULL, probability float NOT NULL, ' +
                   'PRIMARY KEY (word))')

    # empty the table so the previous run's results cannot skew this one
    cursor.execute('truncate table ' + tablename)
    for key in obj:
        # build one SQL statement
        sql = 'REPLACE INTO ' + tablename + ' (word, count, probability) VALUES(%s, %s, %s)'
        # execute it
        cursor.execute(sql,
                       (key, obj[key], round(obj[key] / amount * 10000, 2)))
        # commit
        db.commit()

    # close the database connection
    db.close()
    print("Total word count: %s" % amount)
Example #23
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename,
                                               "close"):  # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise IncorrectPasswordError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []

        investor_info = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical),
                                 layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
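A hypothetical call (filename and password are illustrative):

data = cas_pdf_to_text('cas_statement.pdf', password='PAN1234X')
print(data.file_type, len(data.lines))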
Example #24
0
    def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,
        parse_tree_cacher=None,
        laparams={
            'all_texts': True,
            'detect_vertical': True
        },
    ):
        # store input
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # set up input text formatting function, if any
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif normalize_spaces:
            r = re.compile(r'\s+')
            self.input_text_formatter = lambda s: re.sub(r, ' ', s)
        else:
            self.input_text_formatter = None

        # open doc
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")

        parser = PDFParser(file)
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # as of pdfminer==20140328, "PDFDocument.initialize() method is
            # removed and no longer needed."
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        if parse_tree_cacher:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)
        else:
            self._parse_tree_cacher = DummyCache()

        # set up layout parsing
        rsrcmgr = PDFResourceManager()
        if isinstance(laparams, dict):
            laparams = LAParams(**laparams)
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

        # caches
        self._pages = []
        self._pages_iter = None
        self._elements = []
Example #25
0
# Open a PDF file.
fp = open(f'{file_path}/{input_filename}', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
resource_manager = PDFResourceManager()

# Create a PDF device object.
device = PDFDevice(resource_manager)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(resource_manager, device)

# Process each page contained in the document.
pdf_text = ''
for page in PDFPage.get_pages(fp, set(), maxpages=0):

    # run the interpreter over the page, then grab the first content stream
    interpreter.process_page(page)
    page_data = page.contents[0].data
    if page_data:
        # (the snippet is truncated here in the source; presumably the raw
        # stream bytes would be decoded and accumulated into pdf_text)
        pass
Example #26
0
def pdf_to_csv(filename):
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict

            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec)  #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x]
                                          for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    # doc.set_parser(parser)
    # doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)
    pagenos = set()
    rotation = 0
    i = 1
    for page in PDFPage.get_pages(fp, pagenos):
        page.rotate = (page.rotate + rotation) % 360
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)
        i += 1

    # for i, page in enumerate(doc.get_pages()):
    # outfp.write("START PAGE %d\n" % i)
    #     if page is not None:
    #         interpreter.process_page(page)
    #     outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()
    return outfp.getvalue()
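A hypothetical call (Python 2, matching the cStringIO import inside the function):

csv_text = pdf_to_csv('table.pdf')
print csv_text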
Example #27
0
File: pdf.py Project: guix77/weboob
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([
            list(lttext_to_multilines(obj, page_layout))
            for obj in page_layout._objs
            if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
        ], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [
            lt_to_coords(obj, page_layout) for obj in page_layout._objs
            if isinstance(obj, (LTRect, LTLine, LTCurve))
        ]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows),
                     sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127,
                                            255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings',
                     sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127,
                                            255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1),
                              '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
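A sketch of consuming the generator (Python 2, as in the weboob source; the filename is illustrative, and each cell is a list of strings):

with open('statement.pdf', 'rb') as f:
    data = f.read()
for page_rows in get_pdf_rows(data):
    for row in page_rows:
        print [u' '.join(cell) for cell in row]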
Example #28
0
    def __init__(self, pdf_stream, password='', pagenos=[], maxpages=0):
        ReaderBackend.__init__(self)
        self.pdf_stream = pdf_stream

        # Extract Metadata
        parser = PDFParser(pdf_stream)
        doc = PDFDocument(parser, password=password, caching=True)
        if doc.info:
            for k in doc.info[0]:
                v = doc.info[0][k]
                # print(repr(v), type(v))
                if isinstance(v, (bytes, str, unicode)):
                    self.metadata[k] = make_compat_str(v)
                elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                    self.metadata[k] = make_compat_str(v.name)

        # Secret Metadata
        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # print(metadata)  # The raw XMP metadata
            # print(xmp_to_dict(metadata))
            self.metadata.update(xmp_to_dict(metadata))
            # print("---")

        # Extract Content
        text_io = BytesIO()
        rsrcmgr = PDFResourceManager(caching=True)
        converter = TextConverter(rsrcmgr,
                                  text_io,
                                  codec="utf-8",
                                  laparams=LAParams(),
                                  imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, converter)

        self.metadata["Pages"] = 0
        self.curpage = 0
        for page in PDFPage.get_pages(self.pdf_stream,
                                      pagenos=pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=True,
                                      check_extractable=False):
            # Read page contents
            interpreter.process_page(page)
            self.metadata["Pages"] += 1
            self.curpage += 1

            # Collect URL annotations
            # try:
            if page.annots:
                refs = self.resolve_PDFObjRef(page.annots)
                if refs:
                    if isinstance(refs, list):
                        for ref in refs:
                            if ref:
                                self.references.add(ref)
                    elif isinstance(refs, Reference):
                        self.references.add(refs)

            # except Exception as e:
            # logger.warning(str(e))

        # Remove empty metadata entries
        self.metadata_cleanup()

        # Get text from stream
        self.text = text_io.getvalue().decode("utf-8")
        text_io.close()
        converter.close()
        # print(self.text)

        # Extract URL references from text
        for url in extractor.extract_urls(self.text):
            self.references.add(Reference(url, self.curpage))

        for ref in extractor.extract_arxiv(self.text):
            self.references.add(Reference(ref, self.curpage))

        for ref in extractor.extract_doi(self.text):
            self.references.add(Reference(ref, self.curpage))
Example #29
0
def main(argv):
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               outdir=outdir,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
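
A typical command-line entry point for a main like this (a common convention, assumed rather than shown in the original snippet):

if __name__ == '__main__':
    sys.exit(main(sys.argv))

Invoked as, for example, python pdf2txt.py -o out.html input.pdf, the output type is then inferred from the .html extension.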
Example #30
import getopt
import glob
import sys
# Assumed available from the enclosing module: the pdfminer classes used
# below (PDFDocument, PDFParser, CMapDB, PDFResourceManager,
# PDFPageInterpreter, PDFDevice, PDFPage, LAParams, ImageWriter,
# TextConverter, XMLConverter, HTMLConverter, TagExtractor).
def main(argv):
    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a '
              'file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes '
              'if it contains a space\n\nAdditional options are supported with '
              'named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
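    # note: outtype defaults to 'tag' above, so this extension-sniffing
    # branch is effectively dead code unless '-t' was given an empty value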
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # note: this pre-loop output handle is overwritten for every input
        # file inside the loop below, so it is effectively unused
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout

    for fname in args:
        l = glob.glob(fname)
        count = len(l)
        print('Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in l:
            # print(pdf)
            d = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
            if outtype not in d:
                return usage()  # guard against a KeyError on an unknown -t value
            ext = '.' + d[outtype]
            outfile = pdf[0:-4] + ext  # assumes a '.pdf' suffix on the input name
            print(outfile)
            outfp = open(outfile, 'wb')
            if outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                      imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                       layoutmode=layoutmode, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'tag':
                device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
            else:
                return usage()
    
            fp = open(pdf, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
            fp.close()
            device.close()
            outfp.close()

        print('Done')
    return
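
As in the previous example, a conventional entry point (assumed, not part of the original snippet):

if __name__ == '__main__':
    sys.exit(main(sys.argv))

Invoked as, say, python pdf2htm.py "*.pdf", it expands the wildcard itself via glob and writes a .tag file next to each matching PDF (outtype defaults to 'tag' above; pass -t html for .htm output).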