예제 #1
0
def scrapePDFs(fullnames, profs, facconsulted):
    """Extract text from each candidate's PDF and record which professors
    are mentioned in it.

    :param fullnames: iterable of candidate names; "First Last" must map to
        a ``Candidates/First_Last_*.pdf`` file.
    :param profs: iterable of professor names to search for.
    :param facconsulted: array-like parallel to ``fullnames`` with extra
        "faculty consulted" text appended to each PDF's text (NaN -> "").
    :return: object ndarray (one entry per candidate) of comma-separated
        professor names found in that candidate's PDF.
    """
    files = glob.glob("Candidates/*.pdf")
    # "First_Last_rest.pdf" -> "First,Last" keys for matching against names.
    havefiles = [",".join(os.path.split(f)[1].split("_")[:2]) for f in files]
    needfiles = [re.sub(r"[\s']", "", n) for n in fullnames]
    assert len(set(needfiles) -
               set(havefiles)) == 0, "Missing files for: {}".format(
                   set(needfiles) - set(havefiles))

    # Reorder the file list to match the order of fullnames.
    needfiles = np.array(needfiles)
    havefiles = np.array(havefiles)
    inds = np.array([np.where(havefiles == n)[0][0] for n in needfiles])

    havefiles = havefiles[inds]
    files = np.array(files)[inds]

    out = np.zeros(files.shape, dtype=object)
    badpages = []  # files containing at least one unparsable page

    # Map punctuation, curly apostrophes and newlines to spaces so name
    # matching is robust to formatting.
    trans = str.maketrans(string.punctuation + "’\n",
                          " " * (len(string.punctuation) + 2))
    profstrans = np.array(
        [" {} ".format(prof.lower().translate(trans)) for prof in profs])

    facconsulted = facconsulted.astype(str)
    facconsulted[facconsulted == "nan"] = ""

    # Perform layout analysis for all text
    laparams = pdfminer.layout.LAParams()
    setattr(laparams, "all_texts", True)

    for ii, fname in enumerate(files):
        print("%d/%d: %s" % (ii, len(files), fname))

        rsrcmgr = PDFResourceManager()
        outfp = io.StringIO("")
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)

        with open(fname, "rb") as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for j, page in enumerate(
                    PDFPage.get_pages(fp, check_extractable=True)):
                try:
                    interpreter.process_page(page)
                except Exception:
                    # Narrowed from a bare except (which also swallowed
                    # KeyboardInterrupt); keep going on bad pages.
                    print("Unparsable page encountered.")
                    if fname not in badpages:
                        badpages.append(fname)

        txt = outfp.getvalue()
        device.close()
        outfp.close()
        txt = txt + " " + facconsulted[ii] + " "

        # Normalize once per document instead of once per professor
        # (the original re-lowered/translated the whole text per prof).
        txtnorm = txt.lower().translate(trans)
        tmp = [profs[jj] for jj, prof in enumerate(profstrans)
               if prof in txtnorm]

        out[ii] = ", ".join(tmp)

    return out
예제 #2
0
def main(argv):
    """Command-line driver: convert PDF files to text/html/xml/tag output.

    :param argv: sys.argv-style list; argv[0] is the program name, the rest
        are getopt-style options followed by one or more PDF file names.
    :return: 100 on a usage error, otherwise None.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        # BUG FIX: '-n' sets laparams to None; the original crashed with
        # AttributeError if any layout option followed '-n', so guard them.
        elif k == '-A' and laparams: laparams.all_texts = True
        elif k == '-V' and laparams: laparams.detect_vertical = True
        elif k == '-M' and laparams: laparams.char_margin = float(v)
        elif k == '-L' and laparams: laparams.line_margin = float(v)
        elif k == '-W' and laparams: laparams.word_margin = float(v)
        elif k == '-F' and laparams: laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
            laparams=laparams, outdir=outdir, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    # Close the device and output file even if a PDF fails to process.
    try:
        for fname in args:
            with io.open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
    finally:
        device.close()
        if close_outfp:
            outfp.close()
예제 #3
0
import re
import numpy as np

from bs4 import BeautifulSoup
import bs4 as bs
import urllib.request

from delver import Crawler
C = Crawler()  # shared crawler instance for fetching remote documents
CWD = os.getcwd()  # NOTE(review): `os` is not imported in the visible chunk

from io import StringIO
import io

# Module-level pdfminer objects shared by the helper(s) below.
# NOTE(review): a single StringIO/TextConverter pair is reused across
# calls, so extracted text accumulates in `retstr` between conversions.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
laparams = LAParams()
codec = 'utf-8'
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)


def convert_pdf_to_txt(content):
    """Wrap *content* in a BytesIO stream and open it with pdfminer.

    :param content: either a requests-style response object (with a
        ``.content`` bytes attribute) or raw PDF bytes.

    NOTE(review): this example looks truncated — ``write_text`` is
    initialised but never filled or returned in the visible code.
    """
    try:
        # Prefer a response object's .content attribute ...
        pdf = io.BytesIO(content.content)
    except:  # NOTE(review): bare except; AttributeError is the expected case
        # ... otherwise treat the argument as raw bytes.
        pdf = io.BytesIO(content)
    parser = PDFParser(pdf)
    document = PDFDocument(parser, password=None)  # this fails
    write_text = ''
예제 #4
0
        def end_page(self, i):
            # Flush the characters collected for page *i*: bucket every
            # LTChar by its (negated) y coordinate so rows come out
            # top-to-bottom, then emit each row's characters in x order.
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item._objs:                #<-- changed
                if isinstance(child, LTChar):
                    (_,_,x,y) = child.bbox                   
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec) #<-- changed
            # NOTE(review): line[x] holds bytes, but "".join below joins
            # onto a str — on Python 3 this raises TypeError. This looks
            # like Python 2 code; on Python 3 drop the .encode() above.
            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write("".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    # Legacy pdfminer (pre-2013) document setup: the parser and document
    # are created separately and cross-linked by hand.
    doc = PDFDocument()
    fp = open(case.pdf_name, 'rb')
    parser = PDFParser(fp)       
    parser.set_document(doc)     
    doc.set_parser(parser)       
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)


    # NOTE(review): the loop body is truncated in the visible chunk — only
    # the page marker write is shown.
    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
예제 #5
0
def pdf_to_txt(pdf_name):
    """Extract the text of a PDF in visual reading order and save it to a
    .txt file alongside the input.

    :param pdf_name: path to the input PDF; output goes to the same path
        with the ".pdf" suffix replaced by ".txt".
    """

    # Variables

    sensitivity = 3  # Distance for lines to count as same line
    header = 750  # Cut off text above this height
    footer = 80  # Cut off text below this height

    # Functions

    def order_pdf_textboxes(pdf_data, sensitivity, header, footer):
        # Group fragments whose baselines are within `sensitivity` of each
        # other and return the grouped text top-to-bottom, left-to-right.
        df = pd.DataFrame(pdf_data)
        df.columns = ['x1', 'y1', 'x2', 'y2', 'Text']
        df['x1'] = pd.to_numeric(df.x1)
        df['x2'] = pd.to_numeric(df.x2)
        df['y1'] = pd.to_numeric(df.y1)
        df['y2'] = pd.to_numeric(df.y2)
        df = splitDataFrameList(df, 'Text', '\n')
        df = df.sort_values(['y2_new'], ascending=False).reset_index(drop=True)
        df.insert(0, 'Group', range(-1, -1 - len(df), -1))
        i = 0
        for index, row in df.iterrows():
            i = i + 1
            try:
                if abs(df.iloc[index]['y2_new'] -
                       df.iloc[index + 1]['y2_new']) < sensitivity:
                    # DataFrame.set_value() was removed in pandas 1.0;
                    # .at[] is the supported scalar setter.
                    df.at[index, 'Group'] = i
                    df.at[index + 1, 'Group'] = i
            except IndexError:
                # index + 1 ran past the last row: nothing left to pair.
                pass
        df = df.sort_values(['x1'], ascending=True).reset_index(drop=True)
        df1 = df.groupby('Group', as_index=False).agg({
            'y2_new': 'first',
            'x1': 'first'
        })
        df = df.groupby([
            'Group'
        ])['Text'].apply(lambda x: ' '.join(x.astype(str))).reset_index()
        df['y2_new'] = df1['y2_new']
        df = df.sort_values(['y2_new'], ascending=False)
        # Drop running headers/footers by vertical position.
        df = df[df.y2_new > footer]
        df = df[df.y2_new < header]
        return df['Text'].tolist()

    def splitDataFrameList(df, target_column, separator):
        # Explode multi-line text boxes into one row per line, assigning
        # each line an interpolated y position within the original box.
        def splitListToRows(row, row_accumulator, target_column, separator):
            split_row = row[target_column].split(separator)
            del split_row[-1]
            i = 0
            for s in split_row:
                new_row = row.to_dict()
                new_row[target_column] = s
                line_height = (new_row['y2'] -
                               new_row['y1']) / (len(split_row))
                new_row['y2_new'] = new_row['y2'] - (i * line_height)
                new_row['y1_new'] = new_row['y2'] - ((i + 1) * line_height)
                i = i + 1
                row_accumulator.append(new_row)

        new_rows = []
        df.apply(splitListToRows,
                 axis=1,
                 args=(new_rows, target_column, separator))
        new_df = pd.DataFrame(new_rows)
        return new_df

    def extract_from_element(x):
        # Return [x1, y1, x2, y2, text] for a layout element, normalising
        # common Unicode punctuation to ASCII equivalents.
        text = x.get_text()
        text = re.sub('"', "'", str(text))
        reps = (("\u201c", '"'), ("\u201d", '"'), ("\u2013", '-'),
                ("\u2019", "'"), ("\uf06c", '-'), ("\u2122", '(TM)'),
                ("\uf0b7", '-'), ("\u01b7", '3'), ("\u0e00", ' '),
                ("(cid:149)", 'x'))
        text = reduce(lambda a, kv: a.replace(*kv), reps, text)
        # The bbox coordinates live in the element's repr: "<... x1,y1,x2,y2 ...>".
        dims = str(x).split(' ')[1].split(',')
        return dims + [text]

    def list_to_txt(lists, fname):
        # Write one extracted string per line, preserving the original
        # str(bytes) escaping with the leading b' and trailing ' stripped.
        # (with-statement fixes the original's unclosed file handle.)
        with open(fname.replace(".pdf", ".txt"), 'w') as thefile:
            for item in lists:
                item = str(item.encode("utf-8"))
                item = item[2:-1]
                thefile.write("%s\n" % item)

    # PDF extract code

    with open(pdf_name, 'rb') as document:
        # Create resource manager and layout-analysis parameters.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # Page aggregator yields an LTPage layout object per page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pdf_full = []
        # Loop through the pages
        for page in PDFPage.get_pages(document):
            pdf_data = []
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            # Keep only horizontal text boxes.
            for element in layout:
                if "LTTextBoxHorizontal" not in str(element):
                    continue
                pdf_data.append(extract_from_element(element))
            pdf_full = pdf_full + order_pdf_textboxes(pdf_data, sensitivity,
                                                      header, footer)

    list_to_txt(pdf_full, pdf_name)
예제 #6
0
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local);
        an io.BytesIO instance is treated as an already-downloaded PDF
    :return: iterator of string of extracted text (one item per page)
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/

    def _pages(fh):
        # Shared per-page extraction logic; the original duplicated this
        # entire body for the local-file and in-memory branches.
        try:
            for page in PDFPage.get_pages(
                    fh,
                    caching=True,
                    check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                try:
                    page_interpreter = PDFPageInterpreter(
                        resource_manager,
                        converter
                    )
                    page_interpreter.process_page(page)
                    yield fake_file_handle.getvalue()
                finally:
                    # Close handles even if process_page raises or the
                    # consumer abandons the generator mid-iteration
                    # (the original leaked them in both cases).
                    converter.close()
                    fake_file_handle.close()
        except PDFSyntaxError:
            return

    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages(fh)
    else:
        # extract text from remote (already-downloaded) pdf file
        yield from _pages(pdf_path)
예제 #7
0
def _decodepdf(filename, debug=False):
    """Convert the inmate-census PDF to text and normalise its layout.

    :param filename: path to the PDF (ignored when debug is True).
    :param debug: when True, read previously converted text from
        rawfile.txt instead of re-parsing the PDF.
    :return: tuple of (preprocessed text, True).
    """
    pdfstr = ""
    if debug:
        report("Debug mode enabled, reading inmates from rawfile.txt ...")
        with open("rawfile.txt", "r") as rawfile:
            pdfstr = rawfile.read()
        report("... Done reading inmates from disk.")
    else:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        report("Converting PDF to text ...")
        # BUG FIX: file() is a Python 2 builtin removed in Python 3; open()
        # works on both, and the context manager closes the handle even if
        # process_pdf raises.
        with open(filename, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()
        pdfstr = retstr.getvalue()
        retstr.close()
        report("... Done converting PDF to text.")

        report("Saving converted PDF text to rawfile.txt ...")
        with open("rawfile.txt", "w") as rawfile:
            rawfile.write(pdfstr)
        report("... Done saving converted PDF text.")

    report("Preprocessing PDF Text ...")

    # Force each labelled field onto its own line.
    labels = [
        'Book Dt:', 'Book Typ:', 'Cus Typ:', 'Bail:', 'Bond:', 'Court:',
        'Judge:', 'Exp Rls:', 'Arr Agy:', 'Arr Typ:', 'ROC:', 'Chg:',
        'Indict:', 'Adj Dt:', 'Term:'
    ]

    for label in labels:
        pdfstr = pdfstr.replace(label, "\n{0} ".format(label))

    # preprocess to get rid of duplicate spaces and \n's
    # (regexes below are raw strings so backslash escapes go to re, not Python)
    pdfstr = re.sub(r' +', ' ', pdfstr)
    pdfstr = re.sub(r'\n+', '\n', pdfstr)

    # handle current sentence going to next line
    # (time from custody date/time + first three capital letters form sentence type)
    pdfstr = re.sub(r'([0-9]{4})(?: )?\n([A-Z]{3})', r'\1 \2', pdfstr)

    # handle inmate ID not being on same line as inmate data
    # (in some cases mcid, sex, race, and dob can all be on different lines ...)
    pdfstr = re.sub(
        r'([0-9]{6})(?: )?(?:\n)?([A-Z])(?: )?(?:\n)?([A-Z])(?: )?(?:\n)?([0-9]{2}-[0-9]{2}-[0-9]{2})',
        r'\1 \2 \3 \4', pdfstr)

    # remove page header
    pdfstr = re.sub(
        r'Current Census for Date: [0-9]{2}-[0-9]{2}-[0-9]{4}(?: )?(?:\n)?', '',
        pdfstr)
    pdfstr = re.sub(r'Name(?: )?(?:\n)?Location(?: )?(?:\n)?', '', pdfstr)
    pdfstr = re.sub(r'MCJ Sex Rce DOB(?: )?(?:\n)?', '', pdfstr)
    pdfstr = re.sub(
        r'Custody(?: )?(?:\n)?Time(?: )?(?:\n)?Date(?: )?(?:\n)?Classification(?: )?(?:\n)?',
        '', pdfstr)
    pdfstr = re.sub(r'Min(?: )?(?:\n)?Rel(?: )?(?:\n)?Date(?: )?(?:\n)?', '',
                    pdfstr)

    # remove page footers
    pdfstr = re.sub(r'Facility:(?: )?(?:\n)?', '', pdfstr)
    pdfstr = re.sub(r'Page \d+ of \d+(?: )?(?:\n)?', '', pdfstr)
    pdfstr = re.sub(r'Printed:(?: )?\n([0-9]{2}-[0-9]{2}-[0-9]{4}) [0-9]{4}',
                    '', pdfstr)

    # page-to-page formatting issue: strip form feeds
    pdfstr = re.sub('\x0C(?: )?(?:\n)?', '', pdfstr)

    # post-process to get rid of duplicate spaces and \n's
    pdfstr = re.sub(r' +', ' ', pdfstr)
    pdfstr = re.sub(r'\n+', '\n', pdfstr)

    report("... Done Preprocessing PDF Text.")

    return pdfstr, True
예제 #8
0
def get_text_from_pdf(pdfname, limit=1000):
    """Extract text from a PDF and reassemble it into paragraph chunks.

    :param pdfname: path to the PDF; '' or an unopenable path returns ''.
    :param limit: approximate character count at which a chunk is cut.
    :return: list of text chunks, or '' on failure (matching the original
        early-return behaviour).
    """
    # No file name given: return an empty string.
    if (pdfname == ''):
        return ''
    else:
        # Open the PDF; give up quietly if it cannot be opened.
        try:
            fp = open(pdfname, 'rb')
        except OSError:
            return ''

    # Extract raw text with pdfminer (vertical text detection enabled).
    rsrcmgr = PDFResourceManager()
    out_fp = StringIO()
    la_params = LAParams()
    la_params.detect_vertical = True
    device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos=None,
                                  maxpages=0,
                                  password=None,
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = out_fp.getvalue()
    fp.close()
    device.close()
    out_fp.close()

    # Split on newlines.
    lines = text.splitlines()

    outputs = []
    output = ""

    # UTF-8 byte sequences to strip from every line.
    replace_strs = [b'\x00']

    is_blank_line = False

    for line in lines:

        # Round-trip through UTF-8 to drop the unwanted bytes.
        line_utf8 = line.encode('utf-8')
        for replace_str in replace_strs:
            line_utf8 = line_utf8.replace(replace_str, b'')
        line = line_utf8.decode()

        # Collapse runs of spaces and trim the ends.
        line = re.sub("[ ]+", " ", line)
        line = line.strip()

        # Skip blank lines (but remember we saw one).
        if len(line) == 0:
            is_blank_line = True
            continue

        # Skip lines that are just a number.
        if is_float(line):
            continue

        # Skip single-word lines that do not end a sentence.
        # BUG FIX: the original wrote `line.split(" ").count == 1`, which
        # compares the bound method to 1 and is always False.
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue

        # Paragraph boundary: blank line seen or previous text ends in '.'.
        if is_blank_line or output.endswith("."):
            # Cut the chunk here once it exceeds the limit.
            if (len(output) > limit):
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        # Continuation of a hyphenated word from the previous line.
        elif not is_blank_line and output.endswith("-"):
            output = output[:-1]
        # Otherwise join with a single space as a word break.
        else:
            output += " "

        output += str(line)
        is_blank_line = False

    outputs.append(output)
    return outputs
예제 #9
0
def parse(fileName):
    """Read a production-order PDF, extract hook/eye order data from each
    relevant page, stamp QR codes onto copies of those pages, and save the
    result as a new PDF.

    :param fileName: base name (without extension) of a PDF under upload_path.
    :return: file name of the generated PDF under download_path.
    """
    text_path = upload_path + fileName + ".pdf"
    hmPdfSaveName = ""
    fileOpen = open(text_path, 'rb')
    # Legacy pdfminer setup: parser and document are cross-linked by hand.
    doc = PDFDocument()
    parser = PDFParser(fileOpen)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    # Readers over the source file (one for eye pages, one for hook pages).
    hmPdfReaderEye = PyPDF2.PdfFileReader(fileOpen)
    hmPdfReaderHook = PyPDF2.PdfFileReader(fileOpen)
    # Writer that accumulates the generated output pages.
    hmPdfWriter = PyPDF2.PdfFileWriter()
    # Abort if the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager shares fonts/images across pages.
        rsrcmgr = PDFResourceManager()
        # Page aggregator device.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        openFileArr = []
        allPages = doc.get_pages()
        for page in allPages:
            interpreter.process_page(page)
            layout = device.get_result()

            # Collect the page's horizontal text into one string.
            textValueArr = []
            for x in layout:
                if (isinstance(x, LTTextBoxHorizontal)):
                    textValueArr.append(x.get_text())
            pdfTxt = ''.join(textValueArr)
            textValueArr.clear()

            dpText = bgGetProductDp(pdfTxt)
            if dpText != 'CHE':
                # Not a back-hook department production order; skip page.
                continue

            # init
            hmPd = HmProduct()
            # Production department
            hmPd.productDp = dpText
            # Production order number
            hmPd.productCasNum = bgGetProductInvoicesNum(pdfTxt)
            # Product number
            hmPd.productNum = bgGetProductNumber(pdfTxt, hmPd)
            # Hook/eye specification, row count, B count, size
            hmPd.productSf = bgGetProductSpecification(pdfTxt, hmPd)
            # Sales order number
            hmPd.productSealNum = bgGetProductSealNum(pdfTxt)
            # Order quantity / unit
            hmPd.productCount = getHmProductCount(pdfTxt, hmPd)
            # Customer number
            hmPd.productGuest = bgGetProductGuest(pdfTxt)
            # Product batch
            hmPd.productBatch = bgGetProductBatch(pdfTxt)
            # Product description / color notes / extra notes / cloth tube / cut style
            hmPd.productRamk = bgGetProductDetilRamk(pdfTxt, hmPd)
            # Generate the production-order UUID
            hmPd.hm_pd_uuid = bgGetPageMd5(hmPd)
            # Handle special orders
            getHmHookEyeIsSpecial(hmPd)
            # Row / grain counts and yardage per product
            hmPd.productEPill = countHmPillx('E', hmPd)
            hmPd.productHPill = countHmPillx('H', hmPd)
            hmPd.productEYard = countHmYard(hmPd.productEPill, hmPd)
            hmPd.productHYard = countHmYard(hmPd.productHPill, hmPd)
            # Hook/eye weights in kilograms
            hmPd.productEyeKG = bgGetHookEyeKg('E', hmPd)
            hmPd.productHookKG = bgGetHookEyeKg('H', hmPd)
            # Update the order date from the master sheet
            updateOutDate(hmPd, pdfTxt)

            # ---------------- 1. Generate output pages - START ---------------- #
            payStr = ''
            layoutPageId = layout.pageid - 1

            # Generate the [eye] page
            if hmPd.clothTubeEye > 0:
                eyeNewPage = hmPdfReaderEye.getPage(layoutPageId)
                ePatch = hmCreateQRCode(hmPd, 'E', payStr)
                eMarkFile = open(ePatch, 'rb')
                pdfECodePage = PyPDF2.PdfFileReader(eMarkFile)
                eyeNewPage.mergePage(pdfECodePage.getPage(0))
                hmPdfWriter.addPage(eyeNewPage)
                openFileArr.append(eMarkFile)
                del eyeNewPage
                del pdfECodePage
                gc.collect()
            # Generate the [hook] page
            if hmPd.clothTubeHook > 0:
                hookNewPage = hmPdfReaderHook.getPage(layoutPageId)
                hPatch = hmCreateQRCode(hmPd, 'H', payStr)
                hMarkFile = open(hPatch, 'rb')
                pdfHCodePage = PyPDF2.PdfFileReader(hMarkFile)
                hookNewPage.mergePage(pdfHCodePage.getPage(0))
                hmPdfWriter.addPage(hookNewPage)
                openFileArr.append(hMarkFile)
                del hookNewPage
                del pdfHCodePage
                gc.collect()
            # Use the sales order number as the output file name
            if hmPdfSaveName == "":
                hmPdfSaveName = hmPd.productSealNum
            # ---------------- 1. Generate output pages - END ---------------- #

        # When done: save the output PDF, then close and delete the
        # temporary QR-code files.
        nowTime = datetime.datetime.now()
        nowTimeStr = nowTime.strftime("%Y%m%d%H%M%S_s")
        hmPdfSaveName = nowTimeStr + "_" + hmPdfSaveName + ".pdf"
        hmPdfSavePath = download_path + hmPdfSaveName
        resultPdfFile = open(hmPdfSavePath, 'wb')
        hmPdfWriter.write(resultPdfFile)
        for closeItem in openFileArr:
            closeItem.close()
            os.remove(closeItem.name)
        openFileArr.clear()
        resultPdfFile.close()

        fileOpen.close()
        return hmPdfSaveName
예제 #10
0
def ConvertPdf(pdfpath, outfp, opts=()):
    """Convert a PDF file to text/xml/html/tag output written to *outfp*.

    :param pdfpath: path of the PDF file to convert.
    :param outfp: writable file object receiving the converted output.
    :param opts: iterable of (flag, value) pairs in getopt style.
        (BUG FIX: the default was a mutable ``{}``; an empty tuple iterates
        identically — not at all — without the shared-mutable-default trap.)
    :return: True on success.
    :raises ValueError: if an unsupported ``-t`` output type is given
        (previously this crashed later with NameError on ``device``).
    """
    import sys
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        # BUG FIX: guard the layout options — '-n' sets laparams to None
        # and any layout option after it crashed with AttributeError.
        elif k == '-A' and laparams: laparams.all_texts = True
        elif k == '-V' and laparams: laparams.detect_vertical = True
        elif k == '-M' and laparams: laparams.char_margin = float(v)
        elif k == '-L' and laparams: laparams.line_margin = float(v)
        elif k == '-W' and laparams: laparams.word_margin = float(v)
        elif k == '-F' and laparams: laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
    if outtype == 'txt':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        raise ValueError('unsupported output type: %r' % outtype)
    # BUG FIX: file() is a Python 2 builtin; open() + context manager also
    # closes the handle if page processing raises.
    with open(pdfpath, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # Apply the requested extra rotation on top of the page's own.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    return True
예제 #11
0
def readfile(file):
    """Extract plain text from a file chosen by extension (.pdf/.caj, .doc,
    .docx, .htm/.html, .rtf, .txt); http(s)/ftp URLs are fetched via
    download().

    NOTE(review): the outer bare ``except: pass`` swallows every error
    (including KeyboardInterrupt) and makes the function return None on any
    failure; ``data`` is also never explicitly closed. Flagged, unchanged.
    """
    try:
        if file.startswith('https://') or file.startswith(
                'http://') or file.startswith('ftp://'):
            data = BytesIO(download(file))
        else:
            data = open(file, 'rb')

        if file.endswith('.caj') or file.endswith('.pdf'):
            with StringIO() as outfp:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, outfp)
                process_pdf(rsrcmgr, device, data)
                return outfp.getvalue()
        elif file.endswith('.doc'):
            text = ''
            document = olefile.OleFileIO(data)

            wordDocument = document.openstream('WordDocument').read()

            # Parsing the WordDocument Stream
            # See https://msdn.microsoft.com/en-us/library/office/dd904907(v=office.14).aspx
            # And http://b2xtranslator.sourceforge.net/howtos/How_to_retrieve_text_from_a_binary_doc_file.pdf

            # Loading the FIB (File Information Block)
            fib = wordDocument[:1472]

            # Loading and Parsing the piece table
            # NOTE(review): these slices are only 3 bytes wide
            # ([0x01A2:0x01A5]) though fcClx/lcbClx are 4-byte fields in
            # the spec — verify against MS-DOC.
            fcClx = int.from_bytes(fib[0x01A2:0x01A5], byteorder='little')
            lcbClx = int.from_bytes(fib[0x01A6:0x01A9], byteorder='little')

            # A flag bit selects which of the two table streams is current.
            tableFlag = ((int.from_bytes(
                fib[0x000A:0x000E], byteorder='little') & 0x0200) == 0x0200)
            tableName = ('0Table', '1Table')[tableFlag]

            table = document.openstream(tableName).read()

            clx = table[fcClx:fcClx + lcbClx]

            pos = 0
            pieceTable = ''
            lcbPieceTable = 0
            # Walk the CLX until the piece-table entry (type 2) is found.
            while True:
                if clx[pos] == 2:
                    # this entry is the piece table
                    lcbPieceTable = int.from_bytes(clx[pos + 1:pos + 5],
                                                   byteorder='little')
                    pieceTable = clx[pos + 5:pos + 5 + lcbPieceTable]
                    break
                elif clx[pos] == 1:
                    # skip this entry
                    # NOTE(review): on Python 3, clx[pos + 1] is already an
                    # int, so ord() raises TypeError — Python 2 idiom.
                    pos = pos + 1 + 1 + ord(clx[pos + 1])
                else:
                    break

            i = 1
            # NOTE(review): true division yields a float; the loop bound
            # below compares against it (// would be the exact form).
            pieceCount = (lcbPieceTable - 4) / 12
            while i <= pieceCount:
                # Character-position range covered by this piece.
                cpStart = int.from_bytes(pieceTable[i * 4:i * 4 + 4],
                                         byteorder='little')
                cpEnd = int.from_bytes(pieceTable[(i + 1) * 4:(i + 1) * 4 + 4],
                                       byteorder='little')

                offsetPieceDescriptor = int(((pieceCount + 1) * 4) + (i * 8))
                pieceDescriptor = pieceTable[
                    offsetPieceDescriptor:offsetPieceDescriptor + 8]

                fcValue = int.from_bytes(pieceDescriptor[2:6],
                                         byteorder='little')
                isANSII = (fcValue & 0x40000000) == 0x40000000
                fc = fcValue & 0xBFFFFFFF

                # ANSI pieces decode as cp1252 (1 byte/char); otherwise
                # UTF-16 with twice the byte count, per the tuple picks.
                encoding = ('utf-16', 'cp1252')[isANSII]
                cb = cpEnd - cpStart
                cb = (cb * 2, cb)[isANSII]
                text += wordDocument[fc:fc + cb].decode(encoding)

                i += 1

            return text
        elif file.endswith('.docx'):
            text = ''
            document = Document(data)

            # Paragraph text with a blank line between paragraphs.
            text += '\n\n'.join(
                [paragraph.text for paragraph in document.paragraphs])

            # Append table contents via the project helper.
            for table in document.tables:
                text += _parse_docx_table(table, text)

            return text
        elif file.endswith('.htm') or file.endswith('.html'):
            html = html2text.HTML2Text()
            html.ignore_links = True
            return html.handle(data.read().decode('utf-8'))
        elif file.endswith('.rtf'):
            with BytesIO() as outfp:
                document = Rtf15Reader.read(data)
                return PlaintextWriter.write(document, outfp).getvalue()
        elif file.endswith('.txt'):
            return data.read()
        else:
            raise Exception('Unknown file extension')
    except:  # NOTE(review): bare except — see docstring.
        pass
예제 #12
0
        # 3. Create the document parser (each step is annotated below).
        # NOTE(review): the matching except clause for this try is outside
        # the visible chunk, as is the enclosing scope defining `fp`.
        # Create a PDF parser object bound to the open file `fp`.
        try:
            parser = PDFParser(fp)
            # Create a PDF document object holding the document structure.
            # Pass a password to PDFDocument if the file is encrypted:
            #document = PDFDocument(parser, password)
            document = PDFDocument(parser)
            # Check that the document allows text extraction.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Create a resource manager for shared resources
            # (caching=False disables caching).
            rsrcmgr = PDFResourceManager(caching=False)
            # Layout-analysis parameters for the device.
            laparams = LAParams()
            # Create a PDF page aggregator device.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Create a PDF page interpreter.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process every page of the document.

            # doc.get_pages() is the legacy way to list pages:
            #for i, page in enumerate(document.get_pages()):
            # PDFPage.create_pages(document) is the modern alternative.
            replace = re.compile(r'\s+')
            # Iterate the page list, handling one page per iteration.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
예제 #13
0
파일: read_PDF.py 프로젝트: XQXZ/python-XB
# fp = urlopen("url")
# NOTE(review): `fp` must be an already-open binary PDF stream defined
# earlier (e.g. the urlopen above); it is not created in this snippet.

# Create a parser bound to the document stream (legacy pdfminer API).
parser = PDFParser(fp)

# Create the PDF document object that stores the document structure.
doc = PDFDocument()

# Link the parser and the document object to each other.
parser.set_document(doc)
doc.set_parser(parser)

# Initialize the document with an empty password.
doc.initialize("")

# Shared resource manager (fonts, images, ...).
resource = PDFResourceManager()

# Layout-analysis parameters.
laparam = LAParams()

# Aggregator device that collects layout objects for each page.
device = PDFPageAggregator(resource, laparams=laparam)

# Interpreter that executes each page's content stream into the device.
interpreter = PDFPageInterpreter(resource, device)

# Iterate over the document's pages.
for page in doc.get_pages():
    # Render the page; results accumulate in `device`.
    interpreter.process_page(page)
예제 #14
0
    def get_transaction_list(self, address):
        """Parse an HDFC bank-statement PDF and return its transactions.

        The PDF is decomposed into per-character text fragments with
        bounding boxes, re-grouped into rows/columns using the layout
        ranges in the module-level ``hdfc_parsing_spec``, and finally
        converted into transaction dicts.

        Args:
            address: Filesystem path of the statement PDF.

        Returns:
            A list of dicts with keys 'date', 'chqNo', 'balance',
            'narration' and 'amount' (negative for debits).
        """

        def _join_entries(entries):
            # Concatenate text fragments left-to-right, inserting a space
            # whenever a fragment does not start where the previous one
            # ended (i.e. there is a visible horizontal gap).
            joined = ''
            prev = entries[0]['bbox'][0]
            for entry in entries:
                if abs(prev - entry['bbox'][0]) < 1e-3:
                    joined += entry['text']
                else:
                    joined += ' ' + entry['text']
                prev = entry['bbox'][2]
            return joined

        def _finalize(txn):
            # Collapse any still-unjoined fragment lists in `txn` into
            # plain strings (an empty list becomes an empty string).
            for key, val in txn.items():
                if isinstance(val, list):
                    txn[key] = _join_entries(val) if val else ''

        # --- extract every character with its bounding box ---
        fp = open(address, 'rb')

        # Parser / document / shared-resource plumbing for pdfminer.
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        # Page aggregator yields LT layout objects instead of raw text.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # fetch_chars records per-character text/bbox entries into
            # self.extracted_text (see its implementation on this class).
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox):
                    self.fetch_chars(lt_obj)
                if isinstance(lt_obj, LTTextLine):
                    self.fetch_chars(lt_obj)

            self.page_num += 1

        fp.close()

        # Re-index characters by page and by row (y coordinate) so each
        # row can later be read left-to-right.
        remapped = defaultdict(lambda: defaultdict(list))

        for char_text in self.extracted_text:
            remapped[char_text['page_num']][char_text['bbox'][1]].append(char_text)

        overall_end_marker = hdfc_parsing_spec['transactions']['overall_end_row']

        transactions = []

        current_transaction = {
            'txn_date': '',
            'txn_msg': '',
            'debit_amount': '',
            'credit_amount': '',
            'running_balance': ''
        }

        for page_num in range(1, len(remapped) + 1):
            # Rows sorted top-to-bottom (PDF y coordinates grow upward).
            dataset = sorted(remapped[page_num].items(), key=lambda x: -x[0])

            # Per-page start/end markers, falling back to the defaults.
            page_conf = hdfc_parsing_spec['transactions']['page_conf']
            conf = page_conf[page_num] if page_num in page_conf else page_conf['default']
            start_marker = conf['start_row']
            end_marker = conf['end_row']

            has_started = False
            constructed_string = ''

            for row in dataset:
                row_sorted = sorted(row[1], key=lambda x: x['bbox'][0])
                constructed_string = _join_entries(row_sorted)

                if end_marker in constructed_string:
                    break

                if constructed_string == overall_end_marker:
                    # Statement end: flush the transaction in progress.
                    _finalize(current_transaction)
                    transactions.append(current_transaction)
                    break

                if has_started:
                    if len(constructed_string) >= 8:
                        try:
                            curr_date = datetime.datetime.strptime(
                                constructed_string[:hdfc_parsing_spec['transactions']['date_format']['length']],
                                hdfc_parsing_spec['transactions']['date_format']['date_string']
                            )

                            # A row starting with a date begins a new
                            # transaction; flush the previous one first.
                            if current_transaction['txn_msg']:
                                _finalize(current_transaction)
                                transactions.append(current_transaction)

                            current_transaction = {
                                'txn_date': curr_date,
                                'txn_msg': [],
                                'debit_amount': [],
                                'credit_amount': [],
                                'running_balance': [],
                                'cheque_no': []
                            }
                        except ValueError:
                            # Row does not start with a date: it is a
                            # continuation line of the current transaction.
                            pass

                    # Bucket each fragment into a column by x position.
                    for entry in row_sorted:
                        for key, val in hdfc_parsing_spec['transactions']['cols_conf'].items():
                            if val[0] <= entry['bbox'][0] <= val[1]:
                                current_transaction[key].append(entry)

                if constructed_string == start_marker:
                    has_started = True

            if constructed_string == overall_end_marker:
                break

        # Sanity-check: reconstruct the balance before the first
        # transaction, then replay every debit/credit against it.
        initial_balance = (
                self.parse_float(transactions[0]['running_balance']) +
                self.parse_float(transactions[0]['debit_amount']) -
                self.parse_float(transactions[0]['credit_amount'])
        )

        for txn in transactions:
            new_balance = initial_balance - self.parse_float(txn['debit_amount']) + self.parse_float(
                txn['credit_amount'])

            if abs(new_balance - self.parse_float(txn['running_balance'])) > 1E-6:
                # Mismatch: report it but keep going.
                print(new_balance)
                print(self.parse_float(txn['running_balance']))
                print('>>> Date: {:%d-%b-%Y}, Msg: {}, Debit: {}, Credit: {}, Balance: {}'.format(
                    txn['txn_date'],
                    txn['txn_msg'],
                    txn['debit_amount'],
                    txn['credit_amount'],
                    txn['running_balance']
                ))

            initial_balance = new_balance

        # Convert to the caller-facing schema: credits positive, debits
        # negative.
        transaction_list = []
        for transaction in transactions:
            if transaction['credit_amount'] == '':
                amount = -1 * float(transaction['debit_amount'].replace(',', ''))
            else:
                amount = float(transaction['credit_amount'].replace(',', ''))
            transaction_list_single = {
                'date': transaction['txn_date'],
                'chqNo': '',
                'balance': float(transaction['running_balance'].replace(',', '')),
                'narration': transaction['txn_msg'],
                'amount': amount
            }
            transaction_list.append(transaction_list_single)

        print(len(transaction_list))
        return transaction_list
예제 #15
0
def parse(_path):
    """Fetch a PDF report from the URL ``_path``, extract its text and
    append the domestic/international air-cargo figures to pdf_val.txt.
    """
    # fp = open(_path, 'rb')  # local-file alternative: open in binary mode
    # Pick a random User-Agent from the module-level list for the request.
    request = Request(url=_path,
                      headers={'User-Agent': random.choice(user_agent)
                               })
    fp = urlopen(request)

    # Build a PDF parser from the response stream.
    praser_pdf = PDFParser(fp)

    # Legacy pdfminer API: create the document, then cross-link it with
    # the parser.
    doc = PDFDocument()
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)

    # Initialize with an empty password (document is not encrypted).
    doc.initialize()

    # Abort if the document forbids text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Shared resource manager, layout parameters, page aggregator and
        # interpreter — the standard pdfminer extraction pipeline.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process the document one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)

            # layout is an LTPage holding the page's parsed objects
            # (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...).
            layout = device.get_result()

            for out in layout:
                # Keep only horizontal text boxes (images etc. have no
                # get_text()).
                if isinstance(out, LTTextBoxHorizontal):

                    results = out.get_text()
                    with open(r'pdf_val.txt', 'a') as f:
                        if "运输完成情况" in results:
                            # Lines 10 and 12 of the split block hold the
                            # domestic / international tonnage figures.
                            target_value = results.split("\n")
                            inland_amount = target_value[10]
                            foreign_amount = target_value[12]
                            print("国内货邮运输量:", inland_amount, "国际货邮运输量:",
                                  foreign_amount)
                            f.write("国内货邮运输量:" + inland_amount + ",国际货邮运输量:" +
                                    foreign_amount + "\n")
                            # Fixed: removed redundant f.close() — the
                            # with-block already closes the file.
예제 #16
0
def text_from_pdf(pdf_path, authors):
    """Extract text from a PDF and try to locate author affiliations.

    Args:
        pdf_path: Path of the PDF file to read.
        authors: Iterable of author records where ``author[1]`` is the
            author name passed to ``get_last_name``.

    Returns:
        Tuple ``(extracted_text, affiliations)`` where ``affiliations``
        maps each matched last name to an affiliation string.
    """
    extracted_text = ""
    affiliations = dict()
    author_lastnames = set()
    author_lastname_pattern = ""
    for author in authors:
        lastname = get_last_name(author[1])
        print("Lastname:", lastname)
        author_lastnames.add(lastname)
        # Build an alternation matching any author last name (trailing
        # '|' is stripped below).
        author_lastname_pattern += lastname + r"\s*,?\s*|"
    author_lastname_pattern = "(" + author_lastname_pattern[:-1] + ")"
    print("Author lastname pattern:", author_lastname_pattern)
    # Create a PDF parser object associated with the file object.
    infp = open(pdf_path, "rb")
    parser = PDFParser(infp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    rsrcmgr = PDFResourceManager(caching=True)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Strip everything but alphanumerics, whitespace, '.', '@'.
                tmp_text = re.sub(r"[^A-Za-z0-9 \s\.@]", "", lt_obj.get_text())
                author_count = 0
                for lastname in author_lastnames:
                    if re.search(lastname, tmp_text):
                        author_count += 1
                if author_count > 0:
                    # Block mentioning author names: what follows the
                    # names is assumed to be the affiliation text.
                    print("Author text block: ", tmp_text)
                    tmp_pattern = author_lastname_pattern + "{" + str(
                        author_count) + "}(.*)$"
                    affiliation_block = re.search(tmp_pattern, tmp_text,
                                                  re.DOTALL)
                    if affiliation_block:
                        print("Groups: ", affiliation_block.groups())
                        for lastname in author_lastnames:
                            if lastname not in affiliations and re.search(
                                    lastname, tmp_text):
                                if (len(affiliation_block.groups()) >
                                        author_count):
                                    affiliations[lastname] = re.sub(
                                        r"\s+", " ",
                                        affiliation_block.group(author_count +
                                                                1))
                                else:
                                    affiliations[lastname] = re.sub(
                                        r"\s+", " ", tmp_text)
                extracted_text += tmp_text + "\n"
    # BUG FIX: the cleanup, temp-file write and return were indented
    # inside the page loop, so only the FIRST page was ever processed;
    # they now run once after all pages.
    infp.close()
    device.close()
    if os.path.exists("working/temp"):
        os.remove("working/temp")
    # NOTE(review): temp_path is defined elsewhere in this module.
    outfp = open(temp_path, "w", encoding="utf-8")
    outfp.write(extracted_text)
    outfp.close()
    #os.remove(temp_path)
    return (extracted_text, affiliations)
예제 #17
0
    def convert_to_txt(self, data):
        """Convert the document at path ``data`` to a sibling ``.txt`` file.

        Supported extensions: .pdf, .docx, .doc, .xlsx, .xls; any other
        extension falls through and returns None implicitly.

        Args:
            data: Path of the input document. In the .pdf branch the name
                is rebound to the extracted text, so ``fname`` keeps the
                original path.

        Returns:
            The output filename: basename-only for .pdf, full
            path-minus-extension for the other formats.
        """
        extension = os.path.splitext(data)[1].lower()
        if extension == '.pdf':
            # NOTE(review): `file()` is a Python 2 builtin; this branch
            # needs open() on Python 3.
            fp = file(data, 'rb')
            fname = data
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.

            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
                # `data` is rebound to the text accumulated so far.
                data = retstr.getvalue()

            with open(os.path.splitext(fname)[0] + '.txt',
                      'wb') as output_file:
                output_file.write(data)

            return os.path.splitext(os.path.split(fname)[1])[0] + '.txt'

        elif extension == '.docx':
            document = docx.Document(data)
            # Paragraph text plus the text of every table cell.
            text = [paragraph.text for paragraph in document.paragraphs]
            text.extend([
                cell.text for table in document.tables for row in table.rows
                for cell in row.cells
            ])
            # Keep printable ASCII characters only.
            with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
                output_file.write('\n'.join(
                    [c for x in text for c in x if 32 <= ord(c) <= 127]))

            return os.path.splitext(data)[0] + '.txt'

        elif extension == '.doc':
            # doc2text (defined elsewhere) yields the document characters.
            with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
                output_file.write("\n".join(
                    [x for x in doc2text(data) if 32 <= ord(x) <= 127]))

            return os.path.splitext(data)[0] + '.txt'

        elif extension == '.xlsx':
            # NOTE(review): openpyxl.Workbook(data) creates a NEW workbook;
            # reading an existing file requires openpyxl.load_workbook().
            # Also ws.rows yields row tuples, not cells — confirm this
            # branch was ever exercised.
            wb = openpyxl.Workbook(data)
            text = [cell.value for ws in wb for cell in ws.rows]
            with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
                output_file.write("\n".join(
                    [x for x in text if 32 <= ord(x) <= 127]))

            return os.path.splitext(data)[0] + '.txt'

        elif extension == '.xls':
            wb = xlrd.open_workbook(data)
            # Flatten every cell value of every sheet.
            text = [
                cell.value for ws in wb.sheets() for row in ws.get_rows()
                for cell in row
            ]
            with open(os.path.splitext(data)[0] + '.txt', 'wb') as output_file:
                output_file.write("\n".join([str(x) for x in text]))

            return os.path.splitext(data)[0] + '.txt'
예제 #18
0
def get_paper_info_from_pdf(data):
    """Extract paper metadata and full text from raw PDF bytes.

    Args:
        data: The PDF file content as a byte buffer.

    Returns:
        Dict that may contain 'authors' (list of str), 'title', 'doi',
        and always contains 'extracted_text'.
    """
    fp = BytesIO(data)
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure
    # (legacy pdfminer API) and cross-link it with the parser.
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize
    doc.initialize()
    # Extract the metadata: scan every xref trailer for an 'Info' entry.
    # BUG FIX: pre-initialize `info` — a document with no Info trailer
    # previously raised UnboundLocalError at the `if info:` check below.
    info = None
    for xref in doc.xrefs:
        info_ref = xref.trailer.get('Info')
        if info_ref:
            info = resolve1(info_ref)

    paper_info = {}
    if info:
        authors = info.get('Author')
        if authors:
            # Try the common author-separator conventions in turn.
            if ';' in authors:
                author_list = authors.split(';')
            elif ' AND ' in authors:
                author_list = authors.split(' AND ')
            elif ',' in authors:
                #FIXME: This cuts 'LastName, FirstName' in two...
                author_list = authors.split(',')
            else:
                author_list = [authors]

            paper_info['authors'] = author_list
        title = info.get('Title')
        if title:
            # Some PDFs have the doi as a title
            if title.lower().startswith('doi:'):
                paper_info['doi'] = title[4:]
            else:
                paper_info['title'] = title

        #TODO: Additional metadata?
        #TODO: What about embedded BibTeX (as done by JabRef)?

    # Extract the full text into an in-memory buffer.
    rsrcmgr = PDFResourceManager()
    content = cStringIO.StringIO()
    device = TextConverter(rsrcmgr,
                           content,
                           codec='utf-8',
                           laparams=LAParams())
    process_pdf(rsrcmgr, device, fp, check_extractable=True, caching=True)

    paper_info['extracted_text'] = content.getvalue()

    if 'doi' not in paper_info:  # Try to find a DOI in the text
        doi = p_doi.search(paper_info['extracted_text'])
        if doi is not None:
            doi = doi.group(1)
            log_debug('Found a DOI: %s' % doi)
            paper_info['doi'] = doi

    device.close()
    content.close()

    log_debug('Exctracted paper_info from PDF: %s' % paper_info)

    return paper_info
예제 #19
0
def read_pdf(file, range_set, output_file):
    """Parse a packing-list PDF, bucketing text boxes into columns by
    their x coordinate.

    Args:
        file: Path of the PDF to read.
        range_set: Key into the module-level ``xrange`` table selecting
            the x ranges for the units/cases/total columns.
        output_file: Open writable file; receives a positional dump of
            every text box for debugging.

    Returns:
        Tuple of lists: (sku, fnsku, pieces, units, cases, total).
    """
    import re  # hoisted: was re-imported inside the text-box loop

    sku = []
    fnsku = []
    pieces = []
    units = []
    cases = []
    total = []
    print(file)
    fp = open(file, 'rb')
    # Standard pdfminer pipeline: resource manager, layout params,
    # page aggregator, interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                # Page is 595 W x 842 H pixels (A4 at 72 dpi).
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
                output_file.write('At %r is text: %s' % ((x, y), text))

                # SKU column; the piece count is the digits after the
                # first '-' in the SKU string.
                if int(x) in range(35, 50):
                    if len(text) >= 6:
                        sku.append(text)
                        seg = text.split('-')
                        pieces.append(re.sub("[^0-9]", "", seg[1]))

                # FNSKU column: last 11 chars starting with 'X0' or 'B0'.
                if int(x) in range(85, 120):
                    if text[-11:][0:2] == 'X0' or text[-11:][0:2] == 'B0':
                        fnsku.append(text[-11:])

                # units / cases / total columns, located via the x ranges
                # configured in the module-level `xrange` table.
                if int(x) in range(xrange[range_set][0][0],
                                   xrange[range_set][0][1]):
                    units.append(text)

                if int(x) in range(xrange[range_set][1][0],
                                   xrange[range_set][1][1]):
                    cases.append(text)

                if int(x) in range(xrange[range_set][2][0],
                                   xrange[range_set][2][1]):
                    total.append(text)
    fp.close()
    return sku, fnsku, pieces, units, cases, total
예제 #20
0
파일: main.py 프로젝트: saonam/pdf2xml
def main(args):
    """Convert the PDF at args[1] into XML written to args[2] (Python 2)."""
    input_file = args[1]
    output_file = args[2]
    print args
    fp = open(input_file, 'rb')
    #with open('test.pdf','wb') as s:
    #    s.write(urllib2.urlopen(f).read())
    #fp = open('test.pdf', 'rb')
    filename = os.path.split(input_file)[1].split('.')[0]
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    laparams = LAParams()
    laparams.detect_vertical = True
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Skeleton XML tree that make_xml() (defined elsewhere) fills in.
    root = ET.Element('xml')
    intro = ET.SubElement(root, 'intro')
    title = ET.SubElement(root, 'title')
    subtitle = ET.SubElement(root, 'subtitle')
    body = ET.SubElement(root, 'body')
    section = ET.SubElement(root, 'section')
    body.text = intro.text = title.text = subtitle.text = section.text = ' '
    tree = ET.ElementTree(root)
    # NOTE(review): 'filename' is assigned above before this 'global'
    # declaration — a SyntaxError on Python 3 (warning on Python 2).
    global fonts, layout, images_list, filename
    fonts = Counter([])
    images_list = []
    all_objs = []
    for n, page in enumerate(PDFPage.create_pages(document)):
        #if n <> 0: continue
        print n
        objs = []
        objs_r = objs_l = ''
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        print n, layout.width, layout.height, layout.width / (layout.height *
                                                              1.0)

        # collecting objects from the all pages, sorting them by their Y coordinate
        objs.append(
            get_objects(layout)
        )  #sorted( get_objects(layout),key=lambda x:x.y0,reverse=True)     )
        objs = objs[0]  #sum(objs,[])
        # Drop objects inside the top/bottom 5% margins (headers/footers).
        objs = [
            i for i in objs if layout.height * 0.05 <= i.y0 <= layout.height -
            layout.height * 0.05
        ]
        all_objs.append(objs)
        fonts += get_fonts(objs)
    for n, objs in enumerate(all_objs):
        #determines if page is actually has two pages
        #if layout.width/(layout.height*1.0) > 0.8:
        # print 'aaa'
        #    objs_l = [i for i in sorted(objs,key = lambda x:x.x0) if i.x0<= layout.width/2]
        #    objs_r = [i for i in sorted(objs,key = lambda x:x.x0) if i.x0>  layout.width/2]
        tree = make_xml(objs, n, tree)
        print 'end'
    # Round-trip the tree through BeautifulSoup to re-escape entities.
    soup = BeautifulSoup(
        HTMLParser().unescape(
            ET.tostring(tree, encoding='unicode', method='xml')).replace(
                '&', '&amp;').replace(' >', '&gt;').replace('< ', '&lt;'),
        'xml')
    with open(output_file, 'wb') as f:
        f.write(str(soup.prettify().encode('utf-8')))
예제 #21
0
def extractpdf(ta_from, endstatus, **argv):
    ''' Try to extract text content of a PDF file to a csv.
        You know this is not a great idea, right? But we'll do the best we can anyway!
        Page and line numbers are added to each row.
        Columns and rows are based on the x and y coordinates of each text element within tolerance allowed.
        Multiple text elements may combine to make one field, some PDFs have every character separated!
        You may need to experiment with x_group and y_group values, but defaults seem ok for most files.
        Output csv is UTF-8 encoded - The csv module doesn't directly support reading and writing Unicode
        If the PDF is just an image, all bets are off. Maybe try OCR, good luck with that!
        Mike Griffin 14/12/2011
    '''
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTContainer, LTText, LTTextBox
    import csv

    class CsvConverter(TextConverter):
        # pdfminer device that writes each laid-out page as CSV rows.
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def receive_layout(self, ltpage):
            # Called by pdfminer once per page with the laid-out LTPage.

            # recursively get every text element and it's coordinates
            def render(item):
                if isinstance(item, LTContainer):
                    for child in item:
                        render(child)
                elif isinstance(item, LTText):
                    (unused1, unused2, x, y) = item.bbox

                    # group the y values (rows) within group tolerance
                    for v in yv:
                        if y > v - y_group and y < v + y_group:
                            y = v
                    yv.append(y)

                    # Negative y keys make sorted() iterate top-to-bottom.
                    line = lines[int(-y)]
                    line[x] = item.get_text().encode('utf-8')

            from collections import defaultdict
            lines = defaultdict(lambda: {})

            yv = []
            render(ltpage)

            lineid = 0
            for y in sorted(lines.keys()):
                line = lines[y]
                lineid += 1
                csvdata = [ltpage.pageid,
                           lineid]  # first 2 columns are page and line numbers

                # group the x values (fields) within group tolerance
                p = 0
                field_txt = ''
                for x in sorted(line.keys()):
                    gap = x - p
                    if p > 0 and gap > x_group:
                        csvdata.append(field_txt)
                        field_txt = ''
                    field_txt += line[x]
                    p = x
                csvdata.append(field_txt)
                csvout.writerow(csvdata)
            if lineid == 0:
                # No text at all on this page: probably a scanned image.
                raise botslib.InMessageError(
                    _('PDF text extraction failed, it may contain just image(s)?'
                      ))

    #get some optional parameters
    x_group = argv.get('x_group',
                       10)  # group text closer than this as one field
    y_group = argv.get('y_group',
                       5)  # group lines closer than this as one line
    password = argv.get('password', '')
    quotechar = argv.get('quotechar', '"')
    field_sep = argv.get('field_sep', ',')
    escape = argv.get('escape', '\\')
    charset = argv.get('charset', 'utf-8')
    # csv doubles the quote char only when no escape char is configured.
    if not escape:
        doublequote = True
    else:
        doublequote = False

    try:
        pdf_stream = botslib.opendata_bin(ta_from.filename, 'rb')
        ta_to = ta_from.copyta(status=endstatus)
        # NOTE(review): unicode() is Python 2 only — this module targets
        # the Python 2 bots framework.
        tofilename = unicode(ta_to.idta)
        csv_stream = botslib.opendata_bin(tofilename, 'wb')
        csvout = csv.writer(csv_stream,
                            quotechar=quotechar,
                            delimiter=field_sep,
                            doublequote=doublequote,
                            escapechar=escape)

        # Process PDF
        rsrcmgr = PDFResourceManager(caching=True)
        device = CsvConverter(rsrcmgr, csv_stream, codec=charset)
        process_pdf(rsrcmgr,
                    device,
                    pdf_stream,
                    pagenos=set(),
                    password=password,
                    caching=True,
                    check_extractable=True)

        device.close()
        pdf_stream.close()
        csv_stream.close()
        filesize = os.path.getsize(botslib.abspathdata(tofilename))
        ta_to.update(
            statust=OK, filename=tofilename,
            filesize=filesize)  # update outmessage transaction with ta_info;
        botsglobal.logger.debug(_('        File written: "%(tofilename)s".'),
                                {'tofilename': tofilename})
    except:
        # Boundary handler: log the traceback and re-raise as the
        # framework's InMessageError.
        txt = botslib.txtexc()
        botsglobal.logger.error(
            _('PDF extraction failed, may not be a PDF file? Error:\n%(txt)s'),
            {'txt': txt})
        raise botslib.InMessageError(
            _('PDF extraction failed, may not be a PDF file? Error:\n%(txt)s'),
            {'txt': txt})
예제 #22
0
    def run(filepath):
        """Extract the text of the PDF at ``filepath`` into
        ``pdfparser/minute_store/minutes.txt``.

        Trimmed-down variant of pdfminer's pdf2txt driver; interface and
        output are unchanged. Removed the unused ``import getopt`` and
        dead local options.
        """
        # Debug flag propagated to pdfminer internals (0 = off).
        debug = 0
        # Input options.
        password = ''
        pagenos = set()
        maxpages = 0  # 0 = no page limit
        # Output options.
        outfile = 'pdfparser/minute_store/minutes.txt'
        outtype = 'text'
        imagewriter = None
        rotation = 0
        stripcontrol = False
        codec = 'utf-8'
        caching = True
        laparams = LAParams()

        PDFDocument.debug = debug
        PDFParser.debug = debug
        CMapDB.debug = debug
        PDFPageInterpreter.debug = debug

        rsrcmgr = PDFResourceManager(caching=caching)

        if outfile:
            outfp = open(outfile, 'w')
        else:
            outfp = sys.stdout
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            # Unreachable with the hard-coded outtype above; kept for
            # parity with the original pdf2txt driver.
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)

        fp = open(filepath, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # Apply the (zero by default) extra rotation before rendering.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()

        device.close()
        outfp.close()

        print("Converted PDF to Text")
        return
def lambda_handler(event, context):
    """Extract XML from a PDF newly uploaded to S3 and fan out via SNS.

    Triggered by an S3 ObjectCreated event on the bucket's "pdf" prefix:
    downloads the PDF to /tmp, converts it to pdfminer XML, uploads the XML
    to the same bucket under "xml/", then publishes the XML's S3 key to the
    "stock_data_extracted" SNS topic (which triggers the next Lambda,
    get_recommended_stocks).

    Relies on module-level names not visible here: ``s3`` (boto3 resource),
    ``boto3``, ``json``, ``re``, ``os``, ``datetime``, ``BytesIO`` and the
    pdfminer classes -- confirm against the file's imports.

    Args:
        event: S3 event record from Lambda.
        context: Lambda context object (unused).
    """
    # Python 3 replacement for the removed urllib.unquote_plus; S3 keys
    # arrive URL-encoded.
    from urllib.parse import unquote_plus

    # Grab file that was just uploaded to S3 bucket's "pdf" directory.
    bucket = event['Records'][0]['s3']['bucket']['name']
    s3_new_arrived_filename = unquote_plus(
        event['Records'][0]['s3']['object']['key'])
    print('Reading file ' + s3_new_arrived_filename + ' from S3')
    extracted_results_from_pdf = '/tmp/extract.xml'
    downloaded_pdf_file = '/tmp/input.pdf'
    # Download the PDF into /tmp (the only writable path in Lambda).
    s3.meta.client.download_file(bucket, s3_new_arrived_filename,
                                 downloaded_pdf_file)
    print('Downloaded file ' + s3_new_arrived_filename + ' from S3')

    # Extract pdf into xml and upload xml to S3 bucket's "xml" directory.
    resource_mgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = XMLConverter(resource_mgr, retstr, codec=codec, laparams=laparams)
    maxpages = 0     # 0 == no page limit
    caching = True
    pagenos = set()  # empty == all pages
    # file() was removed in Python 3 -- use open(), and close deterministically.
    with open(downloaded_pdf_file, 'rb') as infile_pdf_fp:
        interpreter = PDFPageInterpreter(resource_mgr, device)
        for page in PDFPage.get_pages(infile_pdf_fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password='',
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)

    data = retstr.getvalue()  # xml data (encoded bytes) extracted from pdf
    device.close()
    retstr.close()

    # Write xml (extracted from pdf) to a new file.
    print('Opening file ' + extracted_results_from_pdf +
          ' to write extracted xml from ' + s3_new_arrived_filename)
    # XMLConverter wrote encoded bytes into the BytesIO, so the output file
    # must be opened in binary mode (text mode would raise TypeError on py3).
    with open(extracted_results_from_pdf, 'wb') as outfile_xml_fp:
        print('Opened file ' + extracted_results_from_pdf)
        outfile_xml_fp.write(data)
        # pdfminer has a bug wherein it misses out the last </pages> tag in the
        # extracted xml. Hence, adding this last tag manually.
        # Bug reported: https://github.com/euske/pdfminer/issues/229
        outfile_xml_fp.write(b"</pages>")
    # Strip any folder prefix and the extension to build the XML object key.
    filename_without_folderprefix_and_ext = re.sub(
        r'.*/', '',
        os.path.splitext(s3_new_arrived_filename)[0])
    extracted_xml_filename_in_s3 = 'xml/' + filename_without_folderprefix_and_ext + '.xml'
    s3.meta.client.upload_file(extracted_results_from_pdf, bucket,
                               extracted_xml_filename_in_s3)

    # Publish to "StockDataExtracted" SNS topic. Send location of newly
    # extracted XML in S3 in the message. This topic triggers the next lambda
    # function - get_recommended_stocks.
    message = {"topten_trader_xml_filepath": extracted_xml_filename_in_s3}
    sns_client = boto3.client('sns', region_name='us-east-1')
    sns_response = sns_client.publish(
        TargetArn='arn:aws:sns:us-east-1:<aws_account_#>:stock_data_extracted',
        Message=json.dumps({'default': json.dumps(message)}),
        Subject='Stock Buy Recommendations ' + str(datetime.date.today()),
        MessageStructure='json')
예제 #24
0
    def getEntityPDFJson(self):
        #searchable = isSearchablePDF()
        counter = 1

        print("PDF File")
        fp = open(self.filename, 'rb')
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        extracted_text = ''
        isEnglish = True
        relationsList = []
        uniqueEntities = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:

                if isinstance(lt_obj, LTTextBox) or isinstance(
                        lt_obj, LTTextLine):

                    if (self.lang not in "eng"):
                        extracted_text = Translator.translate_(
                            lt_obj.get_text(), self.credentials)
                    else:
                        extracted_text = lt_obj.get_text()

                    n1 = extracted_text.replace("\t", " ")
                    n2 = n1.replace("\r", "")
                    n3 = n2.replace("\n", "")
                    finaltext = n3.replace("\u00a0", "")
                    doc = nlp(finaltext)
                    paras = extracted_text.split("\n\n")
                    for p in paras:

                        for line in p.split("\n"):
                            output = self.relationExtractor.annotate(
                                line,
                                properties={
                                    "annotators":
                                    "tokenize,ssplit,pos,depparse,natlog,openie",
                                    "outputFormat": "json",
                                    "openie.triple.strict": "true",
                                    "openie.max_entailments_per_clause": "1"
                                })
                            if (output != None):
                                if (len(output["sentences"]) > 0):
                                    result = [
                                        output["sentences"][0]["openie"]
                                        for item in output
                                    ]
                                    # print(len(result))
                                    for i in result:
                                        for rel in i:
                                            relationSent = rel['subject'], rel[
                                                'relation'], rel['object']
                                            print(relationSent)
                                            relation = RelationTriple(
                                                rel['subject'], rel['object'],
                                                rel['relation'],
                                                self.document_url)
                                            relationsList.append(relation)
                                            del relation

                    for X in doc.ents:
                        if X.text != ('\n') and X.label_ not in (
                                'ORDINAL', 'CARDINAL', 'NORP',
                                'Non-­‐binding'):

                            if (self.isAlreadyThere(uniqueEntities,
                                                    X.text) == False):
                                self.listEntities.append(X.text + ",")
                                uniqueEntities.append(X.text)

        print(self.listEntities)
        print("Relations Count:" + str(len(relationsList)))
        self.insertRelation(relationsList)

        return self.listEntities
예제 #25
0
def extract(pdf_path):
    """Extract tender metadata fields from an e-procurement tender PDF.

    Pulls the full text out of *pdf_path* with pdfminer, slices out each
    field (notice number, type, category, amounts, dates, authority info,
    ...) with targeted regexes, prints each field, and appends the row to
    ``tenderauto.xlsx``.

    NOTE(review): relies on module-level names not visible here -- ``op``
    (presumably ``openpyxl``), ``url`` and ``kurl`` -- confirm they are
    defined where this function lives. Every ``re.search(...).group()`` will
    raise AttributeError if the pattern is absent from the PDF text.

    Args:
        pdf_path: Path to the tender PDF to parse.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for pg in PDFPage.get_pages(fh,
                                    caching=True,
                                    check_extractable=True):
            page_interpreter.process_page(pg)
        text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
        print(text)

        # Tender notice number: between "Number" and "Tender".
        tn = re.search(r'Reference.*?Tender', text).group()
        tn1 = re.search(r'Number.*?Tender', tn).group()
        tn1 = tn1.replace('Number', '')
        tn1 = tn1.replace('Tender', '')
        print('Tender notice no :-', tn1)

        # Tender type.
        ty = re.search(r"Reference.{200}", text).group()
        tenderty = re.search(r"Type.*?Form", ty).group().replace("Type", '').replace("Form", '').replace(" ", '')
        print("Tender Type :-", tenderty)

        # Tender category.
        tenderc = re.search(r"Category.*?No", text).group().replace("Category", '').replace("No", '').replace(" ", '')
        print("Tender Category :-", tenderc)

        # Earnest money deposit (EMD).
        emd = re.search(r"Amount.*?EMD", text).group()
        ernest = re.search(r"₹.*?E", emd).group().replace('E', '').replace(' ', '')
        print("Earnest Money :- ", ernest)

        # Published date.
        pd = re.search(r'Published.*?Bid', text).group()
        publishdate = re.search(r'Date.*?Bid', pd).group().replace("Date", '').replace("Bid", '')
        print("Published date :-", publishdate)

        # Tender value.
        tv = re.search(r'Value.*?Product', text).group()
        tenderval = re.search(r'₹.*?P', tv).group().replace('P', '')
        print("Tender value :-", tenderval)

        # Tender fee.
        tf = re.search(r'Total.*?Payable', text).group()
        tf1 = re.search(r"Tender.*?P", tf).group()
        tenderfee = re.search(r"₹.*?P", tf1).group().replace('P', '').replace("Fee", '')
        print('Tender fee :-', tenderfee)

        # Authority name.
        an = re.search(r'Authority.*?Address', text).group()
        authname = re.search(r'Name.*?Address', an).group().replace("Name", '').replace("Address", '')
        print("Authority name :-", authname)

        # Authority address.
        ad = re.search(r'Authority.*?Back', text).group()
        authadd = re.search(r'Address.*?Back', ad).group().replace('Address', '').replace('Back', '')
        print("Authority Address :-", authadd)

        # Tender location.
        loc = re.search(r'Period.*?Pincode', text).group()
        location = re.search(r'Location.*?Pincode', loc).group().replace('Location', '').replace('Pincode', '')
        print("Tender Location :-", location)

        # Bid submission start date.
        bsd = re.search(r'Submission.*?Bid', text).group()
        bidstart = re.search(r'Date.*?Bid', bsd).group().replace("Date", '').replace("Bid", '')
        # Fixed message typo: "Bis" -> "Bid".
        print("Bid submission start date :-", bidstart)

        # Bid submission end date.
        bed = re.search(r'Submission.*?Tender', text).group()
        bidend = re.search(r'End.*?Tender', bed).group().replace('End', '').replace('Tender', '').replace('Date', '')
        print("Bid submission end date :-", bidend)

        # Bid opening date.
        bod = re.search(r'Published.{130}', text).group()
        bod1 = re.search(r'Bid.*?Document', bod).group()
        bidopen = re.search(r'Date.*?Document', bod1).group().replace('Date', '').replace('Document', '')
        print('Bid opening date :-', bidopen)

        # Tender title.
        tt = re.search(r'EMD.*?NDA', text).group()
        tt1 = re.search(r'Title.*Work', tt).group()
        tt1 = tt1.replace('Title', '')
        tt1 = tt1.replace('Work', '')
        print("Tender Title :-", tt1)

        # Work description.
        wd = re.search(r'EMD.*?NDA', text).group()
        workdes = re.search(r'Work.*?NDA', wd).group()
        workdes = workdes.replace('Work', '')
        workdes = workdes.replace('NDA', '')
        print("Work description :-", workdes)

        # Project state and country are fixed for this workflow.
        prostate = "Maharastra"
        print("Project state :-", prostate)
        country = "India"
        print("Tender Country", country)

        # Product category.
        pcat = re.search(r'Product.*?Sub', text).group()
        pcat = pcat.replace('Product', '')
        pcat = pcat.replace('Category', '')
        pcat = pcat.replace('Sub', '')
        print("Product Catagory :-", pcat)

        # Document sale start date.
        dsd = re.search(r'Sale.{100}', text).group()
        dsd1 = re.search(r'Date.*?Document', dsd).group()
        dsd1 = dsd1.replace('Date', '')
        dsd1 = dsd1.replace('Document', '')
        print("Document sale start date :-", dsd1)

        # Document sale end date.
        ded = re.search(r'Sale.{100}', text).group()
        ded1 = re.search(r'End.*?Clarification', ded).group()
        ded2 = re.search(r'Date.*?Clarification', ded1).group()
        ded2 = ded2.replace('Date', '')
        ded2 = ded2.replace('Clarification', '')
        print('Document sale end date  :-', ded2)

        # Product name: match the title against a fixed catalogue.
        # (Renamed from `list`, which shadowed the builtin.)
        product = "N.A"
        known_products = ["Flowers", "High Security Registration Plates",
                          "R.O.Plant", "SOLAR STREET LIGHT AND SOLAR PUMP",
                          "White LED"]
        for candidate in known_products:
            if candidate in tt1:
                product = candidate

        # Append the header row plus this tender's data row to the workbook.
        wb = op.load_workbook('tenderauto.xlsx')
        ws = wb.active

        ws.append(['Tender Notice NO', 'Tender type', 'Product category', 'Authority Name', 'Project State', 'EMD', 'Tender Value', 'Tender Country',
                   'Contact Email ID', 'Authority website', 'Tender Title', 'Tender Description', 'Bid Open Date', 'Phone no',
                   'Fax no', 'Document sale end date', 'Product name', 'Tender publish date', 'Tender document url'])
        wb.save(filename='tenderauto.xlsx')

        data = [tn1, tenderty, pcat, authname, prostate, ernest, tenderval, country, "", url, tt1, workdes, bidopen, "NA", "NA", ded2, product, publishdate, kurl]
        ws.append(data)
        wb.save(filename='tenderauto.xlsx')
예제 #26
0
    def mine(self):
        """Run pdfminer layout analysis over every page and classify objects.

        For each page: collects text boxes (bucketed by horizontal position),
        groups figures with their nearest caption text box, and reconstructs
        tables / "Algorithm" listings from long horizontal rule lines.
        Accumulates into ``self.layouts``, ``self.num`` and ``self.elem``;
        ``self.page_num`` counts processed pages.

        Relies on module-level helpers/types not visible here:
        ``object_type``, ``elem_type``, ``positionClassifier``,
        ``posGroups_settings``, ``rectDistance``, ``notFound``,
        ``mergedText``, ``figure``, ``figureGroup``, ``Table``, ``equal``,
        ``midPoint``, ``MIN_LENTH`` -- confirm against the rest of the file.

        Raises:
            PDFTextExtractionNotAllowed: if the PDF forbids extraction.
        """
        # PDFMiner document / interpreter setup.
        parser = PDFParser(self.fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.create_pages(document)
        index = 0
        for page in pages:
            interpreter.process_page(page)
            layout = device.get_result()
            self.layouts.append(layout)
            self.page_num += 1

            # Per-page object tallies and element buckets.
            counter = {x: 0 for x in object_type}
            page_height = layout.bbox[3]
            page_width = layout.bbox[2]
            page_elem = {x: [] for x in elem_type}

            figure_groups = {}  # caption text -> figureGroup
            textBoxs = {'left': [], 'middle': [], 'right': [], 'all': []}
            line_groups = []    # candidate tables built from rule lines

            for x in layout:
                if (isinstance(x, LTTextBox)):
                    # Bucket text boxes by column position for caption search.
                    textBoxs[positionClassifier(x, page_width)].append(x)
                    textBoxs['all'].append(x)
                    counter['TextBox'] += 1
                elif (isinstance(x, LTFigure)):
                    # Ignore degenerate (almost zero-height) figures.
                    if (x.bbox[3] - x.bbox[1] < 5):
                        continue
                    #find figure title (the nearest textbox)
                    #the numbers of figures and textboxs are not so big
                    minn = 100000000
                    title = None
                    posGroups = posGroups_settings[positionClassifier(
                        x, page_width)]
                    for group in posGroups:
                        for y in textBoxs[group]:
                            # Caption candidates must sit below the figure top,
                            # not start with '(' and be at least MIN_LENTH long.
                            if (x.bbox[1] < y.bbox[3] or y.get_text()[0] == '('
                                    or len(y.get_text()) < MIN_LENTH):
                                continue
                            dst = rectDistance(x, y)
                            if (dst < minn):
                                minn = dst
                                title = y
                    if (title == None):
                        title = notFound()
                    #merge the near textBoxs
                    title_merged = mergedText(title, textBoxs['all'])
                    title_text = title_merged.text
                    # Figures sharing the same caption merge into one group.
                    if title_text in figure_groups:
                        add_sub = figure(x)
                        figure_groups[title_text].addFigure(add_sub)
                    else:
                        new_fig = figure(x)
                        new_group = figureGroup(new_fig)
                        new_group.setTitle(title_merged)
                        figure_groups[title_text] = new_group
                        page_elem['Figure'].append(new_group)
                    counter['Figure'] += 1

                elif (isinstance(x, LTLine)):
                    #horizontal lines
                    # Only long horizontal rules (> 1/6 of the page width)
                    # count as potential table borders; rules sharing the same
                    # x-extent are grouped into one candidate table.
                    if (equal(x.bbox[1], x.bbox[3])
                            and x.bbox[2] - x.bbox[0] > page_width / 6):
                        flag = False
                        for lineGroup in line_groups:
                            if (equal(lineGroup.bbox[0], x.bbox[0])
                                    and equal(lineGroup.bbox[2], x.bbox[2])):
                                lineGroup.addLine(x)
                                flag = True
                                break
                        if (flag == False):
                            new_group = Table(x)
                            line_groups.append(new_group)
                    counter['Line'] += 1
                counter['Object'] += 1

            # Fold this page's tallies into the document-level totals.
            for x in object_type:
                self.num[x] += counter[x]

            #find tables
            for table in line_groups:
                # Need at least two rules and a reasonable width.
                if (table.lineNum < 2 or table.bbox[2] - table.bbox[0] < 50):
                    continue
                text = self.text_in_rect(table, textBoxs['all'])
                if (len(text) != 0):
                    # Ruled blocks whose first text says "Algorithm" are
                    # pseudo-code listings, not tables.
                    if (re.search("Algorithm", text[0].get_text())):
                        page_elem['Algorithm'].append(table)
                        continue
                #split lines into groups
                # divided_text[i] holds the text boxes between rule i and i+1.
                divided_text = [[] for i in range(1, len(table.lines))]

                for t in text:
                    mid_y = midPoint(t)[1]
                    for i in range(1, len(table.lines)):
                        if (mid_y > table.lines[i][1]):
                            divided_text[i - 1].append(t)
                            break
                split_tables = []
                prev_i = 0
                for i in range(len(table.lines) - 1):
                    if (len(divided_text[i]) == 0):
                        continue
                    # A single wide text row (> 1/4 of the table width) is
                    # treated as a caption separating two stacked tables.
                    if (len(divided_text[i]) == 1
                            and divided_text[i][0].bbox[2] -
                            divided_text[i][0].bbox[0] > 1 / 4 *
                        (table.bbox[2] - table.bbox[0])):
                        #split the table
                        new_table = Table()
                        new_table.setLines(table.lines[prev_i:i + 1])
                        prev_i = i + 1
                        split_tables.append(new_table)
                #the last table
                new_table = Table()
                new_table.setLines(table.lines[prev_i:len(table.lines)])
                split_tables.append(new_table)

                # Attach the nearest text box above each split table as title.
                for split_table in split_tables:
                    minn = 100000000
                    title = None
                    posGroups = posGroups_settings[positionClassifier(
                        split_table, page_width)]
                    for group in posGroups:
                        for y in textBoxs[group]:
                            #the title is above the table
                            if (split_table.bbox[3] > y.bbox[1]):
                                continue
                            dst = rectDistance(table, y)
                            if (dst < minn):
                                minn = dst
                                title = y
                    if (title == None):
                        title = notFound()
                    title_merged = mergedText(title, textBoxs['all'])
                    split_table.setTitle(title_merged)
                    page_elem['Table'].append(split_table)
            #assert index!=1
            for x in elem_type:
                self.elem[x].append(page_elem[x])
            index += 1
예제 #27
0
    """
    """
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]

    if isinstance(layout_obj, LTContainer):
        boxes = []
        for child in layout_obj:
            boxes.extend(find_textboxes_recursively(child))
        return boxes

    return []


# Build the pdfminer layout-analysis pipeline (vertical text enabled).
rsrcmgr = PDFResourceManager()
aggregator = PDFPageAggregator(rsrcmgr,
                               laparams=LAParams(detect_vertical=True))
page_interp = PDFPageInterpreter(rsrcmgr, aggregator)

# Dump every text box of the PDF named on the command line, page by page.
with open(sys.argv[1], 'rb') as pdf_file:
    for pdf_page in PDFPage.get_pages(pdf_file):
        page_interp.process_page(pdf_page)
        page_layout = aggregator.get_result()

        # Reading order: top-to-bottom, then left-to-right.
        text_boxes = sorted(find_textboxes_recursively(page_layout),
                            key=lambda box: (-box.y1, box.x0))

        for text_box in text_boxes:
            print('-' * 10)
            print(text_box.get_text().strip())
예제 #28
0
def main(argv):
    """CLI entry point: convert PDF files to text/html/xml/tag output.

    Mirrors pdfminer's pdf2txt tool. Parses getopt-style flags from *argv*,
    configures layout analysis and the requested output converter, then
    renders every page of each input file into the output stream.

    Args:
        argv: Full argument vector, argv[0] being the program name.

    Returns:
        100 on usage error (bad flags, no input files, unknown output type),
        None on success.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # NOTE(review): -n sets laparams to None; a later -A/-V/-M/-L/-W/-F flag
    # would then raise AttributeError. Preserved as the original behaved.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level onto pdfminer's class-level flags.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output file extension when -t is absent.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # file() was removed in Python 3 -- use open().
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # file() was removed in Python 3 -- use open().
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # Apply any user-requested extra rotation before rendering.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
예제 #29
0
    def __init__(self,
                 file,
                 merge_tags=('LTChar', 'LTAnno'),
                 round_floats=True,
                 round_digits=3,
                 input_text_formatter=None,
                 normalize_spaces=True,
                 resort=True,
                 parse_tree_cacher=None,
                 laparams=None,
                 password=''):
        """Open a PDF and prepare pdfminer layout parsing.

        Args:
            file: File-like object or filesystem path of the PDF.
            merge_tags: pdfminer element tags merged during tree building.
            round_floats: Whether to round coordinates in the parse tree.
            round_digits: Number of digits kept when rounding.
            input_text_formatter: Optional callable applied to extracted
                text; defaults to whitespace collapsing when
                *normalize_spaces* is true.
            normalize_spaces: Collapse runs of whitespace to single spaces.
            resort: Re-sort elements after parsing.
            parse_tree_cacher: Optional cache object with set_hash_key();
                defaults to a no-op DummyCache.
            laparams: dict of LAParams kwargs or an LAParams instance;
                defaults to {'all_texts': True, 'detect_vertical': True}.
            password: PDF password (used by newer pdfminer only).

        Raises:
            TypeError: If *file* is neither a file object nor a path string.
        """
        # The default was moved out of the signature: a mutable default
        # argument is shared across all calls. Behavior for callers who
        # omit the argument is unchanged.
        if laparams is None:
            laparams = {'all_texts': True, 'detect_vertical': True}

        # store input
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # set up input text formatting function, if any
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif normalize_spaces:
            r = re.compile(r'\s+')
            self.input_text_formatter = lambda s: re.sub(r, ' ', s)
        else:
            self.input_text_formatter = None

        # open doc
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")

        parser = PDFParser(file)
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser, password)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # as of pdfminer==20140328, "PDFDocument.initialize() method is
            # removed and no longer needed."
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        if parse_tree_cacher:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)
        else:
            self._parse_tree_cacher = DummyCache()

        # set up layout parsing
        rsrcmgr = PDFResourceManager()
        # isinstance instead of type() comparison; accepts dict or LAParams.
        if isinstance(laparams, dict):
            laparams = LAParams(**laparams)
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

        # caches
        self._pages = []
        self._pages_iter = None
        self._elements = []
예제 #30
0
파일: read_pdf.py 프로젝트: xiao-chen/zobt
# Prefer the vendored copy of pdfminer sitting next to this script over any
# installed version.
sys.path = [
    os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "./pdfminer"))
] + sys.path
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar

fp = open("hb.pdf", "rb")  # open the pdf file
parser = PDFParser(fp)  # create a pdf parser from the file object
doc = PDFDocument(parser)  # create the pdf document
rs = PDFResourceManager()  # resource manager for shared resources

# Create the pdf device (page aggregator) object.
lapara = LAParams()
device = PDFPageAggregator(rs, laparams=lapara)
inte = PDFPageInterpreter(rs, device)

# Process the content of each page in the document:
# iterate PDFPage.create_pages(doc) to get the page list; each iteration
# renders one page and `layout` is an LTPage holding the objects parsed
# from it -- typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal,
# etc. To get the text, read the object's text attribute.

for page in PDFPage.create_pages(doc):
    inte.process_page(page)
    layout = device.get_result()