def parse(pdf_path):
    """Extract horizontal text from a PDF and append the cleaned text to
    '<pdf basename>.txt', merging lines that pdfminer split but that belong
    to the same sentence/paragraph.

    Uses the module-level tolerance ``eps`` when comparing line heights, and
    the module helpers ``is_chinese``, ``match_pattern`` and ``spe_pun_drop``.
    Returns None; output is written as a side effect.
    """
    global eps  # height-comparison tolerance shared across the module
    # Output file is named after the PDF itself (extension swapped for .txt).
    key = pdf_path.split('/')[-1]
    print('extracting from ', key)
    # fix: the file handle used to be leaked; 'with' guarantees it is closed.
    with open(pdf_path, 'rb') as fp:
        # Wire up the (legacy-API) pdfminer parser/document pair.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with an empty password; skip encrypted documents.
        try:
            doc.initialize()
        except PDFEncryptionError:
            return
        # Skip documents that do not allow text extraction.
        if not doc.is_extractable:
            return  # raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Counters for pages, images, curves, figures and horizontal text boxes.
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
        for page in doc.get_pages():
            num_page += 1
            # Pending unpunctuated sentence carried between text boxes on this page.
            pre_sent = {'text': '', 'height': 0, 'left': 0, 'width': 0}
            pre_flag = False
            post_flag = False
            try:
                interpreter.process_page(page)
            except (KeyError, AssertionError, OSError):
                # pdfminer can fail on malformed pages; skip them.
                continue
            # Page separator in the output file.
            with open(key[:-4] + '.txt', 'a', encoding='utf-8') as f:
                f.write('\n\n')
            # Receive this page's LTPage object.
            layout = device.get_result()
            # Buckets of strings sharing the same line height / left edge; used
            # to re-join sentences that the PDF layout split across lines.
            text_dic_list = []
            for x in layout:
                if isinstance(x, LTImage):
                    num_image += 1
                if isinstance(x, LTCurve):
                    num_curve += 1
                if isinstance(x, LTFigure):
                    num_figure += 1  # fix: was 'num_figure += 11'
                if isinstance(x, LTTextBoxHorizontal):
                    num_TextBoxHorizontal += 1
                    # Take the first character of every char object in the box.
                    # (Dropped a dead inner loop that computed an unused 'ch'.)
                    results = ""
                    for i in x._objs:
                        for j in i._objs:
                            results += j._text[0]
                    # Prefer the height of a Chinese character on the first
                    # line; fall back to the box's average line height.
                    height = x._avg_lineheight
                    for gethei in range(len(x._objs[0]._objs)):
                        if is_chinese(x._objs[0]._objs[gethei]._text[0]):
                            height = x._objs[0]._objs[gethei].height
                    if match_pattern(results):  # passes the heuristic filter
                        nresults = spe_pun_drop(results)
                        inserted = False
                        # Main cleaning step: join sentences split across PDF
                        # lines by merging same-width/height strings of a page.
                        for item in text_dic_list:
                            if (abs(item['hide'] - height) < eps) and abs(item['left'] - x.x0) < 5 * height:
                                # Prepend any pending sentence with matching geometry.
                                if pre_flag and abs(pre_sent['height'] - height) < eps and pre_sent['width'] >= x.width - height * 5:
                                    nresults = pre_sent['text'] + nresults
                                # Add a newline between vertically separated paragraphs.
                                if (item['y0'] - x.y0) > 4 * height and nresults[0] != '\n':
                                    nresults = '\n' + nresults
                                item['text'] += nresults
                                if x.x0 > item['left']:
                                    item['left'] = x.x0
                                if x.y0 < item["y0"]:
                                    item["y0"] = x.y0
                                inserted = True
                                break
                        if not inserted:
                            # Still prepend a pending sentence when geometry matches.
                            if pre_flag and abs(pre_sent['height'] - height) < eps and pre_sent['width'] >= x.width - height * 5:
                                nresults = pre_sent['text'] + nresults
                            text_dic_list.append({
                                'hide': height,
                                'left': x.x0,
                                'width': x.width,
                                'text': nresults,
                                "y0": x.y0
                            })
                        pre_flag = False
                        post_flag = True
                    else:
                        fun_flag = False
                        # Handle the last line of a passage that carries no
                        # closing punctuation: append it to a matching bucket.
                        if post_flag == True:
                            nresults = spe_pun_drop(results)
                            for item in text_dic_list:
                                if (abs(item['hide'] - height) < eps
                                        and abs(item['left'] - x.x0) < height * 5):
                                    item['text'] += nresults
                                    fun_flag = True
                                    break
                            if fun_flag == False:  # consecutive unpunctuated lines
                                post_flag = False
                        if not fun_flag:
                            # Accumulate consecutive unpunctuated lines into pre_sent.
                            if pre_flag and abs(pre_sent['height'] - height) < eps and abs(pre_sent['left'] - x.x0) < height * 5:
                                pre_sent['text'] += spe_pun_drop(results)
                            else:
                                pre_sent['text'] = spe_pun_drop(results)
                                pre_sent['height'] = height
                                pre_sent['left'] = x.x0
                                pre_sent['width'] = x.width
                            pre_flag = True
            # Flush this page's buckets to the output file.
            for item in text_dic_list:
                with open(key[:-4] + '.txt', 'a', encoding='utf-8') as f:
                    f.write(item['text'] + '\n')
        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
              '曲线数:%s\n' % num_curve,
              '水平文本框:%s\n' % num_TextBoxHorizontal)
def parse_pdf(file_path, method='tika'):
    """
    Given a PDF file complete path, the function parses the file, counts the
    number of pages and checks if it is text-extractable.

    Parameters
    ----------
    file_path: string
        Complete path to output file.
    method: string
        Method used to extract the text: 'pdfminer', 'pypdf', 'tika'.

    Return
    ------
    extracted_text: string
        Text extracted from the document.
    number_of_pages:
        Number of pages of the document.

    Raises
    ------
    ValueError
        If `method` is not one of the supported extractors.
    """
    # fix: the branches used to be independent `if`s with the `else` bound
    # only to the tika check, so 'pdfminer'/'pypdf' logged a spurious
    # "method not found" and an unknown method crashed with NameError.
    if method == 'pdfminer':
        with open(file_path, "rb") as fp:
            # Create parser object to parse the pdf content
            pdf_parser = PDFParser(fp)
            # Store the parsed content in PDFDocument object
            document = PDFDocument(pdf_parser)
            # Check if document is text-extractable or not
            is_extractable = document.is_extractable
            # Check if document is extractable, if not abort
            if not is_extractable:
                raise PDFTextExtractionNotAllowed
            # PDFResourceManager stores shared resources such as fonts or images
            rsrcmgr = PDFResourceManager()
            # Set parameters for analysis
            laparams = LAParams()
            # Page aggregator device to get LT object elements
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Interpreter object to process page content from PDFDocument
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            extracted_text = ""
            number_of_pages = 0
            # Process PDF document page by page
            for page in PDFPage.create_pages(document):
                number_of_pages = number_of_pages + 1
                extracted_text += f"[Page {number_of_pages}]\n"
                # The interpreter processes the page stored in PDFDocument object
                interpreter.process_page(page)
                # The device renders the layout from interpreter
                layout = device.get_result()
                # Of the many LT objects we only keep LTTextBox and LTTextLine
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(
                            lt_obj, LTTextLine):
                        extracted_text += lt_obj.get_text()
    elif method == 'pypdf':
        with open(file_path, 'rb') as f:
            pdf = PdfFileReader(f)
            number_of_pages = pdf.getNumPages()
            extracted_text = ''.join([
                f'[Page {i}]\n' + pdf.getPage(i).extractText()
                for i in range(number_of_pages)
            ])
    elif method == 'tika':
        raw = parser.from_file(file_path)
        extracted_text = raw['content']
        number_of_pages = int(raw['metadata']['xmpTPg:NPages'])
    else:
        logging.error(f'Text extractor method {method} not found')
        # fix: fail loudly instead of hitting a NameError on the return below.
        raise ValueError(f'Text extractor method {method} not found')
    return extracted_text, number_of_pages
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Directory of conference PDFs to process.
filepath = 'C:/Users/lenovo/Desktop/ACL2020'
list1 = os.listdir(filepath)
# NOTE(review): list_words is never appended to in this chunk — presumably
# populated further down in the original script; verify before relying on it.
list_words = []
for i in range(len(list1)):
    outs = ""
    # Open each PDF in binary mode and wire up the pdfminer objects.
    # NOTE(review): fp is never closed in this chunk.
    fp = open(filepath + '/' + list1[i], 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Collect text from every layout object that exposes get_text().
            # Each new piece is PREPENDED, so text accumulates in reverse
            # order of encounter.
            if hasattr(out, 'get_text'):
                outs = out.get_text() + outs
    # Normalize: lowercase and strip newlines.
    outs = outs.lower().replace('\n', '')
    # NOTE(review): english_pu is unused in this chunk, and its last two
    # entries are the same character ('“' twice) — possibly '”' was intended.
    english_pu = ['’', '“', '“']
    # Remove all ASCII punctuation from the article in one translate() pass.
    punctuation_map = dict((ord(char), None) for char in string.punctuation)
    without_punctuation = outs.translate(punctuation_map)
    # Tokenize the article text into a list of words.
    raw_words = nltk.word_tokenize(
        without_punctuation)
    # Lemmatize each token with WordNet.
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]
    def get_signatures_from_pdf(self, path, year=''):
        """Parse a gazette-style PDF and attach signature sets to the
        regulations found on its first page.

        path: filesystem path of the PDF to parse.
        year: expected year as a string; a line containing a date of this
              year (or the previous one), or the 'Οι Υπουργοί' header,
              marks the start of a signature block.
        Returns the regulations list (from self.get_document_info) with a
        'signatures' key added per regulation, or None when the PDF has no
        pages or no regulations are found.
        """
        codec = 'utf-8'
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=laparams)
        # NOTE(review): fp is never closed in this function.
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        pages = PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True)
        # Materialize the page generator so it can be indexed and reversed.
        temp_pages = []
        for page in pages:
            temp_pages.append(page)
        if not temp_pages:
            return
        # The first page carries the document info / regulation list.
        first_page = temp_pages[0]
        interpreter.process_page(first_page)
        first_page_layout = device.get_result()
        regulations = self.get_document_info(first_page_layout)
        # Header lines that must not be treated as signer names.
        ignore_words = ['ΟI ΥΠΟΥΡΓΟI', 'ΤΑ ΜΕΛΗ', 'ΟΙ ΥΠΟΥΡΓΟΙ']
        if not regulations:
            return
        signature_sets = []
        # Start from the last page until all the required signature sets are found
        for page in reversed(temp_pages):
            # Get the page's layout
            interpreter.process_page(page)
            page_layout = device.get_result()
            # Split text to line's for easier parsing
            text_lines = self.text_from_layout_objects(page_layout).split("\n")
            # Boolean indicating whether we are currently in a signature set
            # Save the data found
            search_active = False
            persons = []
            names = []
            roles = []
            role = ""
            # NOTE(review): temp_name is assigned but never read.
            temp_name = ""
            for line in text_lines:
                line = line.strip()
                if search_active:
                    if self.is_break_point(line):
                        # Pair each collected name with its role; missing
                        # roles default to the empty string.
                        for index, name in enumerate(names):
                            current_role = roles[index] if index < len(
                                roles) else ""
                            persons.append({
                                'name': name,
                                'role': Helper.format_role(current_role)
                            })
                        # Continue searching at next point
                        role = ""
                        temp_name = ""
                        search_active = False
                        if persons:
                            signature_sets.append(persons)
                            persons = []
                        # Break if enough signature sets have been found. Otherwise we'll continue looking for
                        # more in the same page.
                        if len(signature_sets) == len(regulations):
                            break
                    normal_line = Helper.normalize_greek_name(line)
                    if normal_line in ignore_words:
                        continue
                    # Lines marked with '***' carry a signer name; any other
                    # line accumulates into the pending role text.
                    if '***' in line and normal_line:
                        if role:
                            roles.append(role)
                            role = ""
                        names.append(normal_line)
                    else:
                        role += line
                elif (year in line and Helper.date_match(year).match(line)) \
                        or (str(int(year) - 1) in line and Helper.date_match(str(int(year) - 1)).match(line)) \
                        or line == 'Οι Υπουργοί':
                    # A dated line (this or previous year) or the ministers
                    # header starts a signature block.
                    search_active = True
            # If the end of page has been reached we save the signatures
            if persons:
                signature_sets.append(persons)
            # When we find enough signature sets we stop parsing pages.
            if len(signature_sets) == len(regulations):
                break
        # Merge regulations and signature sets (sets were collected in
        # reverse page order, hence reversed() here).
        for index, signatures in enumerate(reversed(signature_sets)):
            if index >= len(regulations):
                return
            regulations[index]['signatures'] = signatures
        return regulations
"""open pdf file generate interpreter for each page->interpret it to text n save it create a txt file with saved data""" from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage from pdfminer.converter import TextConverter from pdfminer.layout import LAParams import io pdfpath = 'Downloads\\1pe17cs032_finalresume.pdf' pdf = open(pdfpath, 'rb') mem = io.StringIO() rm = PDFResourceManager() lp = LAParams() cnv = TextConverter(rm, mem, laparams=lp) ip = PDFPageInterpreter(rm, cnv) for i in PDFPage.get_pages(pdf): ip.process_page(i) text = mem.getvalue() file = open("Downloads\\1pe17cs032_finalresume.txt", 'wb') file.write(text.encode('utf-8')) print("done")
def get_data(setting) -> list:
    '''
    初期化
    '''
    # (Initialization.)
    # Layout Analysis parameters (vertical-text detection left at default here).
    laparams = LAParams()
    # Resource manager for shared resources (fonts, images).
    resource_manager = PDFResourceManager()
    # PageAggregator device that collects page layout objects.
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    # Interpreter that renders pages into the device.
    interpreter = PDFPageInterpreter(resource_manager, device)
    pdf_archive_dir = setup_pdf_archive_dir()
    # pdf data
    patient_datas_pdf = []
    patient_datas_old = []
    ret_data = []
    '''
    リスト取得
    '''
    # (Fetch the list.)
    befor_tb_avg = 10000  # (top + bottom)/2 of the previously seen box
    # Read and process each split PDF one by one.
    box_list = []
    for pdf_url in setting.pdf_urls:
        # Download the PDF into the archive directory.
        pdf_path = os.path.join(pdf_archive_dir, pdf_url.split('/')[-1])
        print(pdf_url)
        with urllib.request.urlopen(pdf_url) as u:
            with open(pdf_path, 'bw') as o:
                o.write(u.read())
        with open(pdf_path, 'rb') as f:
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)
                layout = device.get_result()
                # Collect the text boxes contained in the page.
                boxes = find_textboxes_recursively(layout)
                # Sort boxes by their top-left coordinate; y1 grows upward,
                # so it is negated to sort top-to-bottom.
                boxes.sort(key=lambda b: (-b.y1, b.x0))
                for box in boxes:
                    if is_skip(box.get_text()) is True:
                        # '#N/A' marks an invalid row: drop what was gathered.
                        if box.get_text().find('#N/A') != -1:
                            box_list = []
                        continue
                    temp_tb_avg = (box.y1 + box.y0) / 2
                    # A vertical jump of more than 15pt means a new table row:
                    # flush the boxes gathered for the previous row.
                    if 15 < befor_tb_avg - temp_tb_avg or befor_tb_avg - temp_tb_avg < -15:
                        box_list.sort(key=lambda b: (b.x0))
                        if len(box_list) == 0:
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        elif box_list[0].get_text().find(
                                '-1') != -1 or box_list[0].get_text().find(
                                    '○') != -1 or box_list[0].get_text().find(
                                        '(cid:16089)1') != -1:
                            # Rows whose first cell is a placeholder are dropped.
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        else:
                            temp_pd = patient_data(box_list)
                            temp_pd.parse_line()
                            if temp_pd.is_error is False:
                                patient_datas_pdf.append(temp_pd)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                            else:
                                print('error')
                                print(box_list)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                    befor_tb_avg = temp_tb_avg
                    box_list.append(box)
        # 前のPDFファイルで残されたデータの処理
        # (Process data left over from the previous PDF file.)
        # NOTE(review): placement of this flush relative to the pdf_url loop
        # is inferred from the comment text — confirm against the original.
        box_list.sort(key=lambda b: (b.x0))
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []
    # 最後でデータを処理
    # (Process the remaining data at the very end.)
    if len(box_list) == 0:
        befor_tb_avg = temp_tb_avg
        box_list = []
    else:
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            print(temp_pd.no)
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []
    # Rows were collected top-of-page first; reverse to restore order.
    patient_datas_pdf.reverse()
    # 閲覧不可になったデータの処理
    # (Process records that are no longer viewable online: patients
    # No. 1..12655 are loaded from an archived JSON file instead.)
    old_no_range = list(range(1, 12656))
    row_datas = []
    patient_datas_old = []
    with open(
            os.path.dirname(os.path.abspath(__file__)) + "/data/row_data.json",
            "r") as f:
        row_datas = json.load(f)
    for row_data in row_datas:
        if int(row_data['No']) not in old_no_range:
            continue
        temp_patient_data = patient_data()
        temp_patient_data.no = row_data['No']
        temp_patient_data.revealed_dt = dt.strptime(row_data['revealed_dt'],
                                                    '%Y-%m-%d')
        temp_patient_data.old = row_data['old']
        temp_patient_data.sex = row_data['sex']
        temp_patient_data.job = row_data['job']
        temp_patient_data.symptom = row_data['symptom']
        # Empty onset date means unknown.
        if row_data['appearance_dt'] == '':
            temp_patient_data.appearance_dt = None
        else:
            temp_patient_data.appearance_dt = dt.strptime(
                row_data['appearance_dt'], '%Y-%m-%d')
        # Statuses 1-4 are collapsed into status 7 for archived records.
        if row_data['status_id'] in [1, 2, 3, 4]:
            temp_patient_data.status_id = 7
        else:
            temp_patient_data.status_id = row_data['status_id']
        patient_datas_old.append(temp_patient_data)
    # Merge archived and freshly parsed records, sorted by patient number.
    patient_datas = patient_datas_old + patient_datas_pdf
    patient_datas_sorted = sorted(patient_datas, key=lambda x: int(x.no))
    for patient in patient_datas_sorted:
        ret_data.append(patient.export_dict())
    return ret_data
def uploaded_file(filename):
    # Parse a previously uploaded menu PDF ('UPLOAD_FOLDER/pdf_temp.pdf'),
    # detect categories, dishes and prices from text-box geometry, draw
    # bounding-box overlays, and return the final annotated PDF.
    # NOTE(review): the `filename` parameter is never used; all paths are the
    # literal string 'UPLOAD_FOLDER/...', not a configured folder variable.
    # Read file
    pdf = pdfquery.PDFQuery('UPLOAD_FOLDER/pdf_temp.pdf')
    pdf.load()
    # Save xml tree
    pdf.tree.write('UPLOAD_FOLDER/test.xml', pretty_print=True)
    pq_items = pdf.pq('LTTextBoxVertical, LTTextLineHorizontal')
    items = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    # Collect one row per text element: name plus bounding-box coordinates.
    for pq in pq_items:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid
        cur_str_item = str(pq.layout)
        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]], columns=['name', 'x0', 'x1', 'y0', 'y1'])
        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']
        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])
        tmp_items['page_num'] = page_num
        # NOTE(review): DataFrame.append is deprecated in modern pandas;
        # pd.concat is the replacement.
        items = items.append(tmp_items, ignore_index=True)
    # PDF converted to DF
    items = items.sort_values(['page_num', 'x0', 'y1'],
                              ascending=[True, True, False])
    items.reset_index(inplace=True, drop=True)
    # Height distribution: the most common large height is assumed to be the
    # category font size (cat_h); the next one down is the item size (item_h).
    heights = pd.crosstab(index=items["height"], columns="count")
    heights = heights[heights['count'] > 1]
    cat_h = round3(max(heights[heights['count'] >= min_dish_count].index.values))
    tmp = heights[heights['count'] >= min_dish_count].index.values
    item_h = round3(max(tmp[tmp < cat_h]))
    # Plot all boxes
    pdf_boundary_boxes(
        df=items,
        path_input='UPLOAD_FOLDER/pdf_temp.pdf',
        path_output='UPLOAD_FOLDER/temp.pdf',
        r=50, g=0, b=100)
    ######################## Get categories ####################################
    # Category candidates: boxes whose height is within 1% of cat_h.
    cat_list = items[items['height'].between(0.99 * cat_h, 1.01 * cat_h)]
    # Mean character width statistics of category boxes.
    cat_char_w = cat_list.apply(lambda row: mean_char(row['width'], row['name']),
                                axis=1).median()
    cat_char_w_max = cat_list.apply(lambda row: mean_char(row['width'], row['name']),
                                    axis=1).max()
    # Collapse rows with cat
    cat_list = collapse_rows(cat_list, sense=1.03)
    cat_list = cat_list.sort_values(['page_num', 'y1', 'x0'],
                                    ascending=[True, False, True])
    # Drop whitespace-only entries.
    filter = cat_list["name"] != ' '
    cat_list = cat_list[filter]
    cat_list = cat_list.reset_index(drop=True)
    # Draw categories boxes
    pdf_boundary_boxes(df=cat_list,
                       path_input='UPLOAD_FOLDER/pdf_temp.pdf',
                       show_height=False,
                       show_number=True,
                       path_output='UPLOAD_FOLDER/temp1.pdf')
    #################### Get items ###############################################
    # Item candidates: boxes whose height is within 1% of item_h.
    items_list = items[items['height'].between(0.99 * item_h, 1.01 * item_h)]
    items_list = items_list.reset_index(drop=True)
    items_list = collapse_rows(items_list)
    # Delete empty items
    filter = items_list["name"] != ' '
    items_list = items_list[filter]
    items_list = items_list.reset_index(drop=True)
    # Get dishes: anything that is NOT purely digits/dots/slashes.
    patternDel = "^[0-9 \. \/]+$"
    filter = items_list['name'].str.contains(patternDel)
    dishes_list = items_list[~filter]
    dishes_list = dishes_list.reset_index(drop=True)
    # Dishes to layout
    pdf_boundary_boxes(
        df=dishes_list,
        path_input="UPLOAD_FOLDER/temp1.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes.pdf",
        show_height=False,
        r=0, g=0, b=230)
    # Get prices: item boxes that are not dishes.
    prices_list = items_list[~items_list.name.isin(dishes_list.name)]
    prices_list = prices_list.reset_index(drop=True)
    # Prices to layout
    pdf_boundary_boxes(
        df=prices_list,
        path_input="UPLOAD_FOLDER/temp_dishes.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        show_height=False,
        r=230, g=0, b=0)
    ################################# Second algo ###################################
    # Re-parse the PDF with pdfminer directly to refine category names.
    # NOTE(review): fp is never closed in this function.
    fp = open('UPLOAD_FOLDER/pdf_temp.pdf', 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)
    # Show new structure
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                # NOTE(review): x, y, text are overwritten each iteration and
                # never used; after the loop only the LAST page's layout is kept.
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
    # Get cat-s (Only for 1 page)
    cat_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    for lobj in layout:
        if isinstance(lobj, LTTextBox):
            # First line of the text box, with its bounding box.
            x0, y1, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text().split("\n")[0]
            x1, y0 = lobj.bbox[2], lobj.bbox[1]
            # Match pdfminer boxes against pdfquery categories by y1 (±3%).
            tmp = cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)].copy()
            tmp['name'] = text
            if len(cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name']) > 0:
                # Only keep entries whose text differs from the first match.
                if (text != cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name'].values[0]):
                    tmp['x0'] = x0
                    cat_n = cat_n.append(tmp, ignore_index=True)
    # Re-draw new layout with cat-s
    pdf_boundary_boxes(
        df=cat_n,
        show_height=False,
        show_number=True,
        path_input="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        path_output="UPLOAD_FOLDER/temp_cat_n.pdf",
    )
    # Get prices laid out vertically.
    pq_items1 = pdf.pq('LTTextLineVertical')
    items1 = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    for pq in pq_items1:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid
        cur_str_item = str(pq.layout)
        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]], columns=['name', 'x0', 'x1', 'y0', 'y1'])
        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']
        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])
        tmp_items['page_num'] = page_num
        items1 = items1.append(tmp_items, ignore_index=True)
    items1 = items1.sort_values(['page_num', 'x0', 'y1'],
                                ascending=[True, True, False])
    items1.reset_index(inplace=True, drop=True)
    # Keep only digit-run entries (vertical price columns).
    patternDel = '^ *\d[\d ]*$'
    filter = items1['name'].str.contains(patternDel)
    items1 = items1[filter]
    items1 = items1.reset_index(drop=True)
    # Split each vertical digit run into per-character price boxes, dividing
    # the run's height evenly among its characters.
    prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1'])
    for i in range(0, len(items1)):
        big_prices = items1.iloc[i]['name']
        height_a = items1.iloc[i]['height'] / len(big_prices)
        tmp_len = len(big_prices)
        for j in range(0, tmp_len):
            # tmp_prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
            tmp_name = items1.iloc[i]['name'][j]
            y1_temp = items1.iloc[i]['y1'] - j * height_a
            y0_temp = items1.iloc[i]['y0'] + j * height_a
            x0, x1 = items1.iloc[i]['x0'], items1.iloc[i]['x1']
            tmp_prices_n = pd.DataFrame({
                'name': [tmp_name],
                'x0': x0,
                'x1': x1,
                'y0': y0_temp,
                'y1': y1_temp
            }, index=[0])
            prices_n = prices_n.append(tmp_prices_n, ignore_index=True)
    prices_n = prices_n.sort_values(['x0', 'y1'], ascending=[True, False])
    prices_n.reset_index(inplace=True, drop=True)
    # Draw new layout
    pdf_boundary_boxes(
        df=prices_n,
        path_input="UPLOAD_FOLDER/temp_cat_n.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices_n.pdf",
        show_height=False,
        r=230, g=0, b=0)
    # return 'Done'
    return send_from_directory(upload_path, 'temp_dishes_prices_n.pdf')
    def load_file_text(self, import_file):
        """ Import from file types of odt, docx pdf, epub, txt, html, htm.

        Extracts plain text from the file, then inserts it as a new entry in
        the 'source' table and in self.source, warning (via QMessageBox) on
        duplicates or failures.
        """
        text = ""

        # Import from odt
        if import_file[-4:].lower() == ".odt":
            text = self.convert_odt_to_text(import_file)
        # Import from docx
        if import_file[-5:].lower() == ".docx":
            #text = convert(importFile)  # uses docx_to_html
            document = opendocx(import_file)
            list_ = getdocumenttext(document)
            text = "\n".join(list_)
        # Import from epub
        if import_file[-5:].lower() == ".epub":
            book = epub.read_epub(import_file)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                #print(d.get_content())
                bytes_ = d.get_body_content()
                string = bytes_.decode('utf-8')
                text += html_to_text(string) + "\n"
        # import PDF
        if import_file[-4:].lower() == '.pdf':
            # NOTE(review): fp is never closed in this branch.
            fp = open(import_file, 'rb')  # read binary mode
            parser = PDFParser(fp)
            doc = PDFDocument(parser=parser)
            parser.set_document(doc)  # potential error with encrypted PDF
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                # Keep only the text-carrying layout objects.
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                        text += lt_obj.get_text()
        # import from html
        if import_file[-5:].lower() == ".html" or import_file[-4:].lower() == ".htm":
            # NOTE(review): importErrors is never incremented in this branch,
            # so the warning below always reports 0 (and uses camelCase while
            # the plain-text branch uses import_errors).
            importErrors = 0
            with open(import_file, "r") as sourcefile:
                fileText = ""
                while 1:
                    line = sourcefile.readline()
                    if not line:
                        break
                    fileText += line
                text = html_to_text(fileText)
            QtWidgets.QMessageBox.warning(None, _('Warning'), str(importErrors) + _(" lines not imported"))
        # Try importing as a plain text file.
        if text == "":
            import_errors = 0
            try:
                with open(import_file, "r") as sourcefile:
                    while 1:
                        line = sourcefile.readline()
                        if not line:
                            break
                        try:
                            text += line
                        except Exception as e:
                            #logger.debug("Importing plain text file, line ignored: " + str(e))
                            import_errors += 1
                    # NOTE(review): "\ufeff" is a single character, so
                    # text[0:6] (six characters) can only equal it for
                    # degenerate input, and text[6:] then drops six
                    # characters — BOM stripping is likely broken here.
                    if text[0:6] == "\ufeff":  # associated with notepad files
                        text = text[6:]
            except Exception as e:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    _("Cannot import ") + str(import_file) + "\n" + str(e))
                return
            if import_errors > 0:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    str(import_errors) + _(" lines not imported"))
                logger.warning(import_file + ": " + str(import_errors) + _(" lines not imported"))
        # import of text file did not work
        if text == "":
            # NOTE(review): `e` may be undefined here (NameError) when no
            # exception occurred above — Python 3 unbinds the except target.
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                _("Cannot import ") + str(import_file) + "\n" + str(e))
            return
        # Final checks: check for duplicated filename and update model, widget and database
        nameSplit = import_file.split("/")
        filename = nameSplit[-1]
        if any(d['name'] == filename for d in self.source):
            QtWidgets.QMessageBox.warning(None, _('Duplicate file'),
                _("Duplicate filename.\nFile not imported"))
            return
        entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None,
            'memo': "", 'owner': self.settings['codername'],
            'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        cur = self.settings['conn'].cursor()
        #logger.debug("type fulltext: " + str(type(entry['fulltext'])))
        cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
            (entry['name'], entry['fulltext'], entry['mediapath'], entry['memo'],
            entry['owner'], entry['date']))
        self.settings['conn'].commit()
        cur.execute("select last_insert_rowid()")
        id_ = cur.fetchone()[0]
        entry['id'] = id_
        self.parent_textEdit.append(entry['name'] + _(" imported."))
        self.source.append(entry)
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local) —
        either a filesystem path or an io.BytesIO object
    :return: iterator of string of extracted text (one item per page)
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/

    def _pages_text(fh):
        # Shared per-page extraction; the local and remote paths used to be
        # byte-for-byte duplicates of this loop.
        try:
            for page in PDFPage.get_pages(
                    fh,
                    caching=True,
                    check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            # Malformed PDF: stop yielding, matching the original behavior.
            return

    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            yield from _pages_text(fh)
    else:
        # extract text from remote pdf file (already an in-memory stream)
        yield from _pages_text(pdf_path)
def pdf2txt(self): ''' ============================= return : str, text File path ''' # input password = '' pagenos = set() maxpages = 0 # output imagewriter = None rotation = 0 codec = 'UTF-8' pageno = 1 scale = 1 caching = True showpageno = True laparams = LAParams() infp = open(self.input_path, "rb") if self.output_path == None: self.output_path = self.input_path[:-4] + '_trans.txt' outfp = open(self.output_path, "w", encoding='UTF8') else: outfp = open(self.output_path, "w", encoding='UTF8') #page total num parser = PDFParser(infp) document = PDFDocument(parser) page_total_num = resolve1(document.catalog['Pages'])['Count'] # rsrcmgr = PDFResourceManager(caching=caching) # pdf -> text converter device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter) # pdf -> text interpreter interpreter = PDFPageInterpreter(rsrcmgr, device) # pdf -> text start with tqdm(total=page_total_num) as pbar: for page in PDFPage.get_pages(infp, pagenos, maxpages, password=password, caching=caching, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 interpreter.process_page(page) pbar.update(1) print('[INFO] pdf -> text') outfp.close() infp.close() return self.output_path
# Layout Analysisのパラメーターを設定。縦書きの検出を有効にする。 laparams = LAParams(detect_vertical=False, word_margin=0.5, line_margin=.1, boxes_flow=0) # 共有のリソースを管理するリソースマネージャーを作成。 resource_manager = PDFResourceManager() # ページを集めるPageAggregatorオブジェクトを作成。 device = PDFPageAggregator(resource_manager, laparams=laparams) # Interpreterオブジェクトを作成。 interpreter = PDFPageInterpreter(resource_manager, device) # 出力用のテキストファイル output_txt = open('output.txt', 'w') def print_and_write(txt): print(txt) output_txt.write(txt) output_txt.write('\n') with open(sys.argv[1], 'rb') as f: # PDFPage.get_pages()にファイルオブジェクトを指定して、PDFPageオブジェクトを順に取得する。 # 時間がかかるファイルは、キーワード引数pagenosで処理するページ番号(0始まり)のリストを指定するとよい。 for page in PDFPage.get_pages(f):
def parse(_path):
    """Extract domestic/international freight volume figures from a CAAC
    statistics PDF and append them to 'pdf_val.txt'.

    Raises PDFTextExtractionNotAllowed when the document forbids text
    extraction, and propagates pdfminer errors for encrypted documents.
    """
    # fix: the PDF handle used to be leaked; 'with' guarantees it is closed.
    with open(_path, 'rb') as fp:  # binary read mode
        # Wire up the (legacy-API) pdfminer parser/document pair.
        praser_pdf = PDFParser(fp)
        doc = PDFDocument()
        praser_pdf.set_document(doc)
        doc.set_parser(praser_pdf)
        # Initialize with an empty password.
        doc.initialize()
        # Abort if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # layout is an LTPage holding LTTextBox, LTFigure, LTImage, ...
            layout = device.get_result()
            for out in layout:
                # Only horizontal text boxes carry the figures we need.
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    # print("results: " + results)
                    with open(r'pdf_val.txt', 'a') as f:
                        if "运输完成情况" in results:
                            # Lines 10 and 12 of this box hold the two figures.
                            target_value = results.split("\n")
                            inland_amount = target_value[10]
                            foreign_amount = target_value[12]
                            print("国内货邮运输量:", inland_amount,
                                  "国际货邮运输量:", foreign_amount)
                            f.write("国内货邮运输量:" + inland_amount +
                                    ",国际货邮运输量:" + foreign_amount + "\n")
                            # fix: dropped redundant f.close() inside 'with'.
                            break
def pdftotexts(filename):
    """Extract all text from *filename* (a PDF) and write it to
    '<filename minus .pdf>_text_generatedbymain.py.txt'."""
    path_to_pdf = filename  # Load your PDF
    # PDFResourceManager stores shared resources such as fonts or images
    # that we might encounter in the files.
    resource_manager = PDFResourceManager(caching=True)
    # String buffer that will contain the final text representation of the pdf.
    out_text = StringIO()
    # LAParams holds the layout-analysis parameters (defaults used here).
    laParams = LAParams()
    # TextConverter renders pages into out_text.
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    # fix: the PDF handle used to leak on exceptions; 'with' closes it.
    with open(path_to_pdf, 'rb') as fp:
        interpreter = PDFPageInterpreter(resource_manager, text_converter)
        # Process the content of each page of the original PDF file.
        for page in PDFPage.get_pages(fp,
                                      pagenos=set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    # Retrieve the buffered text before closing the StringIO object.
    text = out_text.getvalue()
    # Close the remaining resources we opened.
    text_converter.close()
    out_text.close()
    # fix: write with explicit UTF-8 so the output does not depend on the
    # platform's default encoding (could raise UnicodeEncodeError on Windows).
    with open(str(filename)[:-4] + "_text_generatedbymain.py.txt", 'w',
              encoding='utf-8') as f:
        f.write(text)
def parsePDF(fileLocation, politician, party, topic, useEmptyDB):
    """Parse a ritzau PDF export, extracting quotes by *politician*.

    Quotes and articles are appended to the CSV datasets under ``../out``
    (or re-built from the empty templates when *useEmptyDB* is true).

    :param fileLocation: path of the PDF export to parse.
    :param politician: full name of the politician of interest.
    :param party: party label stored with every quote.
    :param topic: topic label stored with quotes and articles.
    :param useEmptyDB: start from the empty dataset templates.
    """
    data = ''
    # Indications of fillers around quotes, and of non-article text.
    # (The original left both handles open.)
    with open('../Resources/quoteRelatedFillerWords.txt', 'r',
              encoding='utf-8') as f:
        quoteRelatedFillers = f.readline().split(',')
    with open('../Resources/nonArticleFlags.txt', 'r', encoding='utf-8') as f:
        nonArticleFlags = f.readline().split(',')
    quoteFillers = {'-', '»', '«'}
    wrongQuoteFlags = set()
    correctQuoteFlags = set()
    upcomingCorrectQuoteFlags = set()
    politicianLastName = politician.split(' ')[-1]
    # Generate quote fillers to be extracted, flags indicating a quote is by
    # another than the politician of interest, that the quote is by the
    # politician of interest or that an upcoming quote is of interest,
    # pairing quote fillers, pronouns and the name of the politician.
    for filler in quoteRelatedFillers:
        quoteFillers.update([
            ', ' + filler + '.*' + politician + '.*',
            ', ' + filler + ' hun.*', ', ' + filler + ' han.*',
            ', ' + filler + '.*' + politicianLastName + '.*'
        ])
        # Statement made by someone other than the given politician.
        wrongQuoteFlags.update([
            ', ' + filler + ' (?!.*' + politician + '|.*hun|.*han|.*' +
            politicianLastName + ').*'
        ])
        correctQuoteFlags.update([
            ', ' + filler + ' .*' + politician + '.*',
            ', ' + filler + ' .*' + politicianLastName + '.*'
        ])
        upcomingCorrectQuoteFlags.update([
            filler + '[ |,].*' + politician,
            politician + '[ |,].*' + filler,
            filler + '[ |,].*' + politicianLastName,
            politicianLastName + '[ |,].*' + filler
        ])
    # Extract the raw text of the whole PDF; handles are closed even on error.
    fp = open(fileLocation, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
        data = retstr.getvalue()
    finally:
        device.close()
        fp.close()
    # Removing null bytes, generated by "ft", "tf" and "ff".
    data = data.replace('\0', '')
    newArticle = True
    quotes = []
    articleTitle, articleText, date = '', '', ''
    quoteCount, articleCount, articleID = 0, 0, 0
    # Use empty dataset files
    if useEmptyDB:
        quoteDB = pd.read_csv('../out/empty_db/quote_db.csv', sep=';',
                              encoding='UTF-8', header=0)
        articleDB = pd.read_csv('../out/empty_db/article_db.csv', sep=';',
                                encoding='UTF-8', header=0)
    # Open already present dataset files
    else:
        quoteDB = pd.read_csv('../out/quote_db.csv', sep=';',
                              encoding='UTF-8', header=0)
        articleDB = pd.read_csv('../out/article_db.csv', sep=';',
                                encoding='UTF-8', header=0)
    # Continue from largest article and quote id, if the datasets are
    # already populated (max() is NaN on an empty column).
    maxArticleID = articleDB['articleID'].max()
    articleID = maxArticleID + 1 if not math.isnan(maxArticleID) else 1
    maxQuoteID = quoteDB['quoteID'].max()
    quoteID = maxQuoteID + 1 if not math.isnan(maxQuoteID) else 1
    quoteDicts = []
    articleDicts = []
    # Flags indicating special cases in which quotes might appear.
    correctUpcomingQuote = False
    quotesInALine = False
    # Isolate paragraphs.
    for paragraph in data.split('\n\n'):
        paragraph = paragraph.replace('\n', ' ')
        # Save article publication dates.
        if 'Id:' in paragraph:
            date = re.compile('\\w* \\d{2}, \\d{4}').search(paragraph).group()
            strpDate = datetime.strptime(date, '%B %d, %Y')
            date = strpDate.strftime('%m/%d/%Y')
        # Skip non-article text.
        if any(flag in paragraph for flag in nonArticleFlags) \
                or re.search('\\d+\\W\\d+\\W\\d{4}', paragraph) \
                or re.search('^\\d+/\\d+$', paragraph) \
                or paragraph == 'København':
            continue
        # Identify and extract quotes.
        if paragraph.startswith('- ') or paragraph.startswith('»'):
            paragraph = paragraph.replace('«', '')
            # Ignore quote if not from politician in question.
            wrongQuote = False
            correctQuote = False
            for wrongQuoteFlag in wrongQuoteFlags:
                if re.search(wrongQuoteFlag, paragraph):
                    wrongQuote = True
                    # Reset flag to avoid false positive indication of a
                    # quote of interest.
                    quotesInALine = False
            if not wrongQuote:
                # Identify whether quote is of interest.
                for correctQuoteFlag in correctQuoteFlags:
                    if re.search(correctQuoteFlag, paragraph):
                        correctQuote = True
                # Remove 'fillers' around quotes, such as
                # ', siger Martin Henriksen', and strip whitespace.
                if correctQuote or correctUpcomingQuote or quotesInALine:
                    quotesInALine = True
                    for quoteFiller in quoteFillers:
                        paragraph = re.sub(quoteFiller, '', paragraph)
                    paragraph = paragraph.strip()
                    quotes.append(paragraph)
        # Catch multiple quotes in a row, in question-answer chain.
        elif not paragraph.startswith('Spørgsmål: '):
            quotesInALine = False
            correctUpcomingQuote = False
            # Check if paragraph insinuates an upcoming quote of interest.
            for upcomingCorrectQuoteFlag in upcomingCorrectQuoteFlags:
                if re.search(upcomingCorrectQuoteFlag, paragraph):
                    correctUpcomingQuote = True
            if re.search(politician + '.*:$', paragraph.strip()):
                correctUpcomingQuote = True
            # Identify the title of the article.
            if newArticle and not re.search('\\d+\\W\\d+\\W\\d{4}', paragraph):
                articleTitle = paragraph
                newArticle = False
                continue
            # Construct article string from paragraphs, excluding
            # non-article paragraphs generated during PDF extraction.
            articleText += paragraph
            # End of article indicated with 'The client may distribute'.
            if paragraph.startswith('The client may distribute'):
                # Save quotes with info in quote dataset, incrementing
                # quote ID and quote count.
                for quote in quotes:
                    quoteDicts.append({
                        'quoteID': quoteID,
                        'quote': quote,
                        'politician': politician,
                        'date': date,
                        'party': party,
                        'articleID': str(articleID),
                        'topic': topic,
                        'fan': '',
                        'articleText': articleText
                    })
                    quoteCount += 1
                    quoteID += 1
                # Save article with articleID in article dataset, and
                # increment article ID and article count.
                articleDicts.append({
                    'articleID': articleID,
                    'topic': topic,
                    'articleTitle': articleTitle,
                    'articleText': articleText,
                    'mediaOutlet': 'ritzau'
                })
                articleID += 1
                articleCount += 1
                # Reset article text and quotes in article, and indicate
                # the start of a new article.
                newArticle = True
                quotes.clear()
                articleText = ''
    # Append newly parsed quotes and articles to the existing datasets.
    # DataFrame.append() was removed in pandas 2.0; pd.concat is the
    # supported equivalent.
    quoteDB = pd.concat([quoteDB, pd.DataFrame(quoteDicts)], sort=False)
    articleDB = pd.concat([articleDB, pd.DataFrame(articleDicts)], sort=False)
    # Remove quote and article duplicates.
    quoteDB.drop_duplicates(subset=['quote', 'politician'], inplace=True)
    articleDB.drop_duplicates(subset=['articleText'], inplace=True)
    print('Quotes for politician:', quoteCount,
          '\nArticles for politician:', articleCount,
          '\nTotal quotes:', len(quoteDB.index),
          '\nTotal articles:', len(articleDB.index))
    # Save updated databases (quoting=1 == csv.QUOTE_ALL).
    quoteDB.to_csv('../out/quote_db.csv', sep=';', encoding='UTF-8',
                   index=False, quoting=1)
    articleDB.to_csv('../out/article_db.csv', sep=';', encoding='UTF-8',
                     index=False, quoting=1)
def exportPDF(self, infile, outfile):
    """Render the PDF *infile* as plain text into *outfile*.

    Falls back to stdout when *outfile* is empty/None. stdout is never
    closed; an explicitly opened output file always is.

    :param infile: path of the PDF to convert.
    :param outfile: path of the text output, or a falsy value for stdout.
    """
    # Fixed conversion options (the command-line option parsing that used
    # to configure these was dead, commented-out code).
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    # BUG FIX: the original called the Python 2-only builtin `file()`,
    # which raises NameError on Python 3; use open() instead.
    if outfile:
        outfp = open(outfile, 'w')
        close_out = True
    else:
        outfp = sys.stdout
        close_out = False  # never close the interpreter's stdout
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(infile, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                # Apply the (currently zero) extra rotation per page.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
        if close_out:
            outfp.close()
    return
def flight_plan_reader(self):
    """Populate ``self.flight_plan_dic`` with parameters parsed from each
    flight-plan PDF.

    For every entry, the PDF at the last position of the entry's value list
    is converted to text; header parameters are sliced out of the last page
    by the character offsets of known header labels, and the entry's value
    is replaced by a dict of those parameters.
    """
    # Header labels expected on the plan's summary (last) page, in the
    # order they appear; slicing below relies on this ordering.
    plan_headers = [
        'Strip width', 'Lateral overlap', 'Run spacing', 'Forward overlap',
        'Photo base', 'Total length', 'Total lines', 'Total photos',
        'Planned By', 'Survey Time'
    ]
    for key in self.flight_plan_dic.keys():
        flight_plan_name = key
        # The PDF path is stored at the end of the entry's value list.
        file_path = self.flight_plan_dic[key][-1]
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_path, 'rb') as file:
            for page in PDFPage.get_pages(file,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
        # The watermark string separates the pages in the extracted text.
        page_split_output = text.split(
            ' UNCONTROLLED DOCUMENT WHEN PRINTED ')
        first_page = page_split_output[0]
        script = None
        params_index = OrderedDict()
        param_values = {}
        backup_sys = None
        if 'VQ' in self.flight_plan_dic[key][0]:
            param_values['System'] = 'VQ'
            backup_sys = 'VQ'
            # Script name lies between the first and last 'LiDAR' markers.
            script = first_page[first_page.find('LiDAR'):first_page.
                                rfind('LiDAR')]
            if len(script) == 0:
                # Fallback layout: 'Lidar:' ... 'Images:Collect images'.
                try:
                    script = first_page[
                        first_page.find('Lidar:'):first_page.
                        rfind('Images:Collect images')].split(':')[1]
                except IndexError:
                    script = None
        elif 'H68' in self.flight_plan_dic[key][0]:
            script = 'H68 Standard'
            param_values['System'] = 'Harrier'
            backup_sys = 'H68'
        if script is None:
            print(
                "You will need to enter flight plan info manually; non-standard setup found."
            )
        else:
            last_page = page_split_output[-1]
            # Record the character offset of each header on the last page
            # (str.find returns -1 when a header is absent).
            for param in plan_headers:
                params_index[param] = last_page.find(param)
            """ if last_page.find(param) == -1:
                    params_index[param] = None"""
            for i, param in enumerate(plan_headers):
                param_value = None
                if i < (len(plan_headers) - 1):
                    # Value is the text between this header and the next,
                    # after the last ':'.
                    next_param = plan_headers[i + 1]
                    index_current = params_index[param]
                    index_next = params_index[next_param]
                    cut_up_string = last_page[index_current:index_next]
                    param_value = cut_up_string.split(':')[-1].strip(' ')
                elif i == (len(plan_headers) - 1):
                    # Last header: value runs until the first '['.
                    index_current = params_index[param]
                    end = last_page.find('[')
                    cut_up_string = last_page[index_current:end]
                    param_value = cut_up_string.split(':')[-1].strip(' ')
                param_values[param] = param_value
        param_values['script'] = script
        param_values['initial_vals'] = self.flight_plan_dic[
            flight_plan_name]
        # NOTE(review): this overwrites the 'Harrier' value set above with
        # 'H68' (and 'VQ' with the identical 'VQ') — confirm which system
        # label downstream consumers expect.
        param_values['System'] = backup_sys
        self.flight_plan_dic[flight_plan_name] = param_values
def convert_pdf_to_text(pdf_path: Union[object, str], docketnum: str) -> str:
    """Takes path (or pathlib Path object) to a PDF file and a docketnum;
    returns the text inside the PDF and writes it to the extracted-text
    cache file for the docket.

    Returns an empty string when the PDF cannot be parsed.
    """
    # SET PATHS
    extracted_text_path = extracted_text_path_gen(dirs["extracted_text"],
                                                  docketnum)
    logging.info(f"Converting pdf to text for docket {docketnum}...")
    password = ""
    extracted_text = ""
    # Open and read the pdf file in binary mode. The `with` ensures the
    # handle is closed on every path — the original leaked it on the
    # parse-failure early return and on the not-extractable raise.
    with open(pdf_path, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in PDFDocument object
        try:
            document = PDFDocument(parser, password)
        except Exception as e:
            logging.error("Something went wrong during conversion")
            logging.exception(e)
            logging.info(f"Returning no extracted text for docket {docketnum}")
            return extracted_text
        # Check if document is extractable, if not abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()
        # set parameters for layout analysis
        laparams = LAParams()
        # Aggregator renders interpreted pages into layout objects
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only LTTextBox and LTTextLine objects carry text
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    with open(extracted_text_path, "wb") as fout:
        fout.write(extracted_text.encode("utf-8"))
    logging.info("Text extracted successfully")
    return extracted_text
def pdf_to_txt_miner(folder, password):
    """Convert every ``.pdf`` file in *folder* to a ``.txt`` file.

    Output files are written under the module-level ``txtpath`` directory,
    one per PDF, named after the PDF with a ``.txt`` suffix.

    :param folder: directory containing the PDF files.
    :param password: unused; documents are opened with an empty password.
    """
    # Collect the PDF files in the folder.
    files = os.listdir(folder)
    pdfFiles = [f for f in files if f.endswith('.pdf')]
    for pdfFile in pdfFiles:
        print(pdfFile)
        pdfPath = os.path.join(folder, pdfFile)
        # Target text path; append .txt if the name lacks the suffix.
        wdPath = os.path.join(txtpath, pdfFile)
        if wdPath[-4:] != '.txt':
            wdPath = wdPath + '.txt'
        # BUG FIX: the original opened `path + "/" + pdfFile` via a stray
        # global instead of the pdfPath computed above, and never closed
        # the handle.
        with open(pdfPath, 'rb') as fn:
            parser = PDFParser(fn)
            doc = PDFDocument()
            # Link parser and document to each other.
            parser.set_document(doc)
            doc.set_parser(parser)
            # Empty password for unencrypted documents.
            doc.initialize('')
            # Skip documents that forbid text extraction.
            if not doc.is_extractable:
                print('PDFTextExtractionNotAllowed')
                continue
            resource = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(resource, laparams=laparams)
            interpreter = PDFPageInterpreter(resource, device)
            num_page, num_image, num_Text = 0, 0, 0
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()
                # Open the output once per page instead of once per box.
                with open(wdPath, 'a', encoding='utf-8') as out_file:
                    for out in layout:
                        if isinstance(out, LTTextBoxHorizontal):
                            num_Text += 1
                            out_file.write(out.get_text() + '\n')
                        elif isinstance(out, LTImage):
                            # BUG FIX: LTImage has no get_text(); the
                            # original write raised AttributeError. Images
                            # are only counted now.
                            num_image += 1
#maxpages = 0 manager = PDFResourceManager(caching=caching) if case == 'txt': output = io.StringIO() converter = TextConverter(manager, output, codec=codec, laparams=laparams) if case == 'HTML': output = io.BytesIO() converter = HTMLConverter(manager, output, codec=codec, laparams=laparams) interpreter = PDFPageInterpreter(manager, converter) infile = open(fname, 'rb') for index, page in enumerate( PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True)): interpreter.process_page(page) convertedPDF = output.getvalue() infile.close() converter.close() output.close()
def pdfread(pdfPath):
    """Extract the body text of a paper-style PDF.

    Citation markers, headers/footers, page numbers, copyright lines and
    very short fragments are dropped; extraction stops at the references
    section. Returns the accumulated text, or None when reading fails.

    :param pdfPath: path of the PDF file to read.
    """
    with open(pdfPath, 'rb') as fp:
        try:
            print(pdfPath)
            # Build the parser/document pair and link them together.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # No password by default.
            doc.initialize()
            # Abort on documents that forbid text extraction (caught by the
            # outer except and reported like any other failure).
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmagr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmagr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmagr, device)
            allContent = ''
            last_para = ''
            result = ''
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The LTPage layout holds the parsed objects of this page;
                # only LTTextBoxHorizontal objects carry text here.
                layout = device.get_result()
                for x in layout:
                    try:
                        if isinstance(x, LTTextBoxHorizontal):
                            result = x.get_text()
                            # Drop the line breaks pdfminer inserts.
                            result = result.replace('\n', '')
                            # Drop citation markers like [1] or [2-5]
                            # (raw strings avoid invalid-escape warnings).
                            result = re.sub(r'\[(\d+\,* ?-?)+\]', '', result)
                            # Put each bullet item on its own line.
                            result = result.replace('∙', '\n∙')
                            # Stop once the references section starts.
                            if re.findall(r'^references?',
                                          last_para.lower().replace(' ', '')) != [] \
                                    or re.findall(r'^references?',
                                                  result.lower().replace(' ', '')) != []:
                                return allContent
                            # Skip footers, copyright lines, timestamps,
                            # trailing citations and tiny table fragments.
                            if re.findall(r'^Authorized licensed use limited to:',
                                          result) == [] \
                                    and re.findall('©', result) == [] \
                                    and re.findall('Publication date', result) == [] \
                                    and re.findall(r'\d\:\d', result) == [] \
                                    and re.findall(r'(et al.)$', result) == [] \
                                    and len(result) > 5:
                                allContent = allContent + '\n' + result
                    except Exception as e:
                        print(e)
                    last_para = result
            return allContent
        except Exception as e:
            print('文档读取失败:' + str(e))
def convert_pdf_to_txt(self, path):
    """Extract text from the PDF at *path*, stopping once the expected
    number of signature blocks has been seen while scanning the trailing
    pages in reverse.

    Returns the accumulated text, or None if the document yields no pages.
    NOTE(review): this appears tailored to Greek government documents
    ('ΠΕΡΙΕΧΟΜΕΝΑ' = contents, 'Οι Υπουργοί' = the ministers) — confirm.
    """
    start = timer()
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    # All processed pages accumulate into this single buffer; it is never
    # reset, so each getvalue() below returns everything rendered so far.
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr,
                           retstr,
                           codec=codec,
                           laparams=self.laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # Lazy page generator over the whole document.
    pages = PDFPage.get_pages(fp,
                              pagenos,
                              maxpages=maxpages,
                              password=password,
                              caching=caching,
                              check_extractable=True)
    # Analyze first page to get a feel of what's going on
    try:
        first_page = next(pages)
        interpreter.process_page(first_page)
    except StopIteration:
        # NOTE(review): fp and device are not closed on this early return.
        print("The pdf document may be damaged")
        return
    # Save pages to RAM to interpret only the last 3 ones
    temp_pages = []
    # Get the first page's text
    text = retstr.getvalue()
    num_signature_points = 1
    if 'ΠΕΡΙΕΧΟΜΕΝΑ' in text:
        # A table of contents implies multiple decisions, each with its own
        # signature block; count the numbered entries near the top.
        indexes = re.findall('[0-9] \n', text[120:350])
        num_signature_points = len(indexes)
    for page in pages:
        temp_pages.append(page)
    # Goes through the pages in reverse until it finds the stopword(s)
    signature_points_found = 0
    for page in reversed(temp_pages):
        interpreter.process_page(page)
        current_text = retstr.getvalue()
        # Signature markers: 'Οι Υπουργοί'/'ΟΙ ΥΠΟΥΡΓΟΙ' or a date match.
        if 'Οι Υπουργοί' in current_text or Helper.date_match().findall(current_text) \
                or 'ΟΙ ΥΠΟΥΡΓΟΙ' in current_text:
            signature_points_found += 1
            if signature_points_found == num_signature_points:
                break
    text = retstr.getvalue()
    fp.close()
    device.close()
    end = timer()
    print("{} seconds elapsed for parsing this pdf's text.".format(end -
                                                                   start))
    return text
def extract_introduction(proceedings: List[Dict[str, Any]]):
    """
    extract introductions from the proceedings papers

    Args:
        proceedings (List[Dict[str, Any]]):
            {'session': ●●●,
             'title': ●●●,
             'url': ●●●,
             'authors': [●●●, ...], }
            Each dict gains an 'introduction' key (empty string on failure).
    """
    # Section titles an introduction may be labelled with.
    chaps = ['はじめに', '序論', '背景', '背景と目的', 'Introduction']
    laparams = LAParams()
    laparams.detect_vertical = True
    for paper_dict in tqdm.tqdm(proceedings):
        manager = PDFResourceManager()
        paper_pdf = requests.get(paper_dict['url'])
        instr = BytesIO()
        instr.write(paper_pdf.content)
        outstr = StringIO()
        # BUG FIX: intro is initialized before the try block; previously a
        # failure before its first assignment raised NameError below.
        intro = ''
        with TextConverter(manager, outstr, laparams=laparams) as device:
            interpreter = PDFPageInterpreter(manager, device)
            try:
                # Only the first page is needed to locate the introduction.
                for page in PDFPage.get_pages(instr,
                                              set(),
                                              maxpages=1,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
                first = outstr.getvalue()
                print(first)
                for chap in chaps:
                    # Numbered heading with a space, e.g. "1 はじめに".
                    cn = '1 {}'.format(chap)
                    if cn in first:
                        top = first.find(cn) + len(cn)
                        # Cut at the start of section 2 (ASCII or Japanese
                        # full stop). The original had an exact duplicate
                        # of the first branch, which was dead code.
                        if '.\n\n2' in first:
                            intro = first[top:first.find('.\n\n2')]
                        elif '。\n\n2' in first:
                            intro = first[top:first.find('。\n\n2')]
                        else:
                            intro = first[top:]
                        break
                    # Heading without a space, e.g. "1はじめに".
                    cn = '1{}'.format(chap)
                    if cn in first:
                        top = first.find(cn) + len(cn)
                        if '.2' in first:
                            intro = first[top:first.find('.2')]
                        elif '。2' in first:
                            intro = first[top:first.find('。2')]
                        else:
                            intro = first[top:]
                        break
            except Exception as e:
                logger.error('error: {} url:{}'.format(e.args,
                                                       paper_dict['url']))
        # Collapse repeated newlines.
        intro = re.sub(r'\n+', '\n', intro)
        paper_dict['introduction'] = intro
        # Throttle requests to the proceedings server.
        time.sleep(1.5 + random.random())
        instr.close()
        outstr.close()
def PDFReader(path, pages=None):
    """Read a PDF with pdfminer and pdfquery, collecting per-line and
    per-box text plus bounding-box coordinates.

    :param path: path of the PDF file.
    :param pages: optional iterable of page numbers to restrict extraction.
    :return: (status, PagePosDict, PagePosDict1, PagePosDictCord,
              all_pages, all_page_set, PagePosDictJuris, PagePosDictPages)
             where status is 1 on a clean stop (empty page) and 0 when
             pdfquery iteration failed (typically past the last page).
    """
    PagePosDict = []
    PagePosDict1 = []
    PagePosDictCord = defaultdict()
    PagePosDictPages = defaultdict()
    PagePosDictJuris = []
    all_pages = []
    all_pages_juris = []
    all_page_set = []
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    fd = open(path, 'rb')
    page_no = 0
    # First pass (pdfminer): full text per page.
    for page in PDFPage.get_pages(fd, pagenums):
        output = StringIO()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        interpreter.process_page(page)
        doc = output.getvalue()
        # Close per-page resources before any `continue` (the original
        # leaked them on 'Service of Process' pages).
        converter.close()
        output.close()
        all_pages.append(doc)
        # Skip transmittal cover pages.
        if doc[:18] == "Service of Process":
            continue
        page_no = page_no + 1
        # Only the first 5 substantive pages feed the jurisdiction text.
        if page_no <= 5:
            all_pages_juris.append(doc)
    all_page_set.append(' '.join(all_pages_juris))
    fd.close()
    # Second pass (pdfquery): text with bounding boxes, up to 31 pages.
    i = 0
    pdf = pdfquery.PDFQuery(path)
    try:
        while i <= 30:
            pdf.load(i)
            JQuery = pdf.pq('LTPage')
            if JQuery.text().find('Service of Process Transmittal') >= 0:
                i += 1
                continue
            j = 0
            LineLength = len(JQuery("LTTextLineHorizontal"))
            BoxLength = len(JQuery("LTTextBoxHorizontal"))
            # A page with no text lines terminates extraction cleanly.
            if LineLength == 0:
                return 1, PagePosDict, PagePosDict1, PagePosDictCord, all_pages, all_page_set, PagePosDictJuris, PagePosDictPages
            if LineLength < BoxLength:
                NetLength = BoxLength
            else:
                NetLength = LineLength
            PagePosDictPage = defaultdict()
            while (j < NetLength):
                if j < LineLength and i <= 30:
                    PagePosDict.append(
                        JQuery(JQuery("LTTextLineHorizontal")[j]).text())
                    # Key: (page index, x0, y0, x1, y1) from the bbox attr.
                    cordinates = list()
                    cordinates.append(i)
                    cord = JQuery(
                        JQuery("LTTextLineHorizontal")[j]).attr('bbox')
                    for a in ['[', ']']:
                        cord = cord.replace(a, '')
                    for a in cord.split(', '):
                        cordinates.append(float(a))
                    PagePosDictCord[tuple(cordinates)] = JQuery(
                        JQuery("LTTextLineHorizontal")[j]).text()
                if j < BoxLength and i <= 30:
                    PagePosDict1.append(
                        JQuery(JQuery("LTTextBoxHorizontal")[j]).text())
                    cordinates = list()
                    cordinates.append(i)
                    cord = JQuery(
                        JQuery("LTTextBoxHorizontal")[j]).attr('bbox')
                    for a in ['[', ']']:
                        cord = cord.replace(a, '')
                    for a in cord.split(', '):
                        cordinates.append(float(a))
                    PagePosDictPage[tuple(cordinates)] = JQuery(
                        JQuery("LTTextBoxHorizontal")[j]).text()
                # Jurisdiction candidates come from the first 8 pages only.
                if j < BoxLength and i <= 7:
                    PagePosDictJuris.append(
                        JQuery(JQuery("LTTextBoxHorizontal")[j]).text())
                j += 1
            PagePosDictPages[i] = PagePosDictPage
            i += 1
    # BUG FIX: `except Exception, e:` is Python 2 syntax and a SyntaxError
    # on Python 3; `e` was unused, so it is dropped entirely.
    except Exception:
        return 0, PagePosDict, PagePosDict1, PagePosDictCord, all_pages, all_page_set, PagePosDictJuris, PagePosDictPages
def _bt_keyword_map(self):
    """Return the Bloom's-taxonomy level -> keyword-list mapping.

    (The original duplicated these tables in two branches of post().)
    """
    bt1 = [
        'define', 'describe', 'draw', 'find', 'identify', 'label', 'list',
        'locate', 'match', 'memorise', 'name', 'recall', 'recite',
        'recognize', 'relate', 'reproduce', 'select', 'state', 'tell',
        'write'
    ]
    bt2 = [
        'compare', 'convert', 'demonstarte', 'describe', 'discuss',
        'distinguish', 'explain', 'find out more information about',
        'generalize', 'interpret', 'outline', 'paraphrase', 'predict',
        'put into your own words', 'relate', 'restate', 'summarize',
        'translate', 'visualize'
    ]
    bt3 = [
        'apply', 'calculate', 'change', 'choose', 'complete', 'construct',
        'examine', 'illustrate', 'interpret', 'make', 'manipulate',
        'modify', 'produce', 'put into practice', 'put together', 'solve',
        'show', 'translate', 'use'
    ]
    bt4 = [
        'advertise', 'analyse', 'categoriase', 'compare', 'contrast',
        'deduce', 'differenciate', 'distinguish', 'examine', 'explain',
        'identify', 'investigate', 'seperate', 'subdivide', 'take apart'
    ]
    bt5 = [
        'argue', 'assess', 'choose', 'compose', 'construct', 'create',
        'criticise', 'critique', 'debate', 'decide', 'defend', 'design',
        'determine', 'device', 'discuss', 'estimate', 'evaluate',
        'formulate', 'imagine', 'invent', 'judge', 'justify', 'plan',
        'predict', 'prioritise', 'propose', 'rate', 'recommend', 'select',
        'value'
    ]
    bt6 = [
        'add to', 'argue', 'assess', 'choose', 'combine', 'compose',
        'construct', 'create', 'debate', 'decide', 'design', 'determine',
        'devise', 'discuss', 'forcast', 'formulate', 'hypothesise',
        'imagine', 'invent', 'judge', 'justify', 'originate', 'plan',
        'predict', 'priortise', 'propose', 'rate', 'recommend', 'select',
        'verify'
    ]
    return {
        'bt1': bt1,
        'bt2': bt2,
        'bt3': bt3,
        'bt4': bt4,
        'bt5': bt5,
        'bt6': bt6
    }

def _classify_questions(self, data):
    """Split *data* on 'q' and tag each question with the taxonomy levels
    of the keywords it contains.

    :param data: lower-cased question text.
    :return: (questions, per-question list of matched level names)
    """
    bt = self._bt_keyword_map()
    read1 = data.split("q")
    btperq = []
    for i in range(1, len(read1)):
        btlevellist = []
        # BUG FIX: str.translate(None, digits) is Python 2 only; build a
        # deletion table with str.maketrans for Python 3.
        read1[i] = read1[i].translate(str.maketrans('', '', digits))
        read1[i] = re.sub('[.,!?]', '', read1[i])
        words = read1[i].split(" ")
        for word in words:
            # BUG FIX: the original indexed bt.keys()[...] which fails on
            # Python 3 dict views; iterate items() instead.
            for level, keywords in bt.items():
                if word in keywords:
                    btlevellist.append(level)
        btperq.append(btlevellist)
    return read1, btperq

def _extract_pdf_text(self, pdf_path):
    """Extract all text from *pdf_path* using pdfminer's page aggregator."""
    password = ""
    extracted_text = ""
    # Open and read the pdf file in binary mode; closed on all paths.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        # Abort if the document forbids text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only LTTextBox and LTTextLine objects carry text.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    return extracted_text

def _ocr_image(self, image_path):
    """Grayscale + Otsu-threshold the image, then OCR it with pytesseract."""
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Thresholding preprocesses the image for more reliable OCR.
    gray = cv2.threshold(gray, 0, 255,
                         cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # Write the grayscale image to disk as a temporary file so we can
    # apply OCR to it, then remove it.
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)
    text = pytesseract.image_to_string(Image.open(filename))
    os.remove(filename)
    return text

def post(self, request, *args, **kwargs):
    """Handle an uploaded file.

    PDFs are converted to text and returned verbatim; JPGs are OCR'd;
    any other extension is treated as a question file and classified by
    Bloom's-taxonomy keywords.
    """
    file_serializer = FileSerializer(data=request.data)
    if not file_serializer.is_valid():
        return Response(file_serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
    file_serializer.save()
    file_path = "D:/file/fileupload" + file_serializer.data["file"]
    t = file_path.split(".")
    if t[1] == "pdf":
        extracted_text = self._extract_pdf_text(file_path)
        # BUG FIX: the original returned the undefined name
        # `nextracted_text`, raising NameError at runtime.
        return HttpResponse(extracted_text.encode("utf-8"))
    elif t[1] == "jpg":
        return HttpResponse(self._ocr_image(file_path))
    else:
        # Treat any other upload as a plain-text question file.
        with open(file_path, "r") as fh:
            data = fh.read().lower()
        read1, btperq = self._classify_questions(data)
        senddata = {
            'question': read1,
            'btlevel': btperq,
            'list': zip(read1, btperq)
        }
        return Response(senddata, template_name='file.html')
def create_candidates(path, sel_id, min_req, desire_req):
    """Build candidate records from resumes stored in the AWS S3 bucket ``rosev0``.

    Every object under the key prefix ``path`` is read; its text is extracted
    according to the file extension (.docx via python-docx, .pdf via pdfminer,
    .doc via the external ``antiword`` tool), then three spaCy NER models
    (work / education / personal) are run over the text and their entities are
    folded into one candidate dict per resume.

    :param path: S3 key prefix to scan for resumes.
    :param sel_id: Selection id stamped on every produced candidate.
    :param min_req: Minimum requirements (currently unused in this function).
    :param desire_req: Desired requirements (currently unused in this function).
    :return: list of dicts with keys ``name``, ``mail``, ``info``, ``selection``.

    NOTE(review): relies on module-level globals ``current``, ``min_locs`` and
    ``googlemaps_reserved`` — confirm they are defined in this module.
    """
    # Connect to AWS S3.
    s3 = boto3.resource("s3",
                        region_name='us-east-2',
                        aws_access_key_id=os.environ.get('AWS_KEY'),
                        aws_secret_access_key=os.environ.get('AWS_SECRET'))
    # Load the NER models once, outside the per-resume loop.
    work_nlp = spacy.load('selection/models/work')
    ed_nlp = spacy.load('selection/models/education')
    per_nlp = spacy.load('selection/models/personal')
    rose_bucket = s3.Bucket(r'rosev0')
    candidates = []
    for resume in rose_bucket.objects.filter(Prefix=path):
        key = resume.key
        body = resume.get()['Body'].read()
        buffer = io.BytesIO()
        buffer.write(body)
        ext = re.search(r'\.[a-z]+$', key)
        print(key)
        # FIX: reset per resume; previously an unknown extension left `text`
        # unbound (NameError) or stale from the previous resume.
        text = None
        if ext is None:
            continue
        elif ext.group() == '.docx':
            document = Document(buffer)
            text = "\n".join(paragraph.text for paragraph in document.paragraphs)
        elif ext.group() == '.pdf':
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(buffer,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
        elif ext.group() == '.doc':
            # antiword (Linux) tolerates damaged .doc files; '$' and '#'
            # break shell paths, so neutralize them in the temp filename.
            filename = str(key).split('/')[-1].replace('$', '_').replace('#', '_')
            pathdoc = 'selection/tmp/' + filename
            print('trying download in ' + pathdoc)
            rose_bucket.download_file(key, pathdoc)
            try:
                output = subprocess.check_output('antiword "' + pathdoc + '"',
                                                 shell=True)
                text = output.decode('utf-8')
            except Exception:
                # Best-effort: skip resumes antiword cannot process.
                continue
        if text is not None:
            doc_work = work_nlp(text)
            doc_ed = ed_nlp(text)
            doc_per = per_nlp(text)
            results = []
            # Work-model entity buckets.
            comp_work, desig_work, years_work, other_work = [], [], [], []
            desig_ind, years_ind = [], []
            idioms, skills = [], []
            # Education-model entity buckets.
            grad_ed, colleges, degrees, certif = [], [], [], []
            # Personal-model entity buckets.
            names, locations, mails, phones = [], [], [], []
            for ent in doc_work.ents:
                value = ent.text
                if ent.label_ == 'companies worked at':
                    comp_work.append(value)
                elif ent.label_ == 'designation':
                    desig_work.append(value)
                    desig_ind.append(ent.start_char)
                elif ent.label_ == 'years of experience':
                    years_work.append(value)
                    years_ind.append(ent.start_char)
                elif ent.label_ == 'idioms':
                    idioms.append(value)
                elif ent.label_ == 'skills':
                    skills.append(value)
                else:
                    other_work.append([ent.label_, value])
                results.append([ent.label_, value, text.index(value)])
            for ent in doc_ed.ents:
                value = ent.text
                if ent.label_ == 'graduation year':
                    grad_ed.append(value)
                elif ent.label_ == 'college':
                    colleges.append(value)
                elif ent.label_ == 'degree':
                    degrees.append(value)
                elif ent.label_ == 'certifications':
                    certif.append(value)
                results.append([ent.label_, value, text.index(value)])
            for ent in doc_per.ents:
                value = ent.text
                if ent.label_ == 'name':
                    names.append(value)
                elif ent.label_ == 'location':
                    locations.append(value)
                elif ent.label_ == 'mail':
                    mails.append(value)
                elif ent.label_ == 'phone':
                    phones.append(value)
                results.append([ent.label_, value, text.index(value)])
            print(
                'Validating same lenght of ind and entities, desig_work_years: '
                + str(len(desig_work)) + ' ' + str(len(comp_work)) + ' '
                + str(len(years_work)))
            ### YEARS OF EXPERIENCE — years since the earliest job year found.
            working_years = re.findall("[0-9]{4}", str(years_work))
            if_current = [
                value.replace("'", "")
                for value in re.findall("([a-zA-Z]+')", str(years_work))
                if value != "experience'"
            ]
            val_current = 0
            for value in if_current:
                for value2 in current:  # `current`: module-level word list — confirm
                    if SequenceMatcher(None, value.lower(), value2).ratio() > 0.8:
                        val_current += 1
            # FIX: default so a resume with no detectable year neither raises
            # NameError nor leaks the previous resume's value.
            experience = 0
            if len(working_years) != 0:
                year_min = min(working_years)
                if val_current > 0:
                    # A "current job" word was found: count up to this year.
                    year_max = datetime.now().strftime('%Y')
                else:
                    year_max = max(working_years)
                experience = int(year_max) - int(year_min)
            ### IDIOMS — drop the literal word 'idioma' itself from the list.
            c_idioms = [
                value for value in idioms
                if SequenceMatcher(None, value.lower(), 'idioma').ratio() < 0.8
            ]
            print(c_idioms)
            ### DEGREE TYPE — technical / master / doctorate / professional.
            degrees_l = str(degrees).lower()
            degrees_ascii = degrees_l.replace('í', 'i')
            pro_type = []
            if 'técnic' in degrees_l or 'tecnic' in degrees_l:
                pro_type.append('Técnico')
            elif ('msc' in degrees_l or 'master' in degrees_l
                  # FIX: was `'maestría' in ….replace('í','i')` which can never
                  # match because the accented char was removed from the haystack.
                  or 'magister' in degrees_ascii or 'maestria' in degrees_ascii):
                pro_type.append('Master')
            elif ('phd' in degrees_l or 'doctor' in degrees_l
                  or 'doctorado' in degrees_l):
                pro_type.append('Doctorado')
            elif 'universidad' in degrees_l:
                pro_type.append('Profesional')
            else:
                pro_type.append('-')
            ### LOCATION — is the candidate inside the minimum region?
            # FIX: was `len[min_locs]`, which raises TypeError (subscripting
            # the `len` builtin) on the very first candidate.
            if len(min_locs) > 0:
                if len(locations) > 0:
                    location = []
                    if len(locations) >= 2 and locations[1] not in locations[0]:
                        adress_raw = locations[0] + ' ' + locations[1]
                    else:
                        adress_raw = locations[0]
                    # FIX: accumulate replacements; the original restarted from
                    # adress_raw each pass, so only the last reserved char was
                    # actually stripped before the Google Maps call.
                    adress = str(adress_raw)
                    for car in googlemaps_reserved:
                        adress = adress.replace(car, '')
                    endpoint = ('https://maps.googleapis.com/maps/api/geocode/json?address='
                                + adress.replace(' ', '+')
                                + '&key=' + os.environ.get('MAPS_KEY'))
                    try:
                        get_location = requests.get(endpoint).json()['results'][0]
                        for comp in get_location['address_components']:
                            if comp['types'][0] == 'administrative_area_level_1':
                                location.append(comp['long_name'])
                            if comp['types'][0] == 'country':
                                location.append(comp['long_name'])
                    except Exception:
                        loc_ind = 2  # geocoding failed: unknown
                        continue  # skip this resume entirely
                    for loc in min_locs:
                        min_loc = []
                        endpoint2 = ('https://maps.googleapis.com/maps/api/geocode/json?address='
                                     + loc.replace(' ', '+')
                                     + '&key=' + os.environ.get('MAPS_KEY'))
                        get_minloc = requests.get(endpoint2).json()['results'][0]
                        for comp in get_minloc['address_components']:
                            if comp['types'][0] == 'administrative_area_level_1':
                                min_loc.append(comp['long_name'])
                            if comp['types'][0] == 'country':
                                min_loc.append(comp['long_name'])
                        if len(min_loc) == 1:
                            # Only a country to compare against.
                            loc_ind = 1 if location[1] == min_loc[0] else 0
                        elif len(min_loc) == 2:
                            # Region and country must both match.
                            loc_ind = 1 if (location[1] == min_loc[1]
                                            and location[0] == min_loc[0]) else 0
                else:
                    loc_ind = 2  # no candidate location found: unknown
            else:
                loc_ind = 0  # no location requirement configured
            # NOTE(review): loc_ind is computed but not included in cand_data —
            # presumably consumed by the (future) ranking step; confirm.
            cand_data = {
                "exp": experience,
                "type": pro_type,
                "idioms": c_idioms,
                "skills": skills,
                "location": locations,
                "companies": comp_work,
                "phone": phones,
                "mail": mails,
                "designation": desig_work,
                "college": colleges,
                "graduation": grad_ed,
                # Key typo kept on purpose: downstream consumers may already
                # depend on "certficiations".
                "certficiations": certif
            }
            ### ADD RANKING HERE — placeholder random rank for now.
            cand_info = {"data": cand_data, "rank": random.randint(1, 100)}
            cand = {
                "name": names[0] if len(names) > 0 else 'Desconocido',
                "mail": mails[0] if len(mails) > 0 else None,
                "info": cand_info,
                "selection": sel_id,
            }
            candidates.append(cand)
    return candidates
def extract_text_from_pdf(self, way='pdfminer', outputType="text"):
    """Extract text from the PDF at ``self.path``.

    Parameters
    ----------
    way : str
        Extraction backend: ``'pdfminer'`` (default) or ``'fitz'`` (PyMuPDF).
    outputType : str
        Used only by the ``'fitz'`` backend; passed straight to
        ``page.getText``. One of "text" (default, plain text), "html",
        "dict", "rawdict", "xhtml" or "xml" — see the PyMuPDF TextPage docs
        for the exact structure of each.

    Returns
    -------
    str
        Raw text of the whole document when ``way == 'pdfminer'`` and any
        text was found.
    list
        Per-page extraction results when ``way == 'fitz'`` (list index is
        the page number).
    None
        When no backend produced output (an info message is logged).
    """
    logging.info('Inside extract_text_from_pdf')
    if way == 'pdfminer':
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        try:
            with open(self.path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # FIX: release the converter and buffer even when page
            # processing raises (they previously leaked on error).
            converter.close()
            fake_file_handle.close()
        if text:
            logging.info('Successfully extracted text - Exiting extract_text_from_pdf')
            return text
    if way == 'fitz':
        pdf_document = fitz.open(self.path)
        text_data_list = []
        for page_number in range(pdf_document.pageCount):
            text_data_list.append(pdf_document[page_number].getText(outputType))
        # FIX: close the document so PyMuPDF releases the file handle.
        pdf_document.close()
        return text_data_list
    logging.info('PDF was not readable - Exiting extract_text_from_pdf')
doc = PDFDocument() # Connect the parser and document objects. parser.set_document(doc) doc.set_parser(parser) # Supply the password for initialization. # (If no password is set, give an empty string.) password = '' doc.initialize(password) # Check if the document allows text extraction. If not, abort. if not doc.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in doc.get_pages(): interpreter.process_page(page) layout = device.get_result() for element in layout: if isinstance(element, LTTextBoxHorizontal): print(element.get_text())
def main(argv):
    """pdf2txt-style command-line driver.

    Parses pdf2txt options from ``argv``, builds the requested converter
    device (text/xml/html/tag) and runs every input file through it.

    :param argv: full argument vector, ``argv[0]`` being the program name.
    :return: 100 on usage error (bad option or no input files), None on success.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # NOTE: disables layout analysis; a later -A/-V/-M/... after -n
            # would fail on the None laparams (pre-existing behavior, kept).
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    # Propagate the debug level to the pdfminer components.
    # PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Infer the output type from the output filename when not given with -t.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # FIX: `file(...)` is a Python 2 builtin (NameError on Python 3).
        # Binary mode because the converters encode output with `codec`.
        outfp = open(outfile, 'wb')
    else:
        # On Python 3 the converters need the byte-oriented stdout buffer.
        outfp = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # FIX: `file(fname, 'rb')` -> `open(fname, 'rb')` (Python 3).
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
def extract_pdf_data(fp, test_proportion=0, labels=None, session=None):
    """Get PDF data from a file.

    Walks every page of the PDF, recording each text box as a Box row and
    each non-empty text line as a Line row, all attached to one Document row.

    TODO why is this a standalone function?

    :param fp: A file pointer to the PDF.
    :param test_proportion: Probability that the document is flagged as a
        test sample (``is_test``).
    :param labels: Correct metadata labels for this document
        (attribute name -> value, set on the Document). Defaults to none.
    :param session: A SQLAlchemy session, for saving.
    :return: A Document object.
    """
    # FIX: mutable default argument (labels={}) replaced with the
    # None-sentinel idiom; behavior for all callers is unchanged.
    if labels is None:
        labels = {}
    filename = os.path.split(fp.name)[-1]
    parser = PDFParser(fp)
    pdf = PDFDocument(parser)
    parser.set_document(pdf)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(pdf)
    document = Document(filename=filename, is_test=random() < test_proportion)
    for key in labels:
        setattr(document, key, labels[key])
    if session:
        session.add(document)
    for i, page in enumerate(pages):
        # TODO: figure out how to get the number of pages directly from
        # the pages object instead of counting as we go.
        document.num_pages = i + 1
        interpreter.process_page(page)
        layout = device.get_result()
        boxes = [obj for obj in layout if isinstance(obj, LTTextBox)]
        for b in boxes:
            box = Box(document=document,
                      page=i,
                      x0=b.bbox[0],
                      y0=b.bbox[1],
                      x1=b.bbox[2],
                      y1=b.bbox[3],
                      vertical=isinstance(b, LTTextBoxVertical))
            if session:
                session.add(box)
            lines = [obj for obj in b if isinstance(obj, LTTextLine)]
            for l in lines:
                # Strip pdfminer's unmapped-glyph placeholders, e.g. "(cid:5)".
                text = re.sub(r'\(cid:\d+\)', "", l.get_text()).strip()
                if len(text) > 0:
                    line = Line(box=box,
                                document=document,
                                x0=l.bbox[0],
                                y0=l.bbox[1],
                                x1=l.bbox[2],
                                y1=l.bbox[3],
                                text=text,
                                vertical=isinstance(l, LTTextLineVertical),
                                page=i)
                    if session:
                        session.add(line)
    # Do the whole file in one transaction so we can restart easily
    # if necessary.
    if session:
        session.commit()
    return document
def pdf_to_str(pdf_filepath):
    """Return the text content of the PDF at ``pdf_filepath`` as a string.

    Adapted from https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
    (pdfTextMiner.py). Pipeline: PDFParser reads the file, PDFDocument holds
    the parsed content, and a PDFPageInterpreter renders each page through a
    PDFPageAggregator so LTTextBox/LTTextLine objects can be collected.

    :param pdf_filepath: path to the PDF file.
    :return: concatenated text of every text box/line in the document.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    password = ""
    extracted_text = ""
    # FIX: `with` guarantees the file is closed even when
    # PDFTextExtractionNotAllowed (or any parsing error) is raised;
    # the original leaked the handle on those paths.
    with open(pdf_filepath, "rb") as fp:
        # Parse the raw file and hand the content to a document object.
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        # Abort if the document does not permit text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resources (fonts, images) + layout-analysis parameters.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # The aggregator device exposes layout (LT*) objects per page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only text containers contribute to the output string.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    return extracted_text