Exemplo n.º 1
0
def export_as_csv(pdf_path, csv_path):
    """Write the text following the first '.' of a PDF to a one-column CSV.

    Pages are read from ``extract_text_by_page(pdf_path)`` and scanned one
    character at a time:

    * everything up to and including the first '.' is skipped (this state
      carries over from page to page);
    * after that, every '.' is dropped;
    * U+25B6 is replaced by the text ' TRIANGLE ';
    * a form feed character ends the current row, which is written as a
      single CSV cell;
    * any other character is kept unchanged.

    Text still buffered after the last page is written as a final row.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create (UTF-8, overwritten).
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        buffer = []
        started = False
        for page in extract_text_by_page(pdf_path):
            for letter in page:
                if letter == '.':
                    started = True
                    continue
                if not started:
                    continue
                if letter == u"\u25B6":
                    buffer.append(' TRIANGLE ')
                elif letter == '\x0c':
                    writer.writerow([''.join(buffer)])
                    buffer = []
                else:
                    buffer.append(letter)

        # Join the leftover characters like every other row; writing the raw
        # list would put its Python repr into the cell.
        if buffer:
            writer.writerow([''.join(buffer)])
Exemplo n.º 2
0
def export_as_csv(pdf_path, csv_path):
    """Write the first 100 characters of each PDF page as one CSV row.

    Each page yielded by ``extract_text_by_page(pdf_path)`` is truncated to
    its first 100 characters and split on whitespace; the resulting words
    become the cells of that page's row.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create (overwritten).
    """
    # Write to csv_path: opening pdf_path in 'w' mode would truncate the
    # input PDF before it is read. newline='' lets the csv module control
    # line endings itself.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for page in extract_text_by_page(pdf_path):
            writer.writerow(page[0:100].split())
Exemplo n.º 3
0
def export_as_json(pdf_path, json_path):
    """Dump the first 100 characters of every PDF page to a JSON file.

    The JSON object has two keys: 'Filename' (the PDF's base name without
    extension) and 'Pages', a list holding one single-key object per page,
    keyed 'Page_<n>' with n counting from 1.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        json_path: Path of the JSON file to create (overwritten).
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    page_entries = [
        {'Page_{}'.format(index): page_text[0:100]}
        for index, page_text in enumerate(
            extract_text_by_page(pdf_path), start=1
        )
    ]
    payload = {'Filename': stem, 'Pages': page_entries}
    with open(json_path, 'w') as fh:
        json.dump(payload, fh)
Exemplo n.º 4
0
def export_as_json(pdf_path, json_path):
    """Dump the lower-cased, cleaned text of every PDF page to a JSON file.

    For each page yielded by ``extract_text_by_page(pdf_path)``, form feeds
    become spaces, the typographic ligatures U+FB00..U+FB04 are expanded to
    their plain-letter spelling, and the text is lower-cased. The JSON
    object has the keys 'Filename' (PDF base name without extension) and
    'Pages' (a list of {'Page_number': n, 'text': ...} objects, n from 1).

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        json_path: Path of the JSON file to create (overwritten).
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Ligatures are detestable! They are spelled as code points here so the
    # mapping cannot silently degrade into replacing 'fi' with 'fi'.
    cleanup = {
        0x0C: ' ',      # form feed
        0xFB00: 'ff',
        0xFB01: 'fi',
        0xFB02: 'fl',
        0xFB03: 'ffi',
        0xFB04: 'ffl',
    }

    pages = [
        {'Page_number': number, 'text': page.translate(cleanup).lower()}
        for number, page in enumerate(extract_text_by_page(pdf_path), start=1)
    ]
    data = {'Filename': filename, 'Pages': pages}

    with open(json_path, 'w') as fh:
        json.dump(data, fh)
Exemplo n.º 5
0
def export_as_csv(pdf_path, csv_path, title_array):
    """Write PDF words to a CSV, ``len(title_array)`` words per row.

    ``title_array`` is written first as the header row. For every page
    yielded by ``extract_text_by_page(pdf_path)``, each U+25CF character is
    replaced by a comma, the text is split on whitespace, and the words are
    written in rows of ``len(title_array)`` cells. Rows never span pages;
    words left over at the end of a page are written as a shorter row
    rather than being discarded.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create (overwritten).
        title_array: Header cells; its length sets the row width. An empty
            sequence raises ZeroDivisionError on the first word.
    """
    row_length = len(title_array)
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(title_array)
        for page in extract_text_by_page(pdf_path):
            words = page.replace('\u25cf', ',').split()
            row = []
            for position, word in enumerate(words, start=1):
                row.append(word)
                if position % row_length == 0:
                    writer.writerow(row)
                    row = []
            # Flush the incomplete last row so no words are lost.
            if row:
                writer.writerow(row)
Exemplo n.º 6
0
def export_as_xml(pdf_path, xml_path):
    """Write the first 100 characters of every PDF page to an XML file.

    The root element is named after the PDF's base name (without
    extension) and contains a single 'Pages' element with one
    'Page_<n>' child per page, n counting from 1. The document is
    pretty-printed with two-space indentation and saved as UTF-8.

    NOTE(review): ``xml`` is presumably an alias of
    ``xml.etree.ElementTree`` and ``minidom`` of ``xml.dom.minidom``;
    confirm against the file's imports.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
        xml_path: Path of the XML file to create (overwritten).
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')

    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        page_element = xml.SubElement(pages, 'Page_{}'.format(counter))
        page_element.text = page[0:100]

    xml_string = xml.tostring(root, 'utf-8')
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent='  ')

    # The pretty-printed document has no encoding declaration, so it must
    # be stored as UTF-8 regardless of the platform's default encoding.
    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)
Exemplo n.º 7
0
def export_as_json(pdf_path):
    """Build a dict describing every page of a PDF and return it.

    Each page from ``extract_text_by_page(pdf_path)`` is passed through
    ``parse_page``. The result holds:

    * 'filename': PDF base name without extension;
    * 'number': ``get_doc_number`` of the last page processed (None when
      there are no pages);
    * 'pages': one {'page': n, 'content': ...} dict per page, n from 1.
      For pages 1-3 the content is the parsed page itself; from page 4 on
      it is the list produced by splitting the parsed page on a fixed
      marker string;
    * 'errors': present only when ``get_doc_content_correct`` was falsy
      for every page.

    Nothing is written to disk.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.

    Returns:
        The dict described above.
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    pages = []
    contents_page_ok = False
    doc_number = None

    for page_no, raw_page in enumerate(
        extract_text_by_page(pdf_path), start=1
    ):
        parsed_page = parse_page(raw_page)

        # The check runs on every page; one truthy result is enough.
        if get_doc_content_correct(parsed_page):
            contents_page_ok = True

        # Overwritten on each iteration, so the last page's value wins.
        doc_number = get_doc_number(parsed_page)

        if page_no > 3:
            # NOTE: the whole list of split parts is stored, not one part.
            content = parsed_page.split('Взам. Инв. № 3434')
        else:
            content = parsed_page

        pages.append({
            'page': page_no,
            'content': content,
        })

    data = {'filename': filename, 'number': doc_number, 'pages': pages}

    if not contents_page_ok:
        # ERROR - missing or incorrect 'Contents' page. The message reads
        # roughly: "Volume contents; Sheet: 4. Title block: field (1) - the
        # code must match the code on the title page, with a capital
        # letter C appended after a hyphen".
        data['errors'] = [
            "Содержание тома; Лист: 4. Основная надпись: графа (1) - шифр должен соответствовать шифру, указанному на тит. листе, и добавляется через дефис прописная буква С"
        ]

    return data
Exemplo n.º 8
0
def export_as_xml(pdf_path, xml_path):
    """Ask the user for a PDF and write each page's first 100 chars as XML.

    The ``pdf_path`` argument is ignored: the PDF is always chosen through
    ``easygui.fileopenbox()``. The root element is named after the chosen
    file's base name (without extension) and contains a single 'Pages'
    element with one 'Page_<n>' child per page, n counting from 1. The
    document is pretty-printed with two-space indentation and saved as
    UTF-8 to ``xml_path``.

    NOTE(review): if the dialog returns None (presumably when it is
    cancelled; confirm against the easygui documentation),
    ``os.path.basename`` raises TypeError.

    NOTE(review): ``xml`` is presumably an alias of
    ``xml.etree.ElementTree`` and ``minidom`` of ``xml.dom.minidom``;
    confirm against the file's imports.

    Args:
        pdf_path: Unused; kept so existing callers keep working.
        xml_path: Path of the XML file to create (overwritten).
    """
    pdf_path = easygui.fileopenbox()
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')

    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        page_element = xml.SubElement(pages, 'Page_{}'.format(counter))
        page_element.text = page[0:100]

    xml_string = xml.tostring(root, 'utf-8')
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent='  ')

    # The pretty-printed document has no encoding declaration, so it must
    # be stored as UTF-8 regardless of the platform's default encoding.
    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)
Exemplo n.º 9
0
def extract_text(pdf_path):
    """Print each page's text to stdout, followed by an empty line.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_by_page``.
    """
    for page_text in extract_text_by_page(pdf_path):
        print(page_text, end='\n\n')