def export_as_csv(pdf_path, csv_path):
    """Write the text of a PDF to a CSV file as single-cell rows.

    Characters come from iterating every page yielded by
    ``extract_text_by_page(pdf_path)``. Nothing is collected until the first
    ``'.'`` has been seen (this state carries over from page to page); from
    then on every character is collected except ``'.'`` itself, which is
    always dropped. U+25B6 (a right-pointing triangle) is replaced by the
    text ``' TRIANGLE '``. Each form feed (U+000C) ends the current row:
    the collected text is written as one cell and collection starts again.

    Text still pending when the input ends is written as a final row. The
    previous implementation wrote the Python ``list`` object itself there,
    which put its ``repr`` into the cell (``[]`` when nothing was pending).

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    triangle = u"\u25B6"
    form_feed = "\x0c"

    # newline='' is what the csv module asks for; without it some platforms
    # translate the writer's "\r\n" terminator a second time.
    with open(csv_path, 'w', encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        collected = []
        started = False
        for page in extract_text_by_page(pdf_path):
            for letter in page:
                if letter == '.':
                    # The first dot switches collection on; dots themselves
                    # are never part of the output.
                    started = True
                    continue
                if not started:
                    continue
                if letter == triangle:
                    collected.append(' TRIANGLE ')
                elif letter == form_feed:
                    # NOTE(review): presumably the form feed marks a page
                    # end - confirm against extract_text_by_page.
                    writer.writerow([''.join(collected)])
                    collected = []
                else:
                    collected.append(letter)
        if collected:
            writer.writerow([''.join(collected)])
def export_as_csv(pdf_path, csv_path):
    """Write the leading words of every PDF page to a CSV file.

    For each page yielded by ``extract_text_by_page(pdf_path)`` the first
    100 characters are split on whitespace and the resulting words are
    written as one CSV row.

    The previous implementation opened ``pdf_path`` for writing, which
    truncated the input PDF and never created ``csv_path``.

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    # newline='' is what the csv module asks for; without it some platforms
    # translate the writer's "\r\n" terminator a second time.
    with open(csv_path, 'w', encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        for page in extract_text_by_page(pdf_path):
            writer.writerow(page[0:100].split())
def export_as_json(pdf_path, json_path):
    """Dump a 100-character preview of every PDF page to a JSON file.

    The object written to ``json_path`` has two keys: ``"Filename"``, the
    base name of ``pdf_path`` without its extension, and ``"Pages"``, a list
    holding one single-key object per page (``"Page_1"``, ``"Page_2"``, ...)
    whose value is the first 100 characters of that page as yielded by
    ``extract_text_by_page``.

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        json_path: Path of the JSON file to create or overwrite.
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))

    # All pages are collected before the output file is opened, so a failure
    # while reading the PDF leaves json_path untouched.
    previews = [
        {'Page_{}'.format(number): page_text[0:100]}
        for number, page_text in enumerate(extract_text_by_page(pdf_path), start=1)
    ]
    payload = {'Filename': stem, 'Pages': previews}

    with open(json_path, 'w') as fh:
        json.dump(payload, fh)
def export_as_json(pdf_path, json_path):
    """Export the cleaned, lower-cased text of every PDF page to a JSON file.

    Each page yielded by ``extract_text_by_page(pdf_path)`` is cleaned by
    turning form feeds (U+000C) into spaces and expanding the typographic
    ligatures U+FB00..U+FB04 into their plain letters (``ff``, ``fi``,
    ``fl``, ``ffi``, ``ffl``), then lower-cased.

    The object written to ``json_path`` has the keys ``"Filename"`` (base
    name of ``pdf_path`` without extension) and ``"Pages"``, a list of
    ``{"Page_number": <1-based index>, "text": <cleaned text>}`` objects.

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        json_path: Path of the JSON file to create or overwrite.
    """
    # Ligatures are a nuisance in extracted text. The code points are spelled
    # as escapes on purpose: written as literal glyphs they are easily
    # normalised to plain letters, which turns every replacement into a no-op.
    ligatures = (
        ('\ufb00', 'ff'),
        ('\ufb01', 'fi'),
        ('\ufb02', 'fl'),
        ('\ufb03', 'ffi'),
        ('\ufb04', 'ffl'),
    )

    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    data = {'Filename': filename, 'Pages': []}

    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        text = page.replace('\x0c', ' ')
        for glyph, letters in ligatures:
            text = text.replace(glyph, letters)
        data['Pages'].append({'Page_number': counter, 'text': text.lower()})

    with open(json_path, 'w') as fh:
        json.dump(data, fh)
def export_as_csv(pdf_path, csv_path, title_array):
    """Write the words of a PDF to a CSV file in rows as wide as the header.

    ``title_array`` is written first as the header row. For each page
    yielded by ``extract_text_by_page(pdf_path)`` every U+25CF (black
    circle) is replaced by a comma, the text is split on whitespace, and the
    words are written in consecutive rows of ``len(title_array)`` words.
    Rows never span two pages.

    Words left over at the end of a page that do not fill a whole row are
    written as a final, shorter row; the previous implementation silently
    discarded them. If ``title_array`` is empty there is no row width, so
    all words of a page end up in one row (previously a
    ``ZeroDivisionError``).

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        csv_path: Path of the CSV file to create or overwrite.
        title_array: Sequence of column titles; its length sets the number
            of words per row.
    """
    row_length = len(title_array)

    # newline='' is what the csv module asks for; without it some platforms
    # translate the writer's "\r\n" terminator a second time.
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(title_array)
        for page in extract_text_by_page(pdf_path):
            words = page.replace('\u25cf', ',').split()
            row = []
            for word in words:
                row.append(word)
                if len(row) == row_length:
                    writer.writerow(row)
                    row = []
            if row:
                # Incomplete last row of the page: keep the data.
                writer.writerow(row)
def export_as_xml(pdf_path, xml_path):
    """Write a 100-character preview of every PDF page to a pretty-printed XML file.

    The root element is named after the base name of ``pdf_path`` without
    its extension and contains one ``Pages`` element. ``Pages`` holds one
    child per page yielded by ``extract_text_by_page`` (``Page_1``,
    ``Page_2``, ...) whose text is the first 100 characters of that page.
    The tree is serialised, re-parsed with ``minidom`` and written with
    ``toprettyxml``.

    Two fixes over the previous implementation:

    * C0 control characters other than tab, LF and CR (for example a form
      feed) are removed from the preview. They are not allowed in XML 1.0
      and made ``minidom.parseString`` raise.
    * The file is written as UTF-8, which is what an XML document without
      an encoding declaration is read as; the locale encoding was used
      before.

    The base name is used verbatim as the root tag, so it must be a valid
    XML element name (no spaces, no leading digit); otherwise re-parsing
    raises.

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
        xml_path: Path of the XML file to create or overwrite.
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]

    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')

    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        node = xml.SubElement(pages, 'Page_{}'.format(counter))
        # Slice first so the preview window stays the same, then drop the
        # characters XML cannot carry.
        node.text = ''.join(
            ch for ch in page[0:100] if ch >= ' ' or ch in '\t\n\r'
        )

    xml_string = xml.tostring(root, 'utf-8')
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent=' ')

    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)
def export_as_json(pdf_path) -> dict:
    """Collect the parsed pages of a PDF into a dictionary and return it.

    Every page yielded by ``extract_text_by_page(pdf_path)`` is passed
    through ``parse_page``. The returned dictionary contains:

    * ``'filename'`` - base name of ``pdf_path`` without its extension;
    * ``'number'`` - result of ``get_doc_number`` for the last page that
      satisfied ``get_doc_content_correct``, or ``None`` if no page did;
    * ``'pages'`` - one ``{'page': <1-based index>, 'content': ...}`` entry
      per page;
    * ``'errors'`` - present only when no page satisfied
      ``get_doc_content_correct``; a list with one fixed message.

    Despite the name, nothing is written to disk; the JSON dump is disabled
    (see the note before the ``return``).

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source. In particular, confirm that the ``doc_number`` assignment really
    belongs inside the ``if get_doc_content_correct(...)`` branch.
    """
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    data = {'filename': filename, 'number': None}
    data['pages'] = []
    doc_content_correct = False
    doc_number = None
    counter = 1
    for page in extract_text_by_page(pdf_path):
        parsed_page = parse_page(page)
        if get_doc_content_correct(parsed_page):
            doc_content_correct = True
            doc_number = get_doc_number(parsed_page)
        content = parsed_page
        # From the fourth page on the page is split at a fixed marker string,
        # so 'content' becomes the list returned by split() rather than the
        # parsed page itself (presumably parsed_page is a str - confirm
        # against parse_page).
        if counter > 3:
            content = parsed_page.split('Взам. Инв. № 3434')
            # Disabled in the original: keep only the part after the marker
            # when it exists (content = content[1]).
        d = {
            'page': counter,
            'content': content,
        }
        data['pages'].append(d)
        counter += 1
    data['number'] = doc_number
    if doc_content_correct == False:
        # Disabled in the original: a debug print reporting that the
        # "Contents" page is missing or wrong.
        # The message below is a fixed Russian text; roughly: "Volume
        # contents; sheet 4. Title block, field (1): the code must match the
        # code on the title page, with a capital letter C appended after a
        # hyphen" (translation to be confirmed).
        data['errors'] = [
            "Содержание тома; Лист: 4. Основная надпись: графа (1) - шифр должен соответствовать шифру, указанному на тит. листе, и добавляется через дефис прописная буква С"
        ]
    # Disabled in the original: dumping 'data' to a JSON file opened as UTF-8
    # with ensure_ascii=False. The dictionary is returned instead.
    return data
def export_as_xml(pdf_path, xml_path):
    """Let the user pick a PDF and write a preview of its pages as XML.

    The PDF is always chosen interactively with ``easygui.fileopenbox()``;
    the ``pdf_path`` argument is overwritten by that choice and is kept only
    for interface compatibility. If the dialog is cancelled the function
    returns without writing anything (previously it crashed in
    ``os.path.basename``).

    The root element is named after the base name of the chosen file
    without its extension and contains one ``Pages`` element with one child
    per page yielded by ``extract_text_by_page`` (``Page_1``, ``Page_2``,
    ...), each holding the first 100 characters of that page. The tree is
    serialised, re-parsed with ``minidom`` and written with ``toprettyxml``.

    Two further fixes over the previous implementation:

    * C0 control characters other than tab, LF and CR (for example a form
      feed) are removed from the preview. They are not allowed in XML 1.0
      and made ``minidom.parseString`` raise.
    * The file is written as UTF-8, which is what an XML document without
      an encoding declaration is read as; the locale encoding was used
      before.

    The base name is used verbatim as the root tag, so it must be a valid
    XML element name (no spaces, no leading digit); otherwise re-parsing
    raises.

    Args:
        pdf_path: Ignored; replaced by the file chosen in the dialog.
        xml_path: Path of the XML file to create or overwrite.
    """
    arquivo = easygui.fileopenbox()
    if not arquivo:
        # Dialog cancelled: there is no file to convert.
        return
    pdf_path = arquivo

    # NOTE(review): the original also computed an output name
    # "<base name>_Corrigido.xml" but never used it; the result still goes
    # to xml_path. Confirm which destination is intended.
    filename = os.path.splitext(os.path.basename(pdf_path))[0]

    root = xml.Element(filename)
    pages = xml.SubElement(root, 'Pages')

    for counter, page in enumerate(extract_text_by_page(pdf_path), start=1):
        node = xml.SubElement(pages, 'Page_{}'.format(counter))
        # Slice first so the preview window stays the same, then drop the
        # characters XML cannot carry.
        node.text = ''.join(
            ch for ch in page[0:100] if ch >= ' ' or ch in '\t\n\r'
        )

    xml_string = xml.tostring(root, 'utf-8')
    pretty_string = minidom.parseString(xml_string).toprettyxml(indent=' ')

    with open(xml_path, 'w', encoding='utf-8') as fh:
        fh.write(pretty_string)
def extract_text(pdf_path):
    """Print every page of the PDF to standard output.

    Each page yielded by ``extract_text_by_page(pdf_path)`` is printed and
    followed by one empty line.

    Args:
        pdf_path: Path of the PDF, handed to ``extract_text_by_page``.
    """
    for page_text in extract_text_by_page(pdf_path):
        # One newline ends the page, the second produces the blank line.
        print(page_text, end='\n\n')