def process_pdf_internal(fname, page_num='all'): """ Change from orignal name of process_pdf to process_pdf_internal get the raw character :param fname: :param page_num: :return: """ tmp_path = get_tmp_path(fname) cache_path = "%s.chars.%s.pkl"%(tmp_path, str(page_num)) if os.path.isfile(cache_path): try: return pickle.load(open(cache_path)) except Exception as e: print "load failed, get again" # global char_list char_list = [] if debug: print fname # Open a PDF file. fp = open(fname, 'rb') parser = PDFParser(fp) document = PDFDocument(parser) if not document.is_extractable: raise PDFTextExtractionNotAllowed rsrcmgr = PDFResourceManager() device = PDFDevice(rsrcmgr) interpreter = PDFPageInterpreter(rsrcmgr, device) # Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for i, page in enumerate(PDFPage.create_pages(document)): process_mark = (page_num == 'all' or page_num == i) if process_mark: interpreter.process_page(page) layout = device.get_result() print_layout(layout, char_list) if page_num == i: break crop_bbox = get_pdf_page_bbox_abandon(fname, page_num) for char in char_list: if isinstance(char, LTChar): adjust_element_bbox(char, crop_bbox) with open(cache_path, 'w') as f: pickle.dump(char_list, f) return char_list
def export_glyph_ratio(pdf_path):
    """
    Export per-glyph ratio files for *pdf_path* via an external pdfbox JAR.

    Disabled: always raises. The removed (previously unreachable) body
    shelled out with ``os.system`` to a hard-coded developer-machine jar
    ("E:/pdfbox-2.0.8-src/.../pdfGlyphAdjust-jar-with-dependencies.jar"),
    so calling it could never work outside that machine; it remains in
    version control if needed.

    :param pdf_path: path to the PDF file
    :raises Exception: always, with message "should not call it"
    """
    raise Exception("should not call it")
def get_glyph_ratio(pdf_path, pid):
    """
    Return fontname -> glyphname -> (up_ratio, lower_ratio) for one page.

    Disabled: always raises. The removed (previously unreachable) body
    copied the PDF to the tmp location, invoked export_glyph_ratio() when
    any per-page "<tmp>.glyphratio.<pid>.txt" file was missing, and parsed
    those tab-separated files into the nested dict; it remains in version
    control if needed.

    :param pdf_path: path to the PDF file
    :param pid: 0-based page index
    :raises Exception: always, with message "Should not call it"
    """
    raise Exception("Should not call it")
def get_page_num(fpath):
    """
    Get the page count of a PDF file, with a JSON cache next to the tmp copy.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python

    :param fpath: path to the PDF file
    :return: number of pages (int)
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']
    # Open a PDF file.
    fp = open(fpath, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # The /Pages tree node carries the total page count.
        c = resolve1(document.catalog['Pages'])['Count']
    finally:
        # NOTE(fix): the original leaked the file handle.
        fp.close()
    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)
    return c
def get_font_from_pdf(pdf_path, pid):
    """
    Return per-page font details exported via pdfbox.

    Currently disabled: always returns None.

    :param pdf_path: path to the PDF file
    :param pid: 0-based page index
    :return: None (disabled); the dead code below would return
        page2name2detail[pid] built from the exported ".font" file
    """
    # NOTE(review): the early return below makes everything after it
    # unreachable dead code — kept as-is pending confirmation it can go.
    return None
    tmp_path = get_tmp_path(pdf_path)
    font_path = "{}.font".format(tmp_path)
    if not os.path.isfile(font_path):
        export_font(pdf_path, font_path)
    page2name2detail = read_pdfbox_font(font_path)
    return page2name2detail[pid]


#############
# using PDFBox to read all char information
#############
#def export_font(pdf_path, export_path):
"""
def process_pdf_lines(fname, page_num='all', do_adjust=False):
    """
    Extract the text lines of a PDF as lists of layout chars, with caching.

    :param fname: file path to the PDF file
    :param page_num: 'all' to extract every page (default), or a 0-based
        page index
    :param do_adjust: if True, adjust char bboxes by the per-glyph ratio
        before the crop-box adjustment
    :return: list of lines, each line a list of LTChar/LTAnno
    :rtype: list(list(LTChar))
    """
    # TODO, cache the information here?
    from pdfxml.path_util import get_tmp_path
    tmp_pdf_path = get_tmp_path(fname)
    pdf_lines_cache = "{}.pdf_line.{}.pkl".format(tmp_pdf_path, page_num)
    if os.path.isfile(pdf_lines_cache):
        return load_serialization(pdf_lines_cache)

    line_list = []
    char_list = []

    def print_layout(l):
        """Recursively collect chars; flush char_list into line_list at each text line."""
        for e in l:
            if isinstance(e, LTTextLineHorizontal):
                print_layout(e)
                # Snapshot the accumulated chars as one line, then reset
                # char_list in place (the closure must keep the same object).
                line_list.append(copy.copy(char_list))
                while len(char_list) > 0:
                    char_list.pop()
            if isinstance(e, LTTextBoxHorizontal):
                print_layout(e)
            if isinstance(e, LTChar) or isinstance(e, LTAnno):
                char_list.append(e)

    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object (replaces the plain PDFDevice).
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(document)):
            process_mark = (page_num == 'all' or page_num == i)
            if process_mark:
                interpreter.process_page(page)
                layout = device.get_result()
                print_layout(layout)
            if page_num == i:
                break
    finally:
        # NOTE(fix): the original never closed the PDF file handle.
        fp.close()

    if do_adjust:
        for line in line_list:
            adjust_basedon_glyph_ratio(line, fname, page_num)

    # adjust based on crop bbox
    crop_bbox = get_pdf_page_bbox_abandon(fname, page_num)
    for line in line_list:
        for char in line:
            if isinstance(char, LTChar):
                adjust_element_bbox(char, crop_bbox)

    dump_serialization(line_list, pdf_lines_cache)
    return line_list
def pdf_extract_lines(pdf_path, pid=0, force_single=False):
    """
    Extract merged text lines for one page; each line is a list of LTChar.

    :param pdf_path: path to the PDF file
    :param pid: 0-based page index
    :param force_single: if True, skip the double-column split even when
        is_double_column() says the page has two columns
    :return: list of merged lines (list of lists of LTChar), also cached
        to "<tmp>.pdfbox_merge_line.<pid>.pkl"
    """
    tmp_pdf_path = get_tmp_path(pdf_path)
    cache_path = "{}.pdfbox_merge_line.{}.pkl".format(tmp_pdf_path, pid)
    if os.path.isfile(cache_path):
        return load_serialization(cache_path)
    char_list_list = pdf_extract_lines_raw(pdf_path, pid)
    # TODO, do another round of line merging
    # still use our column line detection model to find the region.
    fontname2space = pdf_extract_fontname2space(pdf_path, pid)
    word_info_list = pdf_extract_words(pdf_path, pid)
    res_char_list_list = []
    if not force_single and is_double_column(pdf_path, pid):
        # Split the raw lines into three buckets relative to the page center:
        # fully-left lines, fully-right lines, and everything else (lines
        # spanning the center or outside the double-column region).
        page_size = get_pdf_page_size(pdf_path, pid)
        page_width = page_size['width']
        out_char_list_list = []
        left_char_list_list = []
        right_char_list_list = []
        from pdfxml.pdf_util.layout_util import get_char_list_bbox
        for char_list in char_list_list:
            bbox = get_char_list_bbox(char_list)
            if bbox.left() < bbox.right() < page_width / 2:
                # Entire bbox left of the vertical center line.
                left_char_list_list.append(char_list)
            elif bbox.right() > bbox.left() > page_width / 2:
                # Entire bbox right of the vertical center line.
                right_char_list_list.append(char_list)
            else:
                out_char_list_list.append(char_list)
        # Before merging, filter word info against the raw lines.
        word_info_list = word_info_filter(char_list_list, word_info_list)
        # Merge each bucket independently so lines never merge across columns.
        new_out_char_list_list = merging_lines(
            out_char_list_list, fontname2space, word_info_list, pdf_path, pid)
        new_left_char_list_list = merging_lines(
            left_char_list_list, fontname2space, word_info_list, pdf_path, pid)
        new_right_char_list_list = merging_lines(
            right_char_list_list, fontname2space, word_info_list, pdf_path, pid)
        # Recombine: outside-column lines first, then left column, then right.
        char_list_list = []
        char_list_list.extend(new_out_char_list_list)
        char_list_list.extend(new_left_char_list_list)
        char_list_list.extend(new_right_char_list_list)
        res_char_list_list = char_list_list
    else:
        # Before merging, filter word info against the raw lines.
        word_info_list = word_info_filter(char_list_list, word_info_list)
        # Single column: merge all lines in one pass.
        new_char_list_list = merging_lines(
            char_list_list, fontname2space, word_info_list, pdf_path, pid)
        res_char_list_list = new_char_list_list
    dump_serialization(res_char_list_list, cache_path)
    return res_char_list_list
def extract_me(pdf_path): pdf_name = get_file_name_prefix(pdf_path) # TODO, place it outside duration_recorder.begin_timer("Begin ME Extraction") # load the setting here. tmp_pdf_path = get_tmp_path(pdf_path) if not os.path.isfile(tmp_pdf_path): shutil.copy(pdf_path, tmp_pdf_path) if extraction_done(pdf_path): print "ME extraction done for {}".format(pdf_path) return #if ext_settings.debug: # convert2image(pdf_path) # batch extraction of lines pn = get_page_num(pdf_path) if ext_settings.debug: # the font is not useful in later stage #get_font_from_pdf(pdf_path, 0) # just do it once, other wise, the parallel error? pass duration_recorder.begin_timer("Column-Line-Word") # This part should not be parallelized, only execute once export_exact_position(pdf_path) if ext_settings.ME_ANALYSIS_PARALLEL: print "parallized CLW threading" from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines #Parallel(n_jobs=PARALLEL_SIZE)( # delayed(clw_pdf_lines)(pdf_path, pid) for pid in range(pn)) from multiprocessing import Process process_list = [] for pid in range(pn): p = Process(target=clw_pdf_lines, args=(pdf_path, pid)) p.start() process_list.append(p) for p in process_list: p.join() else: print "serialized CLW" if ext_settings.CLW_VERSION == CLW_OLD: from pdfxml.pdf_util.ppc_line_reunion import ppc_line_reunion for pid in range(pn): ppc_line_reunion(pdf_path, pid) elif ext_settings.CLW_VERSION == CLW_FEB: from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines for pid in range(pn): clw_pdf_lines(pdf_path, pid) else: raise Exception("unknown version") duration_recorder.begin_timer("IME Extraction") extract_ime(pdf_path) duration_recorder.begin_timer("EME Extraction") extract_eme(pdf_path) duration_recorder.begin_timer("ME extraction finished")
def assess_ime(pdf_path, pid=0, xml_out_path=None, ignore_exist=False):
    """
    Assess which lines of a page are inline math expressions (IME).

    A line is labeled IME ([3]) when it contains a math symbol or math word
    and no non-math dictionary word.

    :param pdf_path: path to the PDF file
    :param pid: 0-based page index
    :param xml_out_path: if given, export line boundaries to this XML file;
        when the file already exists (and ignore_exist is False) return {}
    :param ignore_exist: force re-export even if xml_out_path exists
    :return: dict of stage timings ('resource_time', 'layout_time',
        'core_time', 'io_time')
    """
    tmp_path = get_tmp_path(pdf_path)
    ret_info_dict = {}
    if xml_out_path and os.path.isfile(xml_out_path) and (not ignore_exist):
        return {}
    from pdfxml.me_extraction.me_consts import math_words
    t = time.time()
    # common resource loader: English word list + lemmatizer (NLTK)
    wl = set(words.words())
    wl.update(additional_words)
    wnl = WordNetLemmatizer()
    d = time.time() - t
    ret_info_dict['resource_time'] = d
    t = time.time()
    # layout analysis
    font = get_font_from_pdf(pdf_path, pid)
    #font = None
    # prefix = file name without directory and ".pdf" extension
    prefix = pdf_path[pdf_path.rindex("/") + 1:-4]
    lines = internal_get_llines(prefix, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['layout_time'] = d
    # IME assessment core
    t = time.time()
    line_labels = [0] * len(lines)
    for li, line in enumerate(lines):
        line_label = 0
        beg_idx = 0  # start index of the word currently being accumulated
        with_math_symbol_or_word = False
        with_non_math_word = False
        for i, char in enumerate(line):
            if isinstance(char, LTChar):
                if check_is_math_LTChar(char, font):
                    me_extraction_logger.debug("Char {} as Math".format(char))
                    with_math_symbol_or_word = True
            # NOTE(review): assumes space chars may be LTAnno as well, so
            # this check sits outside the LTChar branch — confirm against
            # the original (pre-collapse) indentation.
            if is_space_char(char):
                # Assemble the word accumulated since the last space.
                word = ""
                for j in range(beg_idx, i):
                    # Drop a trailing period/comma glyph from the word.
                    if j == i - 1 and line[j].get_text() in [
                            ',', '.', 'period', 'comma']:
                        continue
                    # for word checking, only work on the alpha beta
                    tmp_text = line[j].get_text()
                    if len(tmp_text) != 1:
                        tmp_text = " "
                    word += tmp_text
                beg_idx = i + 1
                word = word.lower().strip()
                # move to above, and use glyph name to match
                #if word.endswith(',') or word.endswith('.'):
                #    word = word[:-1]
                # print check word
                # Lemmatize as noun and as verb for dictionary lookup.
                s_word, v_word = "", ""
                try:
                    s_word = wnl.lemmatize(word, 'n')
                    v_word = word
                    v_word = wnl.lemmatize(word, 'v')
                except Exception as e:
                    me_extraction_error_logger.error(
                        "Error checking the word as noun or verb")
                if word in math_words:
                    me_extraction_logger.debug("Math Word {}".format(word))
                    with_math_symbol_or_word = True
                elif len(word) > 2 and (word in wl or s_word in wl or v_word in
                                        wl):
                    me_extraction_logger.debug("Plain Word {}".format(word))
                    with_non_math_word = True
                else:
                    pass
        # debug for line, with ME or not
        tmp_line_str = char_list2str(line, ', ')
        me_extraction_logger.debug(tmp_line_str)
        me_extraction_logger.debug("with math {}, with word {}".format(
            with_math_symbol_or_word, with_non_math_word))
        # Label as math line only if math present AND no plain word found.
        if with_math_symbol_or_word and (not with_non_math_word):
            me_extraction_logger.debug("MATHLINE")
            line_label = 1
        line_labels[li] = line_label
    d = time.time() - t
    ret_info_dict['core_time'] = d
    if not xml_out_path:
        # No export requested: print the detected math lines to stdout.
        for li, line in enumerate(lines):
            if line_labels[li]:
                tmp_str = ''.join([
                    char.get_text() for char in line
                    if isinstance(char, LTChar)
                ])
                print tmp_str.encode("utf-8")
    # export for evaluation
    page_info = {}
    page_info['pid'] = pid
    page_info['ilist'] = []
    page_info['elist'] = []
    # create bbox for each ME
    for li, line in enumerate(lines):
        if line_labels[li]:
            visible_char_list = [
                char for char in line if isinstance(char, LTChar)
            ]
            char_list2str(visible_char_list)  # NOTE(review): result unused
            page_info['ilist'].append(line)
    t = time.time()
    if xml_out_path:
        export_xml(page_info, xml_out_path, pdf_path, pid)
    d = time.time() - t
    ret_info_dict['io_time'] = d
    return ret_info_dict