def extract_ime(pdf_path):  # IME [1]
    """
    Given a pdf path, extract the IME for each page.

    A page is only processed when its IME output file (as returned by
    ``get_ime_path``) does not exist yet. This holds for both the serial
    and the parallel mode, so a re-run never recomputes finished pages.

    :param pdf_path: path of the PDF file
    :return: None; ``assess_ime`` is called with
        ``(pdf_path, pid, output_path)`` for every page still missing
    """
    pn = get_page_num(pdf_path)

    if not ext_settings.ME_ANALYSIS_PARALLEL:
        for pid in range(pn):
            # prepare the path
            xml_out_path = get_ime_path(pdf_path, pid)
            if not os.path.isfile(xml_out_path):
                assess_ime(pdf_path, pid, xml_out_path)
        return

    from multiprocessing import Process

    # one worker process per page that still needs its output
    process_list = []
    for pid in range(pn):
        xml_out_path = get_ime_path(pdf_path, pid)
        if os.path.isfile(xml_out_path):
            # already extracted earlier, same cache rule as the serial path
            continue
        p = Process(target=assess_ime,
                    args=(pdf_path, pid, xml_out_path))
        process_list.append(p)
        p.start()

    # wait for every started worker before returning
    for p in process_list:
        p.join()
def __init__(self, pdf_path, pid):
    """
    Load the layout lines of one page and assign a type to each line.

    :param pdf_path: path of the PDF file
    :type pdf_path: str
    :param pid: page index, must be smaller than the number of pages
    :type pid: int
    :raises Exception: when ``pid`` is not smaller than the page count
    """
    if pid >= get_page_num(pdf_path):
        raise Exception("page size exceed")

    self.pdf_path = pdf_path
    self.pid = pid

    # LayoutLine list, one entry per character list of the page
    self.llines = []
    self.llines.extend(
        LayoutLine(chars, pid)
        for chars in internal_get_llines(None, pdf_path, pid))

    ime_boxes = load_ime_bbox_list(pdf_path, pid)
    for layout_line in self.llines:
        # layout_line is a LayoutLine
        if bbox_half_overlap_list(layout_line.get_bbox(), ime_boxes):
            layout_line.type = LINE_IME

        # TODO, check heading
        # TODO, customized for each conference
        # This check is independent of the one above, so a line that
        # looks like a section title ends up as LINE_TYPE_SECTION even
        # if it was just marked LINE_IME.
        if layout_line.check_line_section_title():
            layout_line.type = LINE_TYPE_SECTION
def __init__(self, pdf_path): """ :param pdf_path: file path for the PDF file """ # raw data self.pdf_path = pdf_path pn = get_page_num(pdf_path) print "# of Pages: ", pn # batch processing here # * font self.font = None # the font is not that useful for now. # * stage4_font_stat stage4_font_stat(pdf_path) debug_info("DONE font stat for ME extraction") # * batch ime & eme # Not using the parallel processing as # there might be a bug here causing the failure of cache checking extract_me(pdf_path) debug_info("DONE ime & eme extraction") # raw layout pages self.pages = [LayoutPage(pdf_path, pid) for pid in range(pn)] # processed data self.sections = [] # plain sections self.root_sections = [ ] # the root of sections after the hierarchy is built self.create_plain_section() self._create_section_hierarchy()
def extraction_done(pdf_path):
    """
    Tell whether the EME and IME output files exist for every page.

    For each page the EME file is checked first, then the IME file. The
    first missing file is reported with ``print`` and ends the check.

    :param pdf_path: path of the PDF file
    :return: True when all per-page files exist, False otherwise
    """
    page_count = get_page_num(pdf_path)
    xml_path = get_xml_path(pdf_path)

    for pid in range(page_count):
        # (message template, expected file), in the order they are checked
        expected = (
            ("EME {} not exist", "{}.eme.{}.xml".format(xml_path, pid)),
            ("IME {} not exist", "{}.ime.{}.xml".format(xml_path, pid)),
        )
        for message, path in expected:
            if not os.path.isfile(path):
                print(message.format(path))
                return False

    return True
def get_glyph_ratio(pdf_path, pid):
    """
    Disabled entry point: it raises before doing any work.

    The former implementation is kept below the guard for reference only
    and is never executed. It read the tab-separated file
    ``<tmp_pdf_path>.glyphratio.<pid>.txt`` (columns: fontname, glyphname,
    up ratio, lower ratio), exporting the files first when any page's
    file was missing.

    :param pdf_path: path of the PDF file
    :param pid: page index whose glyph-ratio file would be read
    :return: never returns; the legacy code built a dict
        fontname -> glyphname -> (up_ratio, lower_ratio)
    :raises Exception: always, with the message "Should not call it"
    """
    raise Exception("Should not call it")

    # ------------------------------------------------------------------
    # Unreachable legacy implementation (kept for reference).
    # ------------------------------------------------------------------
    import shutil
    from pdfxml.path_util import get_tmp_path
    from pdfxml.pdf_util.pdf_extract import get_page_num

    # work on the temporary copy of the PDF, creating it when needed
    tmp_pdf_path = get_tmp_path(pdf_path)
    if tmp_pdf_path != pdf_path and not os.path.isfile(tmp_pdf_path):
        shutil.copy(pdf_path, tmp_pdf_path)

    # export the glyph ratios unless every page already has its file
    pn = get_page_num(pdf_path)
    all_create = all(
        os.path.isfile("{}.glyphratio.{}.txt".format(tmp_pdf_path, i))
        for i in range(pn))
    if not all_create:
        export_glyph_ratio(pdf_path)

    # read the file of the requested page
    fontname2glyphname2adjust = {}
    gr_path = "{}.glyphratio.{}.txt".format(tmp_pdf_path, pid)
    with open(gr_path) as gr_file:  # closed even if a row fails to parse
        for line in gr_file:
            line = line.strip()
            if line == "":
                continue
            ws = line.split("\t")
            fontname, glyphname = ws[0], ws[1]
            up_ratio, lower_ratio = float(ws[2]), float(ws[3])
            glyph2adjust = fontname2glyphname2adjust.setdefault(fontname, {})
            glyph2adjust[glyphname] = (up_ratio, lower_ratio)
    return fontname2glyphname2adjust
def extract_eme(pdf_path):
    """
    Run the EME pipeline on every page whose export file is missing.

    The value returned by ``EME_font_stat_pipeline`` for one page is
    passed to the next processed page as ``prev_page_info``. Pages whose
    export file already exists are skipped and leave that value untouched.

    :param pdf_path: path of the PDF file
    :return: None
    """
    page_count = get_page_num(pdf_path)
    carried_info = {}

    for pid in range(page_count):
        eme_export_path = get_eme_path(pdf_path, pid)
        if os.path.isfile(eme_export_path):
            # already exported
            continue
        carried_info = EME_font_stat_pipeline(
            pdf_path,
            pid,
            eme_export_path=eme_export_path,
            prev_page_info=carried_info)
def get_char_dist_est(pdf_path):
    """
    Collect the distance samples used to estimate the parameters of the
    brand new word segmenter.

    Every line of every page is passed to ``get_char_dist_est_line`` and
    the two lists it returns are concatenated over the whole document.

    :param pdf_path: path of the PDF file
    :return: tuple ``(within_dist_list, between_dist_list)``
        (presumably within-word and between-word distances; confirm
        against ``get_char_dist_est_line``)
    """
    within_all = []
    between_all = []

    for pid in range(get_page_num(pdf_path)):
        # for each line, group into list of words
        for line in process_pdf_lines(pdf_path, pid):
            within_part, between_part = get_char_dist_est_line(line)
            within_all.extend(within_part)
            between_all.extend(between_part)

    return within_all, between_all
def get_ignore_region(pdf_path):
    """
    Collect, per page, the bounding boxes of the lines to be ignored.

    Ignored lines are:

    * every line before the abstract heading, when such a heading is
      found before the scan reaches the second half of the document;
    * every non-heading line that follows an abstract or reference
      heading, up to the next heading of another kind.

    :param pdf_path: path of the PDF file
    :return: dict mapping pid to an IntervalTree2D with the ignored boxes;
        pages without ignored lines have no entry
    """
    from pdfxml.me_extraction.me_font_stat_stage4 import internal_get_llines

    pn = get_page_num(pdf_path)
    pid2it2d = {}

    def mark_ignored(info):
        # add the bounding box of one line to the tree of its page
        page = info['pid']
        if page not in pid2it2d:
            pid2it2d[page] = IntervalTree2D()
        pid2it2d[page].add_bbox_only(char_list2bbox(info['char_list']))

    # flatten the document into one list of line records
    all_lines = []
    for pid in range(pn):
        for chars in internal_get_llines(None, pdf_path, pid):
            text = char_list2str(chars)
            heading = is_heading_line_by_str(text)
            all_lines.append({
                'char_list': chars,
                'line_str': text,
                'pid': pid,
                'is_heading': heading
            })

    # first process the abstract part: locate its heading
    abstract_idx = None
    for idx, info in enumerate(all_lines):
        if info['is_heading'] and is_abstract_head(info['line_str']):
            abstract_idx = idx
            break
        if info['pid'] > pn / 2:
            # should not be at the second half of the document.
            break

    # everything in front of the abstract heading is ignored
    if abstract_idx is not None:
        for info in all_lines[:abstract_idx]:
            mark_ignored(info)

    # ignore the body of the abstract and of the reference section
    ignoring = False
    for info in all_lines:
        if info['is_heading']:
            abs_head = is_abstract_head(info['line_str'])
            ref_head = is_reference_head(info['line_str'])
            # a heading switches ignoring on or off; it is never added
            ignoring = bool(abs_head or ref_head)
        elif ignoring:
            mark_ignored(info)

    return pid2it2d
def extract_me(pdf_path): pdf_name = get_file_name_prefix(pdf_path) # TODO, place it outside duration_recorder.begin_timer("Begin ME Extraction") # load the setting here. tmp_pdf_path = get_tmp_path(pdf_path) if not os.path.isfile(tmp_pdf_path): shutil.copy(pdf_path, tmp_pdf_path) if extraction_done(pdf_path): print "ME extraction done for {}".format(pdf_path) return #if ext_settings.debug: # convert2image(pdf_path) # batch extraction of lines pn = get_page_num(pdf_path) if ext_settings.debug: # the font is not useful in later stage #get_font_from_pdf(pdf_path, 0) # just do it once, other wise, the parallel error? pass duration_recorder.begin_timer("Column-Line-Word") # This part should not be parallelized, only execute once export_exact_position(pdf_path) if ext_settings.ME_ANALYSIS_PARALLEL: print "parallized CLW threading" from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines #Parallel(n_jobs=PARALLEL_SIZE)( # delayed(clw_pdf_lines)(pdf_path, pid) for pid in range(pn)) from multiprocessing import Process process_list = [] for pid in range(pn): p = Process(target=clw_pdf_lines, args=(pdf_path, pid)) p.start() process_list.append(p) for p in process_list: p.join() else: print "serialized CLW" if ext_settings.CLW_VERSION == CLW_OLD: from pdfxml.pdf_util.ppc_line_reunion import ppc_line_reunion for pid in range(pn): ppc_line_reunion(pdf_path, pid) elif ext_settings.CLW_VERSION == CLW_FEB: from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines for pid in range(pn): clw_pdf_lines(pdf_path, pid) else: raise Exception("unknown version") duration_recorder.begin_timer("IME Extraction") extract_ime(pdf_path) duration_recorder.begin_timer("EME Extraction") extract_eme(pdf_path) duration_recorder.begin_timer("ME extraction finished")