示例#1
0
def extract_ime(pdf_path):
    """
    Given a pdf path, extract the IME for each page.

    A page is handed to ``assess_ime`` only when its output file
    (``get_ime_path(pdf_path, pid)``) does not exist yet, so results already
    on disk are reused.  Depending on ``ext_settings.ME_ANALYSIS_PARALLEL``
    the pending pages are processed one after another in this process, or
    each in its own ``multiprocessing.Process`` (all started, then joined).

    :param pdf_path: path of the PDF file to process
    :return: None
    """
    # IME [1]
    page_count = get_page_num(pdf_path)

    if not ext_settings.ME_ANALYSIS_PARALLEL:
        for pid in range(page_count):
            out_path = get_ime_path(pdf_path, pid)
            if not os.path.isfile(out_path):
                assess_ime(pdf_path, pid, out_path)
        return

    from multiprocessing import Process
    workers = []
    for pid in range(page_count):
        out_path = get_ime_path(pdf_path, pid)
        if os.path.isfile(out_path):
            # same cache rule as the serial branch: never redo a finished page
            continue
        worker = Process(target=assess_ime, args=(pdf_path, pid, out_path))
        worker.start()
        workers.append(worker)

    # wait for every started page before returning
    for worker in workers:
        worker.join()
示例#2
0
    def __init__(self, pdf_path, pid):
        """
        Load the layout lines of one page and assign a type to each line.

        :param pdf_path: path of the PDF file
        :type pdf_path: str
        :param pid: page index; must be smaller than the page count
        :type pid: int
        :raises Exception: when ``pid`` is not below ``get_page_num(pdf_path)``
        """
        if pid >= get_page_num(pdf_path):
            raise Exception("page size exceed")

        self.pdf_path = pdf_path
        self.pid = pid

        # LayoutLine list, one entry per item returned by internal_get_llines
        self.llines = [
            LayoutLine(chars, pid)
            for chars in internal_get_llines(None, pdf_path, pid)
        ]

        ime_boxes = load_ime_bbox_list(pdf_path, pid)
        for layout_line in self.llines:
            # layout_line is a LayoutLine
            hits_ime = bbox_half_overlap_list(layout_line.get_bbox(),
                                              ime_boxes)
            if hits_ime:
                layout_line.type = LINE_IME

            # TODO, check heading
            # TODO, customized for each conference
            # Checked last, so a section-title match replaces LINE_IME.
            if layout_line.check_line_section_title():
                layout_line.type = LINE_TYPE_SECTION
示例#3
0
    def __init__(self, pdf_path):
        """
        Run the extraction steps for a PDF and build its section structure.

        Steps, in order: font statistics (``stage4_font_stat``), ME
        extraction (``extract_me``), one ``LayoutPage`` per page, then
        ``create_plain_section`` and ``_create_section_hierarchy``.

        :param pdf_path: file path for the PDF file
        """
        # raw data
        self.pdf_path = pdf_path

        pn = get_page_num(pdf_path)
        # Single-argument print call: valid on Python 2 and 3.  The two
        # spaces reproduce the text the former print statement emitted.
        print("# of Pages:  {}".format(pn))

        # batch processing here
        # * font
        self.font = None  # the font is not that useful for now.

        # * stage4_font_stat
        stage4_font_stat(pdf_path)
        debug_info("DONE font stat for ME extraction")

        # * batch ime & eme
        # Not using the parallel processing as
        # there might be a bug here causing the failure of cache checking
        extract_me(pdf_path)
        debug_info("DONE ime & eme extraction")

        # raw layout pages
        self.pages = [LayoutPage(pdf_path, pid) for pid in range(pn)]

        # processed data
        self.sections = []  # plain sections
        self.root_sections = []  # section roots once the hierarchy is built
        self.create_plain_section()
        self._create_section_hierarchy()
示例#4
0
def extraction_done(pdf_path):
    """
    Tell whether the per-page EME and IME result files all exist.

    For every page the EME file is checked first, then the IME file.  The
    first missing file is reported on stdout and ends the check.

    :param pdf_path: path of the PDF file
    :return: True when every page has both files, otherwise False
    """
    pn = get_page_num(pdf_path)
    xml_path = get_xml_path(pdf_path)
    for pid in range(pn):
        # (expected file, message printed when it is missing)
        required = (
            ("{}.eme.{}.xml".format(xml_path, pid), "EME {} not exist"),
            ("{}.ime.{}.xml".format(xml_path, pid), "IME {} not exist"),
        )
        for result_path, message in required:
            if not os.path.isfile(result_path):
                print(message.format(result_path))
                return False
    return True
示例#5
0
def get_glyph_ratio(pdf_path, pid):
    """
    Disabled entry point: always raises before doing any work.

    The statements after the ``raise`` are unreachable and are kept only as
    a reference for the former loader.  That loader made sure a
    ``<tmp pdf path>.glyphratio.<page>.txt`` file exists for every page
    (calling ``export_glyph_ratio`` when one is missing) and then parsed the
    file of page ``pid``.  Each non-empty line is tab separated: font name,
    glyph name, up ratio, lower ratio.

    TODO, what the return should be like?

    :param pdf_path: path of the PDF file
    :param pid: page index whose glyph-ratio file would be read
    :return: never returns; the retained code would return
        fontname -> glyphname -> (up_ratio, lower_ratio)
    :raises Exception: always, with the message "Should not call it"
    """
    raise Exception("Should not call it")

    # ---- unreachable reference implementation below ----
    import shutil
    from pdfxml.path_util import get_tmp_path
    from pdfxml.pdf_util.pdf_extract import get_page_num

    tmp_pdf_path = get_tmp_path(pdf_path)
    if tmp_pdf_path != pdf_path and not os.path.isfile(tmp_pdf_path):
        shutil.copy(pdf_path, tmp_pdf_path)

    pn = get_page_num(pdf_path)
    gr_pattern = "{}.glyphratio.{}.txt"

    # export again as soon as one page file is missing
    all_created = all(
        os.path.isfile(gr_pattern.format(tmp_pdf_path, i)) for i in range(pn))
    if not all_created:
        export_glyph_ratio(pdf_path)

    # read the file of the requested page only
    fontname2glyphname2adjust = {}
    with open(gr_pattern.format(tmp_pdf_path, pid)) as gr_file:
        for line in gr_file:
            line = line.strip()
            if line == "":
                continue
            ws = line.split("\t")
            fontname, glyphname = ws[0], ws[1]
            up_ratio, lower_ratio = float(ws[2]), float(ws[3])
            glyph2adjust = fontname2glyphname2adjust.setdefault(fontname, {})
            glyph2adjust[glyphname] = (up_ratio, lower_ratio)

    return fontname2glyphname2adjust
示例#6
0
def extract_eme(pdf_path):
    """
    Run the EME pipeline for every page that has no exported result yet.

    Pages are visited in order.  The value returned by
    ``EME_font_stat_pipeline`` for one processed page is passed as
    ``prev_page_info`` to the next processed page; a page whose export file
    already exists is skipped and leaves that value unchanged.

    :param pdf_path: path of the PDF file
    :return: None
    """
    carried_info = {}
    for pid in range(get_page_num(pdf_path)):
        export_path = get_eme_path(pdf_path, pid)
        if os.path.isfile(export_path):
            # result already on disk, nothing to do for this page
            continue
        carried_info = EME_font_stat_pipeline(
            pdf_path,
            pid,
            eme_export_path=export_path,
            prev_page_info=carried_info)
示例#7
0
def get_char_dist_est(pdf_path):
    """
    Gather the distance samples used to estimate the parameters of the
    brand new word segmenter.

    Every line returned by ``process_pdf_lines`` for every page is passed
    to ``get_char_dist_est_line``; the two lists it returns are
    concatenated across the whole document.

    :param pdf_path: path of the PDF file
    :return: tuple ``(within_dist_list, between_dist_list)``
    """
    within_dist_list, between_dist_list = [], []
    for pid in range(get_page_num(pdf_path)):
        for line in process_pdf_lines(pdf_path, pid):
            # per line: samples for the "within" list and the "between" list
            within, between = get_char_dist_est_line(line)
            within_dist_list += within
            between_dist_list += between
    return within_dist_list, between_dist_list
示例#8
0
def get_ignore_region(pdf_path):
    """
    Collect, per page, the bounding boxes of the lines to be ignored.

    Two groups of lines are registered:

    * every line located before the abstract heading, provided such a
      heading is found before the scan reaches a page with
      ``pid > pn / 2``;
    * every non-heading line that follows an abstract or reference heading,
      up to the next heading of any other kind.

    :param pdf_path: path of the PDF file
    :return: pid2intervaltree, a dict mapping a page id to the
        ``IntervalTree2D`` holding the ignored boxes of that page; pages
        without ignored lines have no entry
    """
    from pdfxml.me_extraction.me_font_stat_stage4 import internal_get_llines
    pn = get_page_num(pdf_path)

    pid2it2d = {}

    def _ignore(info):
        # register one line's bbox, creating the page tree on first use
        page = info['pid']
        if page not in pid2it2d:
            pid2it2d[page] = IntervalTree2D()
        pid2it2d[page].add_bbox_only(char_list2bbox(info['char_list']))

    # flatten the document into one ordered list of line records
    all_lines = []
    for pid in range(pn):
        for chars in internal_get_llines(None, pdf_path, pid):
            text = char_list2str(chars)
            all_lines.append({
                'char_list': chars,
                'line_str': text,
                'pid': pid,
                'is_heading': is_heading_line_by_str(text)
            })

    # first process the abstract part: locate its heading
    abstract_idx = None
    for idx, info in enumerate(all_lines):
        if info['is_heading'] and is_abstract_head(info['line_str']):
            abstract_idx = idx
            break
        if info['pid'] > pn / 2:
            # should not be at the second half of the document.
            break

    if abstract_idx is not None:
        for info in all_lines[:abstract_idx]:
            _ignore(info)

    # ignore the abstract and the reference bodies
    ignoring = False
    for info in all_lines:
        if not info['is_heading']:
            if ignoring:
                _ignore(info)
            continue
        # a heading switches the mode on (abstract/reference) or off (other)
        abs_head = is_abstract_head(info['line_str'])
        ref_head = is_reference_head(info['line_str'])
        ignoring = bool(abs_head or ref_head)
    return pid2it2d
示例#9
0
def extract_me(pdf_path):
    """
    Run the complete ME extraction for a PDF.

    Steps, in order:

    1. copy the PDF to ``get_tmp_path(pdf_path)`` when that file is missing;
    2. return early when ``extraction_done(pdf_path)`` reports that all
       results exist;
    3. ``export_exact_position`` once, then the column-line-word (CLW) step
       for every page, either with one process per page
       (``ext_settings.ME_ANALYSIS_PARALLEL``) or serially with the
       implementation selected by ``ext_settings.CLW_VERSION``;
    4. ``extract_ime`` followed by ``extract_eme``.

    ``duration_recorder.begin_timer`` marks the start of each phase.

    :param pdf_path: path of the PDF file
    :return: None
    :raises Exception: in serial mode when ``ext_settings.CLW_VERSION`` is
        neither ``CLW_OLD`` nor ``CLW_FEB``
    """
    # TODO, place it outside
    duration_recorder.begin_timer("Begin ME Extraction")

    # load the setting here.
    tmp_pdf_path = get_tmp_path(pdf_path)
    if not os.path.isfile(tmp_pdf_path):
        shutil.copy(pdf_path, tmp_pdf_path)

    if extraction_done(pdf_path):
        # single-argument print call: valid on Python 2 and 3
        print("ME extraction done for {}".format(pdf_path))
        return

    # batch extraction of lines
    pn = get_page_num(pdf_path)

    duration_recorder.begin_timer("Column-Line-Word")
    # This part should not be parallelized, only execute once
    export_exact_position(pdf_path)

    if ext_settings.ME_ANALYSIS_PARALLEL:
        print("parallized CLW threading")
        from multiprocessing import Process
        from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines

        # NOTE(review): this branch always uses clw_pdf_lines and does not
        # look at ext_settings.CLW_VERSION -- confirm this is intended.
        workers = []
        for pid in range(pn):
            worker = Process(target=clw_pdf_lines, args=(pdf_path, pid))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    else:
        print("serialized CLW")
        if ext_settings.CLW_VERSION == CLW_OLD:
            from pdfxml.pdf_util.ppc_line_reunion import ppc_line_reunion
            process_page = ppc_line_reunion
        elif ext_settings.CLW_VERSION == CLW_FEB:
            from pdfxml.pdf_util.clw_pipeline import clw_pdf_lines
            process_page = clw_pdf_lines
        else:
            raise Exception("unknown version")
        for pid in range(pn):
            process_page(pdf_path, pid)

    duration_recorder.begin_timer("IME Extraction")
    extract_ime(pdf_path)
    duration_recorder.begin_timer("EME Extraction")
    extract_eme(pdf_path)
    duration_recorder.begin_timer("ME extraction finished")