예제 #1
0
    def __prepare_display(self):
        """Turn the page's collected shapes and text into a sorted display list.

        Reads ``self.ltRects``, ``self.curves`` and ``self.textLines`` and works
        in stages:

        1. Rectangles more than 8 units thick become single-cell frames; the
           rest are treated as lines and clustered. Clusters of four or more
           lines are tried as a ``pdftable.Table``; everything else is turned
           into curves.
        2. Text lines contained in a frame/table are stored in its cells.
        3. Leftover text following an "...exceptions" or "fpu flags affected"
           heading is grouped into ``SingleCellTable`` objects.
        4. A 1x1 table contained in a larger table is appended into it, and
           both are recorded as figures.
        5. Curves are attached to the top-level figure that contains them.
        6. Bulleted lines are separated from the remaining text.

        Returns:
            A list holding the merged leftover text, the top-level tables and
            the top-level figures, sorted with ``sort_topdown_ltr``.
        """
        frames = []
        lines = []
        for rect in self.ltRects:
            is_thick = ((rect.horizontal() and rect.height() > 8)
                        or (rect.vertical() and rect.width() > 8))
            if is_thick:
                table = SingleCellTable([])
                table.rect = rect
                frames.append(table)
            else:
                lines.append(rect)

        # NOTE(review): this loop only terminates if cluster_rects() removes
        # the rectangles it returns from `lines` -- confirm against pdftable.
        orphans = []
        while lines:
            cluster = pdftable.cluster_rects(lines)
            if len(cluster) >= 4:
                try:
                    frames.append(pdftable.Table(cluster))
                    continue
                except Exception:
                    # Best effort: a cluster that cannot form a table falls
                    # through and is kept as loose lines. Only ordinary errors
                    # are swallowed so Ctrl-C / SystemExit still propagate.
                    pass
            orphans += cluster

        curves = sorted(self.curves +
                        [pdftable.Curve(o.points()) for o in orphans],
                        key=cmp_to_key(sort_topdown_ltr))
        textLines = sorted(self.textLines, key=cmp_to_key(sort_topdown_ltr))

        # explicit tables
        tables = []
        for table in frames:
            orphans = []
            bounds = table.bounds()
            for i, line in enumerate(textLines):
                if bounds.contains(line.bounds()):
                    # Some pages have their "NOTES" section embedded inside the
                    # table rectangle; from that heading on, everything is
                    # treated as text outside the table.
                    if (line.font_name() == "NeoSansIntelMedium"
                            and str(line).lower().startswith("notes")):
                        orphans += textLines[i:]
                        break
                    table.get_at_pixel(line.rect.xmid(),
                                       line.rect.ymid()).append(line)
                else:
                    orphans.append(line)
            # Only text not claimed by this table is offered to the next one.
            textLines = orphans
            tables.append(table)

        # exception tables
        orphans = []
        table_data = []
        is_table_section = False
        expected_format = None
        for line in textLines:
            if line.font_name() == "NeoSansIntelMedium":
                # Heading line: always kept as plain text, and it closes any
                # table that was being collected.
                orphans.append(line)
                title = str(line).strip().lower()
                if title[-10:] == "exceptions":
                    is_table_section = True
                    expected_format = exceptions_format__
                elif title == "fpu flags affected":
                    is_table_section = True
                    expected_format = fpu_flags_format__
                if is_table_section and table_data:
                    tables.append(SingleCellTable(table_data))
                    table_data = []
                continue

            if is_table_section:
                if line.bounds().x1() > 50:
                    # Lines whose x1 exceeds 50 are accepted as table data
                    # without a format check (presumably indented
                    # continuation text -- confirm).
                    table_data.append(line)
                elif expected_format.search(str(line)) is None:
                    orphans.append(line)
                    if table_data:
                        tables.append(SingleCellTable(table_data))
                        table_data = []
                else:
                    table_data.append(line)
            else:
                orphans.append(line)

        if table_data:
            tables.append(SingleCellTable(table_data))

        # tables versus figures (versus useless frames)
        # Sorted by area so that only later entries can contain earlier ones.
        all_tables = sorted(tables, key=lambda x: x.bounds().area())
        tables = set()
        figures = set()
        sublevel_figures = set()
        for i, smaller in enumerate(all_tables):
            if smaller.rows() != 1 or smaller.columns() != 1:
                tables.add(smaller)
                continue

            smaller_bounds = smaller.bounds()
            for bigger in all_tables[i + 1:]:
                if bigger.bounds().contains(smaller_bounds):
                    bigger.get_at_pixel(
                        smaller_bounds.xmid(),
                        smaller_bounds.ymid()).append(smaller)
                    figures.add(bigger)
                    figures.add(smaller)
                    sublevel_figures.add(smaller)
                    break
            else:
                # No larger table encloses this single cell.
                tables.add(smaller)

        top_figures = [Figure(t) for t in figures - sublevel_figures]
        top_tables = list(tables - figures)

        # Each curve goes to the first figure that contains it; curves that
        # match no figure are not part of the result.
        for figure in top_figures:
            remaining_curves = []
            for curve in curves:
                if figure.bounds().contains(curve.bounds()):
                    figure.data.get_at(0, 0).append(curve)
                else:
                    remaining_curves.append(curve)
            curves = remaining_curves

        # Keep tables holding more than one item; a table with exactly one
        # item gives the contents of cell (0, 0) back to the loose text, and an
        # empty one is dropped.
        kept_tables = []
        for table in top_tables:
            count = table.item_count()
            if count > 1:
                kept_tables.append(table)
            elif count == 1:
                orphans += table.get_at(0, 0)
        top_tables = kept_tables

        # lists
        textLines = self.__merge_text(orphans)
        orphans = []
        lists = []
        this_list = []
        i = 0
        while i < len(textLines):
            line = textLines[i]
            if len(line.chars) > 0 and line.chars[0].get_text() == "•":
                if len(line.chars) == 1:
                    # The bullet stands alone; the item text is the next line.
                    i += 1
                    if i >= len(textLines):
                        # A trailing bullet has no item text to attach.
                        break
                    line = textLines[i]
                else:
                    # Strip the bullet and the whitespace that follows it.
                    for j in range(1, len(line.chars)):
                        if not line.chars[j].get_text().isspace():
                            break
                    line.chars = line.chars[j:]
                this_list.append(line)
            else:
                if this_list:
                    lists.append(pdftable.List(this_list))
                    this_list = []
                orphans.append(line)
            i += 1

        # NOTE(review): `lists` (and any items still pending in `this_list`)
        # are never added to the result, so bulleted lines are removed from the
        # output instead of being displayed as lists. Left unchanged because
        # returning pdftable.List objects would alter what callers receive --
        # confirm whether they should be included.
        displayable = self.__merge_text(orphans) + top_tables + top_figures
        displayable.sort(key=cmp_to_key(sort_topdown_ltr))
        return displayable
예제 #2
0
 def process_curve(self, curve):
     """Record a curve after passing each of its points through ``__fix_point``.

     A new ``pdftable.Curve`` is built from the adjusted points of
     ``curve.pts`` and appended to ``self.curves``; nothing is returned.
     """
     fixed_points = [self.__fix_point(pt) for pt in curve.pts]
     fixed_curve = pdftable.Curve(fixed_points)
     self.curves.append(fixed_curve)