예제 #1
0
    def setUp(self):
        """Parse this test case's fixture PDF and keep the results on ``self``.

        Sets ``pdfannots.COLUMNS_PER_PAGE`` from ``self.columns_per_page``,
        opens ``tests/<self.filename>`` (relative to this file's directory)
        in binary mode, and stores the pair returned by
        ``pdfannots.process_file`` as ``self.annots`` and ``self.outlines``.
        """
        pdfannots.COLUMNS_PER_PAGE = self.columns_per_page

        fixture = pathlib.Path(__file__).parent.joinpath('tests', self.filename)
        with fixture.open('rb') as stream:
            self.annots, self.outlines = pdfannots.process_file(stream)
예제 #2
0
def get_annots(p: Path) -> List[Annotation]:
    """Extract the annotations of the PDF at ``p``.

    The file is opened in binary mode and passed to
    ``pdfannots.process_file`` with ``emit_progress_to=None``. Every raw
    annotation yielded by the returned document's ``iter_annots()`` is
    converted with ``_as_annotation``. The extraction time is logged at
    debug level; the timing text is upper-cased when extraction took more
    than five seconds.

    Args:
        p: Path of the PDF file to read.

    Returns:
        One converted annotation per raw annotation, in iteration order.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-extraction.
    started = time.perf_counter()
    with p.open('rb') as fo:
        doc = pdfannots.process_file(fo, emit_progress_to=None)
        # Per the original author's note, the document also carries outlines
        # (roughly a table of contents); they are not needed here.
        annots = list(doc.iter_annots())
    took = time.perf_counter() - started
    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        # Make slow extractions stand out in the log.
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
    return [_as_annotation(raw=a, path=str(p)) for a in annots]
예제 #3
0
def get_annots(p: Path) -> List[Annotation]:
    """Extract the annotations of the PDF at ``p``.

    The file is opened in binary mode and passed to
    ``pdfannots.process_file`` with ``emit_progress=False``. While that call
    runs, ``sys.stderr`` is redirected into a throwaway in-memory buffer, so
    anything written to stderr during extraction is discarded. Each raw
    annotation is then converted with ``as_annotation``. The extraction time
    is logged at debug level; the timing text is upper-cased when extraction
    took more than five seconds.

    Args:
        p: Path of the PDF file to read.

    Returns:
        One converted annotation per raw annotation, in the order returned
        by ``process_file``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system clock is adjusted mid-extraction.
    started = time.perf_counter()
    with p.open('rb') as fo:
        # FIXME (carried over from the original; presumably about the stderr
        # output being silently dropped here -- confirm with the author).
        with redirect_stderr(io.StringIO()):
            # The second element holds the outlines (roughly a table of
            # contents); they are not needed here.
            annots, _outlines = pdfannots.process_file(fo, emit_progress=False)
    took = time.perf_counter() - started
    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        # Make slow extractions stand out in the log.
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
    return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
예제 #4
0
from pdfannots import process_file, PrettyPrinter
from colr import color as term_color
from collections import Counter

# Sample document to process; uncomment one of the alternatives to switch.
input_path = r"tests\hotos17.pdf"
# input_path = r"tests\issue9.pdf"
# input_path = r"tests\issue13.pdf"
# input_path = r"tests\pr24.pdf"

# Use a context manager so the PDF handle is closed as soon as extraction
# finishes instead of staying open for the lifetime of the script.
with open(input_path, 'rb') as pdf_file:
    annots, outlines = process_file(pdf_file, emit_progress=True)

pp = PrettyPrinter(outlines, wrapcol=None, condense=True)
data = pp.return_all(annots)

# One hashable key per item: the tag name followed by the elements of its
# selection colour (presumably the RGB components -- they are passed as the
# terminal background colour below).
all_ct = [tuple([item.tagname] + item.selection_colour) for item in data]

# Number each distinct (tagname, colour...) key, most frequent first, and
# print a colour-coded legend line for it.
classes = {}
for index, (class_n, cnt) in enumerate(Counter(all_ct).most_common()):
    rgb = class_n[1:]  # everything after the tag name
    classes[class_n] = index
    print("class:", index, "count:", cnt,
          term_color(class_n, fore=(0, 0, 0), back=rgb))
print()

for d in data:
    key = tuple([d.tagname] + d.selection_colour)
    to_print = f"class:     :  {classes[key]}" \
           f"\ntext       :{d.text}" \
           f"\ncomment    :{d.comment}" \