def setUp(self):
    """Parse this test case's fixture PDF and keep the results on the instance.

    Reads ``self.columns_per_page`` and ``self.filename``. The former is
    written to the module-level ``pdfannots.COLUMNS_PER_PAGE`` before the
    fixture is parsed; the latter names a file inside the ``tests``
    directory that sits next to this source file.

    After this runs, ``self.annots`` and ``self.outlines`` hold the two
    values returned by ``pdfannots.process_file``.
    """
    # Module-level setting: must be in place before process_file() is called.
    pdfannots.COLUMNS_PER_PAGE = self.columns_per_page

    fixture = pathlib.Path(__file__).parent / 'tests' / self.filename
    with fixture.open('rb') as pdf_file:
        self.annots, self.outlines = pdfannots.process_file(pdf_file)
def get_annots(p: Path) -> List[Annotation]:
    """Extract every annotation from the PDF at ``p``.

    The file is parsed with ``pdfannots.process_file`` (called with
    ``emit_progress_to=None``) and each raw annotation is converted with
    ``_as_annotation``. The elapsed extraction time is logged at debug
    level; the timing text is upper-cased when it exceeds five seconds.

    Args:
        p: Path to the PDF file to read.

    Returns:
        One ``Annotation`` per raw annotation yielded by the parsed document.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    with p.open('rb') as fo:
        doc = pdfannots.process_file(fo, emit_progress_to=None)
        # The document also has outlines (roughly a table of contents);
        # they are not needed here.
        annots = list(doc.iter_annots())
    took = time.perf_counter() - started

    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        # Make slow extractions stand out in the log.
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))

    return [_as_annotation(raw=raw, path=str(p)) for raw in annots]
def get_annots(p: Path) -> List[Annotation]:
    """Extract every annotation from the PDF at ``p``.

    The file is parsed with ``pdfannots.process_file`` (called with
    ``emit_progress=False``) while stderr is redirected into an in-memory
    buffer. Each raw annotation is then converted with ``as_annotation``.
    The elapsed extraction time is logged at debug level; the timing text
    is upper-cased when it exceeds five seconds.

    Args:
        p: Path to the PDF file to read.

    Returns:
        One ``Annotation`` per raw annotation returned by the parser.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    with p.open('rb') as fo:
        captured_stderr = io.StringIO()
        # FIXME: anything written to stderr during parsing is captured in
        # this buffer and then discarded without being inspected.
        with redirect_stderr(captured_stderr):
            # Outlines are roughly a table of contents; not needed here.
            annots, _outlines = pdfannots.process_file(fo, emit_progress=False)
    took = time.perf_counter() - started

    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        # Make slow extractions stand out in the log.
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))

    return [as_annotation(raw_ann=raw, path=str(p)) for raw in annots]
from pdfannots import process_file, PrettyPrinter from colr import color as term_color from collections import Counter input_path = r"tests\hotos17.pdf" # input_path = r"tests\issue9.pdf" # input_path = r"tests\issue13.pdf" # input_path = r"tests\pr24.pdf" annots, outlines = process_file(open(input_path, 'rb'), emit_progress=True) pp = PrettyPrinter(outlines, wrapcol=None, condense=True) data = pp.return_all(annots) all_ct = list(tuple([item.tagname] + item.selection_colour) for item in data) classes = {} for index, data_ in enumerate(Counter(all_ct).most_common()): rgb = data_[0][1:] class_n = data_[0] cnt = data_[1] classes[class_n] = index print("class:", index, "count:", cnt, term_color(class_n, fore=(0, 0, 0), back=rgb)) print() for d in data: key = tuple([d.tagname] + d.selection_colour) to_print = f"class: : {classes[key]}" \ f"\ntext :{d.text}" \ f"\ncomment :{d.comment}" \