def create_annotation_for_annotator(self, anno_files: AnnotationFiles) -> None:
    """Write one annotator's token labels into the per-paper token tables.

    A column named after ``anno_files.annotator`` is first reset to ``None``
    in every table of ``self.all_page_token_data``. Then, for each annotation
    file, every annotation that has tokens and whose label text is listed in
    ``self.categories`` stamps that label onto the rows addressed by its
    ``(pageIndex, tokenIndex)`` pairs.

    Args:
        anno_files: Iterable of records providing ``"paper_sha"`` and
            ``"annotation_path"``; it also exposes the annotator name as
            ``anno_files.annotator``.
    """
    annotator = anno_files.annotator

    # Give every table a blank column for this annotator before filling it,
    # so papers without any matching annotation still carry the column.
    for table in self.all_page_token_data.values():
        table[annotator] = None

    for entry in tqdm(anno_files):
        # NOTE(review): the table is indexed through ``.loc`` with
        # (page index, token index) tuples -- presumably a pandas DataFrame
        # with a two-level index; confirm against where the tables are built.
        table = self.all_page_token_data[entry["paper_sha"]]
        annotations = load_json(entry["annotation_path"])["annotations"]

        for annotation in annotations:
            tokens = annotation["tokens"]
            if tokens is None:
                # Nothing to label when the annotation carries no tokens.
                continue

            label = annotation["label"]["text"]
            if label not in self.categories:
                # Only the categories configured on this object are recorded.
                continue

            row_keys = [
                (token["pageIndex"], token["tokenIndex"]) for token in tokens
            ]
            table.loc[row_keys, annotator] = label
def load_directory(self, pred_dir: str) -> Dict[str, Dict]:
    """Load every ``*.json`` prediction file found directly in *pred_dir*.

    Args:
        pred_dir: Directory searched (non-recursively) for ``*.json`` files.

    Returns:
        A dict mapping each file's base name, with its ``.json`` extension
        swapped for ``.pdf``, to whatever ``load_json`` returns for that
        file. The dict is empty when no ``*.json`` file matches.
    """
    pdf_preds = {}
    for pred_json in glob(f"{pred_dir}/*.json"):
        # Swap only the trailing extension; a plain ``str.replace`` would
        # also rewrite any ".json" occurring earlier in the file name.
        stem, _ = os.path.splitext(os.path.basename(pred_json))
        filename = f"{stem}.pdf"
        # Fill the local dict that is returned. Writing to ``self.pdf_preds``
        # here would leave the return value empty and would fail when the
        # attribute has not been assigned yet.
        pdf_preds[filename] = load_json(pred_json)
    return pdf_preds
def create_annotation_for_annotator(self, anno_files: AnnotationFiles) -> list:
    """Create the annotation records for the given annotation files.

    Each annotation whose label text has an id in ``self._name2catid`` is
    turned into one ``self.AnnoTemplate`` record and collected as a dict
    (via ``_asdict()``). Annotations with an unknown category are skipped.

    Args:
        anno_files: Iterable of records providing ``"paper_sha"`` and
            ``"annotation_path"``.

    Returns:
        A list of dicts, one per kept annotation, with the fields ``id``
        (sequential, starting at 0 for this call), ``bbox`` (``[x, y, w, h]``
        as returned by ``_convert_bounds_to_coco_bbox``), ``category_id``,
        ``image_id`` and ``area`` (``w * h``).
    """
    _annotations = []
    anno_id = 0

    pbar = tqdm(anno_files)
    for anno_file in pbar:
        paper_sha = anno_file["paper_sha"]
        pbar.set_description(f"Working on {paper_sha[:10]}...")

        pawls_annotations = load_json(
            anno_file["annotation_path"])["annotations"]

        for anno in pawls_annotations:
            page_id = anno["page"]
            category = anno["label"]["text"]

            # Skip if current category is not in the specified categories
            cat_id = self._name2catid.get(category)
            if cat_id is None:
                continue

            # Only the image id is needed from the image record here.
            image_data = self.get_image_data(paper_sha, page_id)
            x, y, w, h = _convert_bounds_to_coco_bbox(anno["bounds"])

            _annotations.append(
                self.AnnoTemplate(
                    id=anno_id,
                    bbox=[x, y, w, h],
                    category_id=cat_id,
                    image_id=image_data["id"],
                    area=w * h,
                )._asdict())
            anno_id += 1

    return _annotations
def __init__(self, pred_file: str) -> None:
    """Load predictions from a single JSON file or a directory of them.

    Args:
        pred_file: Path to a prediction JSON file (read with ``load_json``)
            or to a directory (read with ``self.load_directory``).

    Raises:
        FileNotFoundError: If ``pred_file`` is neither an existing file nor
            an existing directory.
    """
    if os.path.isfile(pred_file):
        self.pdf_preds = load_json(pred_file)
    elif os.path.isdir(pred_file):
        self.pdf_preds = self.load_directory(pred_file)
    else:
        # Fail fast: otherwise ``self.pdf_preds`` is never assigned and the
        # first later access raises an opaque AttributeError.
        raise FileNotFoundError(
            f"Prediction path is neither a file nor a directory: {pred_file!r}"
        )