Пример #1
0
    def create_annotation_for_annotator(self,
                                        anno_files: AnnotationFiles) -> None:
        """Write one annotator's labels into the per-paper token tables.

        A column named after ``anno_files.annotator`` is first reset to
        ``None`` in every table of ``self.all_page_token_data``. Each
        annotation file is then loaded and, for every annotation that has
        tokens and whose label text is in ``self.categories``, the label is
        stored in that column for the annotated tokens.

        Args:
            anno_files: Iterable of annotation-file entries exposing an
                ``annotator`` attribute; each entry provides the
                ``"paper_sha"`` and ``"annotation_path"`` keys.
        """
        annotator_name = anno_files.annotator

        # Start from an empty column in every table so that papers this
        # annotator did not touch still carry the column.
        for table in self.all_page_token_data.values():
            table[annotator_name] = None

        for entry in tqdm(anno_files):
            token_table = self.all_page_token_data[entry["paper_sha"]]
            records = load_json(entry["annotation_path"])["annotations"]

            for record in records:
                tokens = record["tokens"]
                if tokens is None:
                    continue

                # Only labels from the configured categories are recorded.
                category = record["label"]["text"]
                if category in self.categories:
                    # Rows are addressed by (page index, token index) pairs.
                    # NOTE(review): presumably the tables are pandas
                    # DataFrames indexed that way - confirm where they are
                    # built.
                    rows = [(tok["pageIndex"], tok["tokenIndex"])
                            for tok in tokens]
                    token_table.loc[rows, annotator_name] = category
Пример #2
0
    def load_directory(self, pred_dir: str) -> Dict[str, Dict]:
        """Load every ``*.json`` file found directly inside a directory.

        Args:
            pred_dir: Directory that is scanned for ``*.json`` files.

        Returns:
            A dict mapping each file's base name, with its ``.json``
            extension swapped for ``.pdf``, to whatever ``load_json``
            returns for that file. Empty when no file matches.
        """
        pdf_preds = {}

        for pred_json in glob(f"{pred_dir}/*.json"):
            # Swap only the trailing extension; str.replace would also
            # rewrite a ".json" that occurs in the middle of the name.
            stem, _ = os.path.splitext(os.path.basename(pred_json))

            # Collect into the local dict that is returned; writing to
            # self.pdf_preds here would leave the return value empty.
            pdf_preds[f"{stem}.pdf"] = load_json(pred_json)

        return pdf_preds
Пример #3
0
    def create_annotation_for_annotator(self,
                                        anno_files: AnnotationFiles) -> list:
        """Create the annotations for the given annotation files.

        Every annotation file is loaded and each annotation whose label text
        is known to ``self._name2catid`` is turned into one record built
        from ``self.AnnoTemplate``. Record ids are consecutive integers
        starting at 0, counted across all files.

        Args:
            anno_files: Iterable of annotation-file entries; each entry
                provides the ``"paper_sha"`` and ``"annotation_path"`` keys.

        Returns:
            A list with one ``AnnoTemplate(...)._asdict()`` result per kept
            annotation, holding ``id``, ``bbox`` (``[x, y, w, h]``),
            ``category_id``, ``image_id`` and ``area`` (``w * h``).
        """
        annotations = []
        pbar = tqdm(anno_files)
        for anno_file in pbar:

            paper_sha = anno_file["paper_sha"]

            pbar.set_description(f"Working on {paper_sha[:10]}...")
            pawls_annotations = load_json(
                anno_file["annotation_path"])["annotations"]

            for anno in pawls_annotations:
                page_id = anno["page"]
                category = anno["label"]["text"]

                # Skip if current category is not in the specified categories
                cat_id = self._name2catid.get(category)
                if cat_id is None:
                    continue

                image_data = self.get_image_data(paper_sha, page_id)
                x, y, w, h = _convert_bounds_to_coco_bbox(anno["bounds"])

                annotations.append(
                    self.AnnoTemplate(
                        # Only kept annotations are appended, so the current
                        # length is the next consecutive id.
                        id=len(annotations),
                        bbox=[x, y, w, h],
                        category_id=cat_id,
                        image_id=image_data["id"],
                        area=w * h,
                    )._asdict())

        return annotations
Пример #4
0
    def __init__(self, pred_file: str):
        """Populate ``self.pdf_preds`` from a prediction file or directory.

        Args:
            pred_file: Path to a single file, which is read with
                ``load_json``, or to a directory, which is handed to
                ``self.load_directory``.
        """

        # NOTE(review): in the lines visible here, a path that is neither an
        # existing file nor an existing directory leaves self.pdf_preds
        # unassigned - confirm whether that case is handled elsewhere.
        if os.path.isfile(pred_file):
            self.pdf_preds = load_json(pred_file)
        elif os.path.isdir(pred_file):
            # self.pdf_preds does not exist until this assignment completes,
            # so load_directory cannot rely on it while it runs.
            self.pdf_preds = self.load_directory(pred_file)