def test_djvu_extract_leaf_texts() -> None:
    """Check that djvu_extract_leaf_texts returns text for exactly the
    requested leaves of a DjVu XML fixture.

    Fixture source: https://archive.org/details/ERIC_ED441501
    """
    # Decode the fixture explicitly as UTF-8 instead of relying on the
    # platform's locale-dependent default text encoding.
    with open("tests/files/ERIC_ED441501_djvu.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3, 6])

    # both requested leaves are present, and nothing else
    assert 3 in leaves
    assert 6 in leaves
    assert "2. Original cataloging tools" in leaves[3]
    assert len(leaves) == 2
Exemplo n.º 2
0
    def fetch_sim(
        self,
        issue_db_row: SimIssueRow,
        issue_db_pub_row: SimPubRow,
        pages: str,
        release_ident: str,
    ) -> Optional[Any]:
        """
        Fetch the text of the leaves covering a page range of one issue item.

        Looks up metadata for the issue item and for the publication
        collection via self.ia_client, maps the parsed page range onto leaf
        numbers using the issue's "page_numbers" metadata, downloads the
        issue's "<issue_item>_djvu.xml" file, and extracts the text of the
        matching leaves.

        Returns None when parse_pages() yields no first page, when the issue
        metadata has no "page_numbers" entry, or when no leaf has a numeric
        page number inside the requested range. Otherwise returns a dict:

        issue_item
        pages: str
        page_texts: list
            page_num
            leaf_num
            raw_text
        release_ident: Optional[str]
        pub_item_metadata
        issue_item_metadata

        Raises AssertionError if the DjVu XML download reports failure.
        """

        first_page, last_page = parse_pages(pages)
        if first_page is None:
            return None

        # fetch full metadata from API
        issue_meta = self.ia_client.get_metadata(issue_db_row.issue_item)
        pub_meta = self.ia_client.get_metadata(issue_db_pub_row.pub_collection)

        if "page_numbers" not in issue_meta:
            # TODO: warn
            return None

        # leaf_index: leaf number -> page number exactly as given in metadata
        # leaf_list: leaf numbers whose numeric page number is in range
        leaf_index: Dict[Any, Any] = {}
        leaf_list: List[Any] = []
        for entry in issue_meta["page_numbers"].get("pages", []):
            page_num = entry["pageNumber"]
            leaf_index[entry["leafNum"]] = page_num
            # skip leaves with an empty or non-numeric page number
            if not (page_num and page_num.isdigit()):
                continue
            if first_page <= int(page_num) <= last_page:
                leaf_list.append(entry["leafNum"])

        if not leaf_list:
            return None

        issue_item = self.ia_client.get_item(issue_db_row.issue_item)
        issue_item_djvu = issue_item.get_file(issue_db_row.issue_item +
                                              "_djvu.xml")

        # override 'close()' method so we can still read out contents
        # (presumably download() closes the file object it is handed)
        djvu_bytes = io.BytesIO()
        djvu_bytes.close = lambda: None  # type: ignore
        # The download must not live inside an `assert` statement: asserts
        # are stripped under `python -O`, which would silently skip the
        # download. AssertionError is raised explicitly so the exception type
        # seen by callers is unchanged.
        if not issue_item_djvu.download(fileobj=djvu_bytes):
            raise AssertionError(
                f"failed to download {issue_db_row.issue_item}_djvu.xml")
        djvu_bytes.seek(0)
        djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
        del djvu_bytes

        leaf_dict = djvu_extract_leaf_texts(djvu_xml, only_leaves=leaf_list)

        page_texts: List[Dict[str, Any]] = [
            dict(
                page_num=leaf_index.get(leaf_num),
                leaf_num=leaf_num,
                raw_text=raw_text,
            ) for leaf_num, raw_text in leaf_dict.items()
        ]

        return dict(
            issue_item=issue_db_row.issue_item,
            pages=pages,
            page_texts=page_texts,
            release_ident=release_ident,
            pub_item_metadata=truncate_pub_meta(pub_meta),
            issue_item_metadata=truncate_issue_meta(issue_meta),
        )
Exemplo n.º 3
0
    def fetch_sim_issue(self, issue_item: str,
                        pub_collection: str) -> Optional[Any]:
        """
        Fetch the text of every leaf of one issue item.

        Looks up metadata for the issue item and for the publication
        collection via self.ia_client, downloads the issue's
        "<issue_item>_djvu.xml" file, and extracts text for all leaves
        (no page-range filter is applied).

        Returns None, after printing a note to stderr, when the issue
        metadata has no "page_numbers" entry or when no leaf has a numeric
        page number. Otherwise returns a dict:

        issue_item
        pages: always None
        page_texts: list
            raw_text
            page_num
            leaf_num
        release_ident: always None
        pub_item_metadata
        issue_item_metadata

        Raises AssertionError if the DjVu XML download reports failure.
        """
        # fetch full metadata from API
        issue_meta = self.ia_client.get_metadata(issue_item)
        pub_meta = self.ia_client.get_metadata(pub_collection)

        if "page_numbers" not in issue_meta:
            print(f"issue without page_numbers: {issue_item}", file=sys.stderr)
            return None

        # leaf_index: leaf number -> page number exactly as given in metadata
        # leaf_list: leaf numbers carrying a numeric page number; only used
        # below to decide whether the issue is worth downloading at all
        leaf_index: Dict[Any, Any] = {}
        leaf_list: List[Any] = []
        for entry in issue_meta["page_numbers"].get("pages", []):
            page_num = entry["pageNumber"]
            leaf_index[entry["leafNum"]] = page_num
            if page_num and page_num.isdigit():
                leaf_list.append(entry["leafNum"])

        if not leaf_list:
            print(f"issue without leaf numbers: {issue_item}", file=sys.stderr)
            return None

        issue_item_obj = self.ia_client.get_item(issue_item)
        issue_item_djvu = issue_item_obj.get_file(issue_item + "_djvu.xml")

        # override 'close()' method so we can still read out contents
        # (presumably download() closes the file object it is handed)
        djvu_bytes = io.BytesIO()
        djvu_bytes.close = lambda: None  # type: ignore
        # The download must not live inside an `assert` statement: asserts
        # are stripped under `python -O`, which would silently skip the
        # download. AssertionError is raised explicitly so the exception type
        # seen by callers is unchanged.
        if not issue_item_djvu.download(fileobj=djvu_bytes):
            raise AssertionError(f"failed to download {issue_item}_djvu.xml")
        djvu_bytes.seek(0)
        djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
        del djvu_bytes

        leaf_dict = djvu_extract_leaf_texts(djvu_xml)

        page_texts: List[Dict[str, Any]] = [
            dict(
                page_num=leaf_index.get(leaf_num),
                leaf_num=leaf_num,
                raw_text=raw_text,
            ) for leaf_num, raw_text in leaf_dict.items()
        ]

        return dict(
            issue_item=issue_item,
            pages=None,
            page_texts=page_texts,
            release_ident=None,
            pub_item_metadata=truncate_pub_meta(pub_meta),
            issue_item_metadata=truncate_issue_meta(issue_meta),
        )