def test_djvu_extract_leaf_texts() -> None: # https://archive.org/details/ERIC_ED441501 with open("tests/files/ERIC_ED441501_djvu.xml", "r") as f: blob = f.read() leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3, 6]) assert 3 in leaves assert 6 in leaves assert "2. Original cataloging tools" in leaves[3] assert len(leaves) == 2
def fetch_sim( self, issue_db_row: SimIssueRow, issue_db_pub_row: SimPubRow, pages: str, release_ident: str, ) -> Optional[Any]: """ issue_item pages: str page_texts: list page_num leaf_num raw_text release_ident: Optional[str] pub_item_metadata issue_item_metadata """ first_page, last_page = parse_pages(pages) if first_page is None: return None # fetch full metadata from API issue_meta = self.ia_client.get_metadata(issue_db_row.issue_item) pub_meta = self.ia_client.get_metadata(issue_db_pub_row.pub_collection) leaf_index = dict() leaf_list = [] if not "page_numbers" in issue_meta: # TODO: warn return None for entry in issue_meta["page_numbers"].get("pages", []): page_num = entry["pageNumber"] leaf_index[entry["leafNum"]] = page_num if not (page_num and page_num.isdigit()): continue page_num = int(page_num) if page_num >= first_page and page_num <= last_page: leaf_list.append(entry["leafNum"]) if not leaf_list: return None page_texts: List[Dict[str, Any]] = [] issue_item = self.ia_client.get_item(issue_db_row.issue_item) issue_item_djvu = issue_item.get_file(issue_db_row.issue_item + "_djvu.xml") # override 'close()' method so we can still read out contents djvu_bytes = io.BytesIO() djvu_bytes.close = lambda: None # type: ignore assert issue_item_djvu.download(fileobj=djvu_bytes) djvu_bytes.seek(0) djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8")) del djvu_bytes leaf_dict = djvu_extract_leaf_texts(djvu_xml, only_leaves=leaf_list) for leaf_num, raw_text in leaf_dict.items(): page_texts.append( dict( page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text, )) return dict( issue_item=issue_db_row.issue_item, pages=pages, page_texts=page_texts, release_ident=release_ident, pub_item_metadata=truncate_pub_meta(pub_meta), issue_item_metadata=truncate_issue_meta(issue_meta), )
def fetch_sim_issue(self, issue_item: str, pub_collection: str) -> Optional[Any]: """ issue_item pages: str page_texts: list raw_text page_num leaf_num release_ident: Optional[str] pub_item_metadata issue_item_metadata """ # fetch full metadata from API issue_meta = self.ia_client.get_metadata(issue_item) pub_meta = self.ia_client.get_metadata(pub_collection) leaf_index = dict() leaf_list = [] if "page_numbers" not in issue_meta: print(f"issue without page_numbers: {issue_item}", file=sys.stderr) return None for entry in issue_meta["page_numbers"].get("pages", []): page_num = entry["pageNumber"] leaf_index[entry["leafNum"]] = page_num if not (page_num and page_num.isdigit()): continue page_num = int(page_num) leaf_list.append(entry["leafNum"]) if not leaf_list: print(f"issue without leaf numbers: {issue_item}", file=sys.stderr) return None page_texts: List[Dict[str, Any]] = [] issue_item_obj = self.ia_client.get_item(issue_item) issue_item_djvu = issue_item_obj.get_file(issue_item + "_djvu.xml") # override 'close()' method so we can still read out contents djvu_bytes = io.BytesIO() djvu_bytes.close = lambda: None # type: ignore assert issue_item_djvu.download(fileobj=djvu_bytes) djvu_bytes.seek(0) djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8")) del djvu_bytes leaf_dict = djvu_extract_leaf_texts(djvu_xml) for leaf_num, raw_text in leaf_dict.items(): page_texts.append( dict( page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text, )) return dict( issue_item=issue_item, pages=None, page_texts=page_texts, release_ident=None, pub_item_metadata=truncate_pub_meta(pub_meta), issue_item_metadata=truncate_issue_meta(issue_meta), )