def _new_word_stats ():
    """Return a zeroed word-statistics accumulator.

    Slots, in order: number of words, total length (in characters),
    number of bold words, number of italic words, number of bold-italic
    words, number of fixed-width words, total font sizes.
    """
    return [0, 0, 0, 0, 0, 0, 0.0]


def _tally_word (bbox, char_count, accumulators):
    """Add one word box to every accumulator in ``accumulators`` (in place).

    A word that is both bold and italic is counted in the bold slot, the
    italic slot and the bold-italic slot.
    """
    bold = bbox.is_bold()
    italic = bbox.is_italic()
    fixedwidth = bbox.is_fixedwidth()
    font_size = bbox.font_size()
    for acc in accumulators:
        acc[0] += 1
        acc[1] += char_count
        if bold:
            acc[2] += 1
        if italic:
            acc[3] += 1
        if bold and italic:
            acc[4] += 1
        if fixedwidth:
            acc[5] += 1
        acc[6] += font_size


def _format_word_stats (word_stats):
    """Render an accumulator as
    ``words:avg-chars:bold:italic:bold-italic:fixed-width:avg-font-size``.

    Both averages are reported as 0.0 when the accumulator holds no words.
    """
    nwords = word_stats[0]
    if nwords > 0:
        avg_length = float(word_stats[1]) / float(nwords)
        avg_font_size = float(word_stats[6]) / float(nwords)
    else:
        avg_length = 0.0
        avg_font_size = 0.0
    return "%d:%.3f:%d:%d:%d:%d:%.3f" % (
        nwords, avg_length, word_stats[2], word_stats[3],
        word_stats[4], word_stats[5], avg_font_size)


def _skip_contents_header (text_file):
    """Check the charset declared in contents.txt and position the file at
    the start of the text.

    Returns the byte offset at which the text starts.  Raises ValueError
    if the charset (declared, or "latin_1" when there is no charset line)
    is not one of UTF8_ALIASES.
    """
    firstline = text_file.readline()
    charsetmatch = CHARSETPATTERN.match(firstline)
    if charsetmatch:
        charsetname = charsetmatch.group(1)
        # the line after the charset line is discarded too; the text
        # offsets used below are relative to the position after it
        text_file.readline()
        first_byte = text_file.tell()
    else:
        charsetname = "latin_1"
        first_byte = 0
    if charsetname not in UTF8_ALIASES:
        raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes to be created. Apparently it's %s, instead." % charsetname)
    text_file.seek(first_byte)
    return first_byte


def _postag_for_offset (postags, postags_index, cindex):
    """Find the POS tag covering contents offset ``cindex``.

    Scans forward from ``postags_index`` past every tag that ends at or
    before ``cindex``.  Returns ``(tag, new_index)``; ``tag`` is None when
    no tag covers ``cindex``.
    """
    ntags = len(postags)
    # advance to first POS tag which might apply to cindex
    while ((postags_index < ntags) and
           (cindex >= (postags[postags_index].start + postags[postags_index].length))):
        postags_index += 1
    # there might be cindex positions for which we have no tags -- check for that
    if postags_index < ntags:
        candidate = postags[postags_index]
        if ((cindex >= candidate.start) and
            (cindex < (candidate.start + candidate.length))):
            return candidate, postags_index
    return None, postags_index


def _thumbnail_corners (bbox, translation, scaling):
    """Return ``(ulx, uly, lrx, lry)``: the box corners translated, scaled
    and rounded to the nearest integer."""
    # again, add back in the 20-pixel border on the page
    ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
    uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
    lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
    lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)
    return ulx, uly, lrx, lry


def do_page_bounding_boxes (dirpath):
    """Build per-page word bounding-box data for the document in ``dirpath``.

    For every page yielded by ``wordboxes_page_iterator(dirpath)``, each
    word box is paired with its POS tag (when "contents.ind" exists), is
    marked as a paragraph start where "paragraphs.txt" says so, and is
    mapped to thumbnail coordinates.  The page is then handed to
    ``flush_page`` together with the matching slice of "contents.txt".
    Word statistics, per page and for the whole document, are written to
    "metadata.txt" under "wordbbox-stats-pagewise" and
    "wordbbox-stats-docwise".

    Raises ValueError if "contents.txt" is not in a UTF-8 charset; errors
    from opening "wordbboxes" or "contents.txt" propagate.  All files
    opened here are closed even when an error occurs.
    """
    textfilepath = os.path.join(dirpath, "contents.txt")
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")

    # NOTE(review): this handle is never read in this function (the boxes
    # come from wordboxes_page_iterator); it is kept so that a missing
    # "wordbboxes" file still fails here, before any other work is done.
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    try:
        note ("doing page bboxes for %s...", dirpath)

        if os.path.exists(pos_filepath):
            fp = open(pos_filepath, 'r')
            try:
                postags = POSTag.parse_parseinfo(fp)
            finally:
                fp.close()
        else:
            postags = None

        bbox_iterator = wordboxes_page_iterator(dirpath)

        text_file = open(textfilepath, 'rb')
        try:
            first_byte = _skip_contents_header(text_file)

            paras = read_paragraphs_file(para_filepath)
            if paras:
                paras.sort(key=lambda x: x.first_byte)
                num_paras = len(paras)
            else:
                num_paras = 0
            # index of the next paragraph not yet matched to a word
            para_index = 0

            from createThumbnails import thumbnail_translation_and_scaling
            translation, scaling = thumbnail_translation_and_scaling (dirpath)
            note(4, " translation and scaling are %s and %s...", translation, scaling)

            last_cindex = 0
            postags_index = 0
            page_entries = []
            doc_stats = _new_word_stats()

            for page_index, bboxes in bbox_iterator:
                page_stats = _new_word_stats()
                adjusted_bboxes = []

                for bbox in bboxes:
                    char_count = bbox.nchars()
                    _tally_word(bbox, char_count, (doc_stats, page_stats))

                    cindex = bbox.contents_offset()
                    tag = None
                    if postags:
                        tag, postags_index = _postag_for_offset(postags, postags_index, cindex)

                    if para_index < num_paras:
                        para = paras[para_index]
                        if ((para.first_byte <= (cindex + char_count)) and
                            (para.first_byte_not >= cindex)):
                            # this word starts the paragraph
                            if tag is None:
                                tag = POSTag(cindex, char_count, None, "", True, False, False)
                            else:
                                # flags the tag object held in postags itself
                                tag.starts_paragraph = True
                            para_index += 1

                    ulx, uly, lrx, lry = _thumbnail_corners(bbox, translation, scaling)
                    adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
                    last_cindex = cindex

                if adjusted_bboxes:
                    first_box = adjusted_bboxes[0][0]
                    last_box = adjusted_bboxes[-1][0]
                    startpoint = first_box.contents_offset()
                    # over-reads past the last word: 4 bytes per character
                    # (presumably the UTF-8 worst case -- TODO confirm)
                    endpoint = last_box.contents_offset() + (last_box.nchars() * 4)
                    text_file.seek(startpoint + first_byte)
                    pagetext = text_file.read(endpoint - startpoint)
                    pagestart = startpoint
                else:
                    # page without words: no text, positioned at the last
                    # offset seen so far
                    pagetext = ""
                    pagestart = last_cindex

                flush_page (dirpath, page_index, adjusted_bboxes, pagetext, pagestart)
                page_entries.append(_format_word_stats(page_stats))
        finally:
            text_file.close()
    finally:
        wordbox_file.close()

    update_metadata(os.path.join(dirpath, "metadata.txt"),
                    { "wordbbox-stats-pagewise": ", ".join(page_entries),
                      "wordbbox-stats-docwise": _format_word_stats(doc_stats)})
def thumbnail_translation_and_scaling (self):
    """Return whatever createThumbnails.thumbnail_translation_and_scaling
    computes for this object's folder (``self.folder()``)."""
    # imported at call time rather than at module load, as in the original
    import createThumbnails
    compute = createThumbnails.thumbnail_translation_and_scaling
    return compute(self.folder())