def __init__(self, item):
    """Wrap a layout ``item`` and cache its text, geometry and font info.

    ``item`` must expose ``get_text()``, ``yoffset``, ``x0``, ``x1``,
    ``y0``, ``height`` and ``width``, and must be iterable (its elements
    are scanned for character-level font information).  It must not
    already carry an ``attributes`` attribute.

    Attributes set here:
        text      -- stripped text with ligatures removed and everything
                     outside printable ASCII (0x20-0x7E) dropped, then
                     utf8-encoded.
        x0, x1, y0, y1, width, height, yoffset -- bounding-box geometry;
                     ``y1`` is recomputed as ``y0 + height``.
        style, attributes -- empty dicts.
        abstract  -- True when ``text`` starts with "abstract"
                     (case-insensitive).
        children  -- elements of ``item`` that have a ``fontname``.
        fontsize, fontname -- most frequent values among ``children``;
                     ``int(item.height)`` and ``'unknown'`` when there
                     are no such children.
    """
    assert not hasattr(item, 'attributes')
    self._item = item

    # Normalise the text: strip, expand ligatures, drop anything that is
    # not printable ASCII, then encode.
    stripped = unicode(item.get_text()).strip()
    printable = re.sub('[^\x20-\x7E]', '', remove_ligatures(stripped))
    self.text = printable.encode('utf8')

    # Bounding box.
    self.yoffset = item.yoffset
    self.x0 = item.x0
    self.x1 = item.x1
    self.y0 = item.y0
    # item.y1 is often unreliable, so derive the top edge from the height.
    self.y1 = item.y0 + item.height
    assert self.x0 <= self.x1 and self.y0 <= self.y1
    self.height = item.height
    self.width = item.width

    self.style = {}
    self.attributes = {}

    # No MULTILINE flag, so '^' anchors only at the very start of the text.
    self.abstract = re.search('^abstract', self.text, flags=re.I) is not None

    # Fallback font info, used when no child reports a font.
    self.fontsize = int(item.height)
    self.fontname = 'unknown'

    self.children = [child for child in item if hasattr(child, 'fontname')]
    if self.children:
        # Use the height of the character bbox as the font size.  This
        # might be better because it is invariant to font type (but it
        # can be worse if pdfminer reports it incorrectly).
        # Take the most frequent font size and font name.
        size_counts = Counter(int(child.height) for child in self.children)
        self.fontsize = size_counts.most_common()[0][0]
        name_counts = Counter(child.fontname for child in self.children)
        self.fontname = name_counts.most_common()[0][0]
def extract_plaintext(self):
    """Extract plain text from the cached file and store it.

    Files whose cached path ends in '.pdf' are converted with
    ``pdftotext`` (writing to ``<self.d>/data/pdftotext.txt`` and
    allowing a cached result).  Anything else is read with
    ``robust_read``, coerced to unicode and passed through
    ``htmltotext``.  Ligatures are then removed and the text is stored
    under 'data/text', overwriting any previous value.

    Returns whatever ``self.store`` returns.
    """
    # NOTE(review): the branch layout below assumes the HTML clean-up
    # applies only to non-pdf input while ligature removal applies to
    # both -- confirm against the intended behaviour.
    if not self.cached.endswith('.pdf'):
        raw = robust_read(self.cached)
        decoded = force_unicode(raw)
        text = htmltotext(decoded)  # clean up html
    else:
        # extract text from pdfs
        text = pdftotext(
            self.cached,
            output=self.d / 'data' / 'pdftotext.txt',
            verbose=True,
            usecached=True,
        )

    cleaned = remove_ligatures(text)
    return self.store('data/text', cleaned, overwrite=True)