def data(verbose=True):
    """
    Yield ``(meta, document, pdf)`` triples for cached pdfs with annotated
    authors.

    Iterates over every ``*.pdf`` matched by ``CACHE.glob``, parses the notes
    of the corresponding ``Document`` and keeps only the entries whose
    ``'author'`` metadata is truthy.

    verbose: when true, print a banner with the file URL followed by the
        annotated title and authors before each example is produced.

    Files for which ``pdfminer`` raises are skipped; the failure is reported
    through ``logging`` so that dropped examples are visible.
    """
    import logging

    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if not meta['author']:
            continue

        if verbose:
            ff = ' file://' + filename
            # Single-argument print(...) produces the same output whether
            # print is a statement or a function.
            print('')
            print(red % ('#' + '_' * len(ff)))
            print(red % ('#' + ff))
            print('')
            print(('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8'))
            print(('%s: %s' % (yellow % 'meta',
                               ' ; '.join(meta['author']))).encode('utf8'))
            print('')

        # Only the pdfminer call is guarded: keeping the yield outside the
        # try block means exceptions raised at the yield point are not
        # swallowed together with parser failures.
        try:
            pdf = pdfminer(filename)
        except Exception:
            # Best-effort: one unparsable pdf should not end the iteration,
            # but record which file was dropped and why.
            logging.getLogger(__name__).warning(
                'skipping %s: pdfminer raised an exception', filename,
                exc_info=True)
            continue

        yield (meta, d, pdf)
def markup_pdf(filename):
    """
    Apply the learned model to the first page of a pdf and display it.

    Every item on the first page that the model labels as something other
    than 'other' gets a colored border and is echoed to the terminal.  The
    page is then rendered through ``template`` into ``outfile`` and opened in
    a web browser.  When the weights file cannot be loaded the page is
    rendered without any markup.
    """
    try:
        weights = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        weights = None

    filename = path(filename)
    pdf = pdfminer(filename)
    gs(filename, outdir)
    pages = [pdf.pages[0]]

    if weights is not None:
        for item in pdf.pages[0].items:
            # Every feature returned for the item gets weight 1.0.
            label = predict(weights, dict.fromkeys(features(item), 1.0))
            if label == 'other':
                continue
            css_color = {'author': 'magenta', 'title': 'blue'}[label]
            item.style['border'] = '2px solid %s' % css_color
            term_color = {'author': magenta, 'title': blue}[label]
            print('%s: %s' % (term_color % label, item.text))

    # To draw the first pages of many pdfs on one html document the items
    # have to be told they sit on pages other than the first, so each page's
    # items are shifted by the accumulated height of the pages before it.
    shift = 0
    for page in pages:
        for item in page.items:
            if not hasattr(item, 'yoffset'):
                continue
            item.yoffset += shift
        shift += page.height

    with open(outfile, 'wb') as out:
        template.render_context(Context(out, pages=pages))

    import webbrowser
    webbrowser.open(out.name)
def markup_pdf(filename):
    """
    Mark up the first page of ``filename`` with the learned model's labels.

    Loads the weights from 'weights.pkl~' (continuing without markup if that
    fails with IOError), labels each item of the first page, draws a border
    around the ones not labelled 'other', writes the rendered page to
    ``outfile`` and opens that file in a web browser.
    """
    try:
        model = load('weights.pkl~')
    except IOError:
        print('failed to load file')
        model = None

    pages = []
    filename = path(filename)
    document = pdfminer(filename)
    gs(filename, outdir)
    pages.append(document.pages[0])

    if model is not None:
        for element in document.pages[0].items:
            active = {}
            for name in features(element):
                active[name] = 1.0
            prediction = predict(model, active)
            if prediction != 'other':
                border_color = {'author': 'magenta', 'title': 'blue'}[prediction]
                element.style['border'] = '2px solid %s' % border_color
                highlight = {'author': magenta, 'title': blue}[prediction]
                print('%s: %s' % (highlight % prediction, element.text))

    # When several first pages share one html document, items must pretend
    # to live on later pages: offset each page's items by the total height
    # of the pages that came before it.
    running_height = 0
    for page in pages:
        movable = (it for it in page.items if hasattr(it, 'yoffset'))
        for it in movable:
            it.yoffset += running_height
        running_height += page.height

    with open(outfile, 'wb') as handle:
        context = Context(handle, pages=pages)
        template.render_context(context)

    import webbrowser
    webbrowser.open(handle.name)
def data(verbose=True):
    """
    Generate ``(meta, document, pdf)`` triples for skid pdfs whose notes
    carry author annotations.

    verbose: when true, print a banner with the file URL and the annotated
        title and authors ahead of each yielded example.

    Exceptions raised by ``pdfminer`` are not handled here and propagate to
    the consumer.
    """
    for pdf_path in iterview(CACHE.glob('*.pdf')):
        doc = Document(pdf_path)
        notes = doc.parse_notes()

        # Entries without annotated authors are not examples.
        if not notes['author']:
            continue

        if verbose:
            url = ' file://' + pdf_path
            tag = yellow % 'meta'
            print('')
            print(red % ('#' + '_' * len(url)))
            print(red % ('#' + url))
            print('')
            title_line = '%s: %s' % (tag, notes['title'])
            print(title_line.encode('utf8'))
            author_line = '%s: %s' % (tag, ' ; '.join(notes['author']))
            print(author_line.encode('utf8'))
            print('')

        yield (notes, doc, pdfminer(pdf_path))
def data(verbose=True):
    """
    Yield ``(meta, document, pdf)`` triples for cached pdfs with annotated
    authors.

    Iterates over every ``*.pdf`` matched by ``CACHE.glob``, parses the notes
    of the corresponding ``Document`` and keeps only the entries whose
    ``'author'`` metadata is truthy.

    verbose: when true, print a banner with the file URL followed by the
        annotated title and authors before each example is produced.

    Files for which ``pdfminer`` raises are skipped; the failure is reported
    through ``logging`` so that dropped examples are visible.
    """
    import logging

    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if not meta['author']:
            continue

        if verbose:
            ff = ' file://' + filename
            print()
            print(colors.red % ('#' + '_' * len(ff)))
            print(colors.red % ('#' + ff))
            print()
            # Pass text straight to print(): handing it UTF-8 encoded bytes
            # makes print() show the b'...' repr (escape codes included)
            # instead of the title and authors themselves.
            print('%s: %s' % (colors.yellow % 'meta', meta['title']))
            print('%s: %s' % (colors.yellow % 'meta',
                              ' ; '.join(meta['author'])))
            print()

        # Only the pdfminer call is guarded: keeping the yield outside the
        # try block means exceptions raised at the yield point are not
        # swallowed together with parser failures.
        try:
            pdf = pdfminer(filename)
        except Exception:
            # Best-effort: one unparsable pdf should not end the iteration,
            # but record which file was dropped and why.
            logging.getLogger(__name__).warning(
                'skipping %s: pdfminer raised an exception', filename,
                exc_info=True)
            continue

        yield (meta, d, pdf)