示例#1
0
def markup_pdf(filename):
    """
    Apply the learned model to a pdf and show the result in a web browser.

    Takes the first page of ``filename``, gives every item the model labels
    as ``'author'`` or ``'title'`` a coloured border (and echoes it to the
    terminal), renders the page through ``template`` into ``outfile`` and
    opens that file with ``webbrowser``.

    If the weights file cannot be read (``IOError``) a message is printed and
    the page is rendered without any predictions.

    Relies on names defined outside this function: ``load``, ``path``,
    ``pdfminer``, ``gs``, ``predict``, ``features``, ``template``,
    ``Context``, ``outdir``, ``outfile``, ``magenta`` and ``blue``.
    """
    import webbrowser

    try:
        w = load('weights.pkl~')
    except IOError:
        # Best effort: without weights we still render the page, unlabelled.
        print('failed to load file')
        w = None

    filename = path(filename)

    pdf = pdfminer(filename)

    # NOTE(review): presumably ghostscript writing page images into outdir
    # (the original docstring mentions creating an image) -- confirm.
    gs(filename, outdir)

    first_page = pdf.pages[0]
    pages = [first_page]

    if w is not None:
        # label -> (css colour for the html border, terminal colour format)
        palette = {'author': ('magenta', magenta), 'title': ('blue', blue)}
        for x in first_page.items:
            y = predict(w, {k: 1.0 for k in features(x)})
            if y == 'other':
                continue
            css_colour, term_colour = palette[y]
            x.style['border'] = '2px solid %s' % css_colour
            print('%s: %s' % (term_colour % y, x.text))

    # if we want to draw the first pages of many pdfs on one html document we
    # have to lie to the items -- tell them they are on pages other than the
    # first...
    yoffset = 0
    for p in pages:
        for item in p.items:
            if hasattr(item, 'yoffset'):
                item.yoffset += yoffset
        yoffset += p.height

    # open() instead of the Python-2-only file() builtin; same semantics.
    with open(outfile, 'wb') as f:
        template.render_context(Context(f, pages=pages))

    webbrowser.open(outfile)
示例#2
0
def data(verbose=True):
    """
    Iterate over the skid pdfs whose notes carry an author annotation.

    Walks every ``*.pdf`` returned by ``CACHE.glob`` and, for each one whose
    parsed notes have a non-empty ``'author'`` entry, yields the triple
    ``(meta, document, pdfminer(filename))``.

    When ``verbose`` is true a banner with the file location, the title and
    the authors is printed before each yielded triple.
    """
    for pdf_file in iterview(CACHE.glob('*.pdf')):
        doc = Document(pdf_file)
        notes = doc.parse_notes()

        # Only annotated documents are of interest.
        if not notes['author']:
            continue

        if verbose:
            location = ' file://' + pdf_file
            print('')
            # Underline of matching width first, then the location itself.
            for banner in ('_' * len(location), location):
                print(red % ('#' + banner))
            print('')
            label = yellow % 'meta'
            print(('%s: %s' % (label, notes['title'])).encode('utf8'))
            print(('%s: %s' % (label, ' ; '.join(notes['author']))).encode('utf8'))
            print('')

        yield (notes, doc, pdfminer(pdf_file))
示例#3
0
def data(verbose=True):
    """
    Get the skid pdfs which have authors annotated.

    Generator. For every ``*.pdf`` returned by ``CACHE.glob`` whose parsed
    notes have a non-empty ``'author'`` entry, yields ``(meta, document, pdf)``
    where ``pdf`` is the result of ``pdfminer(filename)``.

    Files for which ``pdfminer`` raises are skipped; the failure is reported
    through the ``logging`` module instead of being dropped silently.

    When ``verbose`` is true a banner with the file location, the title and
    the authors is printed for each annotated file.
    """
    import logging

    log = logging.getLogger(__name__)

    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if not meta['author']:
            continue

        if verbose:
            ff = ' file://' + filename
            print()
            print(colors.red % ('#' + '_' * len(ff)))
            print(colors.red % ('#' + ff))
            print()
            # NOTE(review): on Python 3 printing the encoded bytes shows a
            # b'...' repr; kept as-is because the target interpreter is not
            # visible from here -- confirm whether .encode is still wanted.
            print(('%s: %s' % (colors.yellow % 'meta', meta['title'])).encode('utf8'))
            print(('%s: %s' % (colors.yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8'))
            print()

        # Keep the try body to the one call that is expected to fail, so that
        # exceptions thrown into the generator at the yield are not swallowed.
        try:
            pdf = pdfminer(filename)
        except Exception as e:
            # Deliberate best effort: skip examples pdfminer cannot handle.
            log.warning('skipping %s: %s', filename, e)
            continue

        yield (meta, d, pdf)