def dump():
    for f in config.CACHE.files():
        d = Document(f)
        m = d.parse_notes()
        if not m['author']:
            # skip skid marks without annotated authors.
            continue
        author = ' '.join(map(lastname, m['author']))
        title = remove_stopwords(m['title'])
        title = re.findall(r'\w+', title)
        year = m['year'][-2:]
        title = ' '.join(title)
        author = author.replace('-', ' ')
        title = title.replace('-', ' ')
        year = year.replace('-', ' ')
        key = '%s-%s-%s' % (author, year, title)
        key = key.lower()
        print key.encode('utf8')
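# dump() relies on two helpers defined elsewhere in skid. The sketch below is
# an assumption of what they might look like, not skid's actual code: the
# names `lastname` and `remove_stopwords` come from the call sites above, and
# the stopword list is illustrative only.
_STOPWORDS = {'a', 'an', 'the', 'of', 'on', 'for', 'and', 'in', 'to'}

def lastname(author):
    # assume names come as "First [Middle] Last" and keep the final token.
    return author.strip().split()[-1]

def remove_stopwords(title):
    return ' '.join(w for w in title.split() if w.lower() not in _STOPWORDS)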
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
def authors():

    def simplify(x):
        # simplify name: remove single initial, lowercase, convert to ascii
        return re.sub(r'\b[a-z]\.\s*', '',
                      x.strip().lower()).encode('ascii', 'ignore').decode('ascii')

    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    collisions = defaultdict(set)
    for filename in config.CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        A = d.meta['author']
        if A:
            docs.append(d)
            for x in A:
                ix[simplify(x)].append(d)
                collisions[simplify(x)].add(x)

    for a, ds in sorted(ix.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(colors.yellow % '%s (%s)' % (a, len(ds)))
        for d in ds:
            print(' ', d.meta['title'], colors.magenta % ('(file://%s)' % d.cached))
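# A quick sanity check of the normalization above (hypothetical inputs,
# assuming simplify were lifted to module scope): a bare surname and an
# initial-plus-surname collapse to the same key, which is exactly the kind of
# merge that collisions[] records.
assert simplify('J. Smith') == 'smith'
assert simplify('Smith') == 'smith'
assert simplify('José Smith') == 'jos smith'   # accents are dropped, not transliterated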
def todoc(d):
    if isinstance(d, Hit):
        doc = Document(d['cached'])
        doc.score = d.score
        doc.hit = d
        # very slow...
        #doc.highlights = re.sub('<b class="match.*?>([\w\W]+?)</b>',
        #                        r'\033[31m\1\033[0m',
        #                        d.highlights('text', top=3)).replace('\n', ' ') + '\n'
        return doc
    return d
def data():
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for filename in CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[simplify(x)].append(d)
    return ix, docs
def main(filename):
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for cached in config.CACHE.glob('*.pdf'):
        d = Document(cached)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[x].append(d)

    hits = defaultdict(list)

    def hit(m):
        name = m.group(1)
        link = '%s' % hit.id
        hits[name].append(link)
        hit.id += 1
        return r'<a name="{link}" style="background-color: red; color: white;">{name}</a>'.format(
            name=name, link=link)

    hit.id = 0

    if filename.startswith('http'):
        from arsenal.web.download import urlread
        [_, _, content] = urlread(filename)
    else:
        content = file(filename).read()

    out = re.sub('(%s)' % '|'.join(sorted(ix.keys(), key=lambda x: (len(x), x))),
                 hit,
                 content.decode('ascii', 'ignore'))

    stuff = '<br/>'.join('%s: %s' % (name, ' '.join('<a href="#%s">%s</a>' % (l, l)
                                                    for l in links))
                         for name, links in sorted(hits.items()))

    sty = ('border: thin solid #000; width: 300px; top: 10px; right: 10px; '
           'position: absolute; z-index: 100; background-color: white; padding: 10px;')

    stuff = '<div style="%s">%s</div>' % (sty, stuff)

    out = re.sub('(<body.*?>)', r'\1 %s' % stuff, out)

    if os.path.exists(filename):
        with file('skid-' + filename, 'wb') as f:
            f.write(out)

    browser(out)
def tags():
    ix = defaultdict(list)
    for filename in config.CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        T = d.meta['tags']
        if T:
            for x in T:
                ix[x.lower()].append(d)

    for tag, ds in sorted(ix.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(colors.yellow % '%s (%s)' % (tag, len(ds)))
        for d in ds:
            print(' ', d.meta['title'],
                  colors.magenta % ('(file://%s)' % (d.cached + '.d/notes.org')))
def update():
    "Update index."

    # create index if it doesn't exist
    if not DIRECTORY.exists():
        create()

    # get handle to Whoosh index
    ix = open_dir(DIRECTORY, NAME)

    with ix.writer() as w, ix.searcher() as searcher:

        # sort cached files by mtime.
        files = [Document(f) for f in CACHE.files()]
        files.sort(key=(lambda x: x.modified), reverse=True)

        for d in files:

            # lookup document mtime in the index; don't add or extract info if
            # you don't need it.
            result = searcher.find('cached', unicode(d.cached))

            if not result:
                print '[INFO] new document', d.cached

            else:
                assert len(result) == 1, 'cached should be unique.'
                result = result[0]
                if d.modified <= result['mtime']:   # already up to date
                    # Since we've sorted files by mtime, we know that files
                    # after this one are older, and thus we're done.
                    return
                print '[INFO] update to existing document:', d.cached

            meta = d.parse_notes()

            # just a lint check
            assert meta['cached'] == d.cached, \
                'Cached field in notes (%s) ' \
                'does not match associated file (%s) ' \
                'in notes file %r' % (meta['cached'], d.cached,
                                      'file://' + d.d / 'notes.org')

            # TODO: consider using two fields: display name and searchable
            # name. to avoid the issues with accents
            w.update_document(source=meta['source'],
                              cached=unicode(d.cached),
                              hash=d.hash(),
                              title=meta['title'],
                              author=u' ; '.join(meta['author']),
                              year=meta['year'],
                              notes=meta['notes'],
                              text=d.text(),
                              mtime=d.modified,
                              added=d.added,
                              tags=u' '.join(meta['tags']))
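# update() assumes a create() helper that builds the Whoosh index. A minimal
# sketch, with field names mirroring the keys passed to w.update_document()
# above; the exact field types in skid's schema are an assumption.
from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT, KEYWORD, DATETIME

def create():
    DIRECTORY.mkdir()
    schema = Schema(source=ID(stored=True),
                    cached=ID(stored=True, unique=True),   # 'cached should be unique.'
                    hash=ID(stored=True),
                    title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    year=TEXT(stored=True),
                    notes=TEXT(stored=True),
                    text=TEXT(stored=True),
                    mtime=DATETIME(stored=True),           # compared against d.modified above
                    added=DATETIME(stored=True),
                    tags=KEYWORD(stored=True))
    create_in(DIRECTORY, schema, NAME)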
def main(filename):
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for cached in config.CACHE.glob('*.pdf'):
        d = Document(cached)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[x].append(d)

    hits = defaultdict(list)

    def hit(m):
        name = m.group(1)
        link = '%s' % hit.id
        hits[name].append(link)
        hit.id += 1
        return r'<a name="{link}" style="background-color: red; color: white;">{name}</a>'.format(name=name, link=link)

    hit.id = 0

    if filename.startswith('http'):
        from arsenal.download import urlread
        [_, _, content] = urlread(filename)
    else:
        content = open(filename).read()

    out = re.sub('(%s)' % '|'.join(sorted(ix.keys(), key=lambda x: (len(x), x))),
                 hit,
                 content.decode('ascii', 'ignore'))

    stuff = '<br/>'.join('%s: %s' % (name, ' '.join('<a href="#%s">%s</a>' % (l, l)
                                                    for l in links))
                         for name, links in sorted(hits.items()))

    sty = ('border: thin solid #000; width: 300px; top: 10px; right: 10px; '
           'position: absolute; z-index: 100; background-color: white; padding: 10px;')

    stuff = '<div style="%s">%s</div>' % (sty, stuff)

    out = re.sub('(<body.*?>)', r'\1 %s' % stuff, out)

    if os.path.exists(filename):
        with open('skid-' + filename, 'wb') as f:
            f.write(out)

    browser(out)
def authors_set():
    from skid import config
    from skid.add import Document, SkidError
    A = defaultdict(set)
    for filename in config.CACHE.glob('*.pdf'):
        try:
            d = Document(filename)
            meta = d.parse_notes()
            authors = meta['author']
        except SkidError:
            # throws SkidError if notes file doesn't exist, which will happen if
            # we're in the middle of adding a file.
            continue
        if authors:
            for x in authors:
                A[simplify(x)].add(x)
    return A
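# Example use of authors_set(): report simplified keys that cover more than
# one raw spelling (a hypothetical line of output would look like
# "smith -> ['J. Smith', 'Smith']").
for key, variants in sorted(authors_set().items()):
    if len(variants) > 1:
        print(key, '->', sorted(variants))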
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            yield (meta, d, pdfminer(filename))
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(colors.red % ('#' + '_' * len(ff)))
                print(colors.red % ('#' + ff))
                print()
                print('%s: %s' % (colors.yellow % 'meta', meta['title']))
                print('%s: %s' % (colors.yellow % 'meta', ' ; '.join(meta['author'])))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
""" Quick fix to add an approximation (mtime) to date added before we tracked it. """ from skid import config from skid.add import Document from datetime import datetime for f in config.CACHE.files(): d = Document(f) mtime = str(datetime.fromtimestamp((f + '.d').mtime)) # won't overwrite d.store('data/date-added', mtime, overwrite=False)
def main():
    if len(sys.argv) <= 1:
        print ', '.join(sorted(CMDS))
        return

    cmd = sys.argv.pop(1)

    if cmd in (SEARCH, LS, SIMILAR, KEY):

        p = ArgumentParser()
        p.add_argument('query', nargs='*')
        p.add_argument('--limit', type=int, default=0,   #config.LIMIT,
                       help='query limit (use 0 for no limit)')
        p.add_argument('--show', default='', help='display options')
        p.add_argument('--hide', default='', help='display options')
        p.add_argument('--pager', choices=('none', 'less', 'emacs'), default='less',
                       help='pager for results')
        p.add_argument('--format', choices=('standard', 'org'), default='standard',
                       help='output format')
        p.add_argument('--by', choices=('relevance', 'modified', 'added'),
                       default='relevance', help='Sort results by')
        p.add_argument('--top', action='store_true', help='Only show top hit.')
        p.add_argument('--no-open', action='store_false', help='do not open top hit')

        args = p.parse_args()

        query = ' '.join(args.query)
        limit = args.limit if args.limit > 0 else None

        if args.top:
            args.pager = 'none'
            limit = 1

        if cmd == SEARCH:
            results = index.search(query)

        elif cmd == KEY:
            # Supports bibtex key search, e.g. 'bottou12counterfactual'
            #
            # Example key
            #
            #   'bottou12counterfactual'
            #     -> 'author:bottou year:2012 title:counterfactual'
            #
            #  - should be greedy e.g. act like '--top'
            #
            #  - bash completion for keys should be easy to implement and useful.
            #
            p = bibkey(query)
            if p:
                # TODO: this version doesn't search for papers where author is first-author
                q = ' '.join('%s:%s' % (k, v) for (k, v)
                             in zip(['author', 'year', 'title'], p) if v)
                print q
                results = index.search(q)
            else:
                results = []

        elif cmd == SIMILAR:
            results = Document(query).similar(limit=limit)

        elif cmd == LS:
            results = ls(query)

        else:
            assert False, 'Unrecognized command %s' % cmd

        # convert results to list and convert Whoosh.searching.Hit to skid.Document
        results = list(map(todoc, results))

        # sort documents according to '--by' criteria
        sortwith = {'relevance': score, 'modified': modified, 'added': added}[args.by]
        if cmd == LS and args.by == 'relevance':
            sortwith = added
        results.sort(key=sortwith, reverse=True)

        nresults = len(results)

        # limit number of search results
        results = results[:limit]

        if args.format == 'org':
            fmt = org
        else:
            fmt = display

        # process display options
        show = {'author', 'title', 'link', 'link:notes'}   # defaults
        show.update(x.strip() for x in args.show.split(','))
        for x in (x.strip() for x in args.hide.split(',')):
            if x in show:
                show.remove(x)

        with pager(args.pager):
            if limit and len(results) >= limit:
                if args.format == 'org':
                    print '# showing top %s of %s results' % (min(limit, nresults), nresults)
                else:
                    print yellow % 'showing top %s of %s results' % (min(limit, nresults), nresults)
            fmt(results, show=show)

        if args.top:
            assert len(results) <= 1
            if not results:
                print red % 'Nothing found'
                return
            [top] = results
            # open cached document and user notes
            #os.system('gnome-open %s' % top.cached)
            if args.no_open:
                from subprocess import Popen
                Popen(['gnome-open', top.cached])
            #os.system('$EDITOR %s' % top.cached + '.d/notes.org')

    elif cmd == ADD:
        p = ArgumentParser()
        p.add_argument('source')
        args = p.parse_args()
        add(args.source)

    elif cmd == RM:
        p = ArgumentParser()
        p.add_argument('cached')
        args = p.parse_args()
        rm(args.cached)

    elif cmd == UPDATE:
        update()

    elif cmd == PUSH:
        push()

    elif cmd == AUTHORS:
        authors()

    elif cmd == TAGS:
        tags()

    elif cmd == LEXICON:
        p = ArgumentParser()
        p.add_argument('field')
        args = p.parse_args()
        lexicon(args.field)

    elif cmd == TITLE:
        # doesn't require adding the document, just finds the title.
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        extract_title(args.pdf, extra=args.extra)

    else:
        print ', '.join(sorted(CMDS))
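# The KEY branch above assumes a bibkey() helper that splits a bibtex-style
# key into (author, year, title). A minimal sketch matching the documented
# example 'bottou12counterfactual'; skid's real parser may be more permissive,
# and the century cutoff for two-digit years is an assumption.
import re

def bibkey(query):
    m = re.match(r'^([a-z]+)(\d{2})([a-z]\w*)$', query.strip().lower())
    if m is None:
        return None
    author, yy, title = m.groups()
    # expand a two-digit year, e.g. '12' -> '2012'.
    year = ('19' if int(yy) > 50 else '20') + yy
    return (author, year, title)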
from skid import config
from skid.add import Document
from skid.utils.gscholar import query
from arsenal.terminal import colors
from random import shuffle

files = config.CACHE.files()
shuffle(files)

for f in files:
    if not f.endswith('.pdf'):
        continue
    d = Document(f)
    meta = d.parse_notes()
    print(colors.green % ('file://' + d.cached))
    print(colors.yellow % meta['title'])
    print(colors.yellow % ' ; '.join(meta['author']))
    results = query(meta['title'])
    print(len(results), 'results')
    for x in results:
        print(x)
    break
def main():
    if len(sys.argv) <= 1:
        print(', '.join(sorted(cmd.ALL)))
        return

    command = sys.argv.pop(1)

    if command in (cmd.search, cmd.ls, cmd.similar, cmd.key):

        p = ArgumentParser()
        p.add_argument('query', nargs='*')
        p.add_argument('--limit', type=int, default=0,   #config.LIMIT,
                       help='query limit (use 0 for no limit)')
        p.add_argument('--show', default='', help='display options')
        p.add_argument('--hide', default='', help='display options')
        # TODO: pager temporarily disabled because of transition to python3
        p.add_argument('--pager', choices=('none', 'less', 'emacs'), default='less',
                       help='pager for results')
        p.add_argument('--format', choices=('standard', 'org'), default='standard',
                       help='output format')
        p.add_argument('--by', choices=('relevance', 'modified', 'added'),
                       default='relevance', help='Sort results by')
        p.add_argument('--top', action='store_true', help='Only show top hit.')
        p.add_argument('--no-open', action='store_false', help='do not open top hit')
        p.add_argument('--note', action='store_true', help='Open note for top hit in editor.')

        args = p.parse_args()

        query = ' '.join(args.query)
        limit = args.limit if args.limit > 0 else None

        if args.top:
            args.pager = 'none'
            limit = 1

        if command == cmd.search:
            results = index.search(query)

        elif command == cmd.key:
            # Supports bibtex key search, e.g. 'bottou12counterfactual'
            #
            # Example key
            #
            #   'bottou12counterfactual'
            #     -> 'author:bottou year:2012 title:counterfactual'
            #
            #  - should be greedy e.g. act like '--top'
            #
            #  - bash completion for keys should be easy to implement and useful.
            #
            p = bibkey(query)
            if p:
                # TODO: this version doesn't search for papers where author is first-author
                q = ' '.join('%s:%s' % (k, v) for (k, v)
                             in zip(['author', 'year', 'title'], p) if v)
                print(q)
                results = index.search(q)
            else:
                results = []

        elif command == cmd.similar:
            results = Document(query).similar(limit=limit)

        elif command == cmd.ls:
            results = ls(query)

        else:
            assert False, 'Unrecognized command %s' % command

        # convert results to list and convert Whoosh.searching.Hit to skid.Document
        results = list(map(todoc, results))

        # sort documents according to '--by' criteria
        sortwith = {'relevance': score, 'modified': modified, 'added': added}[args.by]
        if command == cmd.ls and args.by == 'relevance':
            sortwith = added
        results.sort(key=sortwith, reverse=True)

        nresults = len(results)

        # limit number of search results
        results = results[:limit]

        if args.format == 'org':
            fmt = org
        else:
            fmt = display

        # process display options
        show = {'author', 'title', 'link', 'link:notes'}   # defaults
        show.update(x.strip() for x in args.show.split(','))
        for x in (x.strip() for x in args.hide.split(',')):
            if x in show:
                show.remove(x)

        with pager(args.pager):
            if limit and len(results) >= limit:
                if args.format == 'org':
                    print('# showing top %s of %s results' % (min(limit, nresults), nresults))
                else:
                    print(colors.yellow % 'showing top %s of %s results' % (min(limit, nresults), nresults))
            fmt(results, show=show)

        if args.top:
            assert len(results) <= 1
            if not results:
                print(colors.red % 'Nothing found')
                return
            [top] = results
            # open top hit
            if args.no_open:
                if args.note:
                    # open user's note in editor
                    os.system('$EDITOR %s' % top.cached + '.d/notes.org')
                else:
                    from subprocess import Popen
                    # open cached document
                    # TODO: read from config file
                    Popen(['xdg-open', top.cached])

    elif command == cmd.add:
        p = ArgumentParser()
        p.add_argument('source')
        p.add_argument('--name')
        args = p.parse_args()
        add(args.source, dest=args.name)

    elif command == cmd.rm:
        p = ArgumentParser()
        p.add_argument('cached')
        args = p.parse_args()
        rm(args.cached)

    elif command == cmd.update:
        index.update()

    elif command == cmd.authors:
        authors()

    elif command == cmd.tags:
        tags()

    elif command == cmd.drop:
        print(colors.yellow % 'Dropping search index... To build a fresh one run\n$ skid update')
        index.drop()

    elif command == cmd.lexicon:
        p = ArgumentParser()
        p.add_argument('field')
        args = p.parse_args()
        lexicon(args.field)

    elif command == cmd.title:
        # doesn't require adding the document, just finds the title.
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        extract_title(args.pdf, extra=args.extra)

    elif command == cmd.scholar:
        from skid.add import gscholar_bib
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        # run google scholar search based on extracted title.
        title = extract_title(args.pdf, extra=args.extra)
        gscholar_bib(title=title)

    else:
        print(', '.join(sorted(cmd.ALL)))
def ls(q, **kwargs):
    "List recent files."
    for f in config.CACHE.files():
        if q in f:
            yield Document(f)
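# Example: list cached documents whose path contains a query string (the
# query 'counterfactual' here is hypothetical; the match is a plain substring
# test against the cached path).
for doc in ls('counterfactual'):
    print(doc.cached)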