def dump():
    for f in config.CACHE.files():
        d = Document(f)
        m = d.parse_notes()
        if not m['author']:
            # skip skid marks without annotated authors.
            continue
        author = ' '.join(map(lastname, m['author']))
        title = remove_stopwords(m['title'])
        title = re.findall(r'\w+', title)
        year = m['year'][-2:]
        title = ' '.join(title)
        author = author.replace('-', ' ')
        title = title.replace('-', ' ')
        year = year.replace('-', ' ')
        key = '%s-%s-%s' % (author, year, title)
        key = key.lower()
        print key.encode('utf8')
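# dump() relies on two helpers defined elsewhere in skid. The sketch below is
# an assumption of what they might look like, not skid's actual code: the
# names `lastname` and `remove_stopwords` come from the call sites above, and
# the stopword list is illustrative only.
_STOPWORDS = {'a', 'an', 'the', 'of', 'on', 'for', 'and', 'in', 'to'}

def lastname(author):
    # assume names come as "First [Middle] Last" and keep the final token.
    return author.strip().split()[-1]

def remove_stopwords(title):
    return ' '.join(w for w in title.split() if w.lower() not in _STOPWORDS)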
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
def authors():

    def simplify(x):
        # simplify name: remove single initial, lowercase, convert to ascii
        return re.sub(r'\b[a-z]\.\s*', '',
                      x.strip().lower()).encode('ascii', 'ignore').decode('ascii')

    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    collisions = defaultdict(set)
    for filename in config.CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        A = d.meta['author']
        if A:
            docs.append(d)
            for x in A:
                ix[simplify(x)].append(d)
                collisions[simplify(x)].add(x)

    for a, ds in sorted(ix.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(colors.yellow % '%s (%s)' % (a, len(ds)))
        for d in ds:
            print(' ', d.meta['title'], colors.magenta % ('(file://%s)' % d.cached))
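# A quick sanity check of the normalization above (hypothetical inputs,
# assuming simplify were lifted to module scope): a bare surname and an
# initial-plus-surname collapse to the same key, which is exactly the kind of
# merge that collisions[] records.
assert simplify('J. Smith') == 'smith'
assert simplify('Smith') == 'smith'
assert simplify('José Smith') == 'jos smith'   # accents are dropped, not transliterated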
def todoc(d):
    if isinstance(d, Hit):
        doc = Document(d['cached'])
        doc.score = d.score
        doc.hit = d
        # very slow...
        #doc.highlights = re.sub('<b class="match.*?>([\w\W]+?)</b>',
        #                        r'\033[31m\1\033[0m',
        #                        d.highlights('text', top=3)).replace('\n', ' ') + '\n'
        return doc
    return d
def data():
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for filename in CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[simplify(x)].append(d)
    return ix, docs
def main(filename):
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for cached in config.CACHE.glob('*.pdf'):
        d = Document(cached)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[x].append(d)

    hits = defaultdict(list)

    def hit(m):
        name = m.group(1)
        link = '%s' % hit.id
        hits[name].append(link)
        hit.id += 1
        return r'<a name="{link}" style="background-color: red; color: white;">{name}</a>'.format(
            name=name, link=link)

    hit.id = 0

    if filename.startswith('http'):
        from arsenal.web.download import urlread
        [_, _, content] = urlread(filename)
    else:
        content = file(filename).read()

    out = re.sub('(%s)' % '|'.join(sorted(ix.keys(), key=lambda x: (len(x), x))),
                 hit,
                 content.decode('ascii', 'ignore'))

    stuff = '<br/>'.join('%s: %s' % (name, ' '.join('<a href="#%s">%s</a>' % (l, l)
                                                    for l in links))
                         for name, links in sorted(hits.items()))

    sty = ('border: thin solid #000; width: 300px; top: 10px; right: 10px; '
           'position: absolute; z-index: 100; background-color: white; padding: 10px;')

    stuff = '<div style="%s">%s</div>' % (sty, stuff)

    out = re.sub('(<body.*?>)', r'\1 %s' % stuff, out)

    if os.path.exists(filename):
        with file('skid-' + filename, 'wb') as f:
            f.write(out)

    browser(out)
def tags():
    ix = defaultdict(list)
    for filename in config.CACHE.glob('*.pdf'):
        d = Document(filename)
        d.meta = d.parse_notes()
        T = d.meta['tags']
        if T:
            for x in T:
                ix[x.lower()].append(d)

    for tag, ds in sorted(ix.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(colors.yellow % '%s (%s)' % (tag, len(ds)))
        for d in ds:
            print(' ', d.meta['title'],
                  colors.magenta % ('(file://%s)' % (d.cached + '.d/notes.org')))
def update():
    "Update index."

    # create index if it doesn't exist
    if not DIRECTORY.exists():
        create()

    # get handle to Whoosh index
    ix = open_dir(DIRECTORY, NAME)

    with ix.writer() as w, ix.searcher() as searcher:

        # sort cached files by mtime.
        files = [Document(f) for f in CACHE.files()]
        files.sort(key=(lambda x: x.modified), reverse=True)

        for d in files:

            # lookup document mtime in the index; don't add or extract info if
            # you don't need it.
            result = searcher.find('cached', unicode(d.cached))

            if not result:
                print '[INFO] new document', d.cached

            else:
                assert len(result) == 1, 'cached should be unique.'
                result = result[0]
                if d.modified <= result['mtime']:   # already up to date
                    # Since we've sorted files by mtime, we know that files
                    # after this one are older, and thus we're done.
                    return
                print '[INFO] update to existing document:', d.cached

            meta = d.parse_notes()

            # just a lint check
            assert meta['cached'] == d.cached, \
                'Cached field in notes (%s) ' \
                'does not match associated file (%s) ' \
                'in notes file %r' % (meta['cached'], d.cached,
                                      'file://' + d.d / 'notes.org')

            # TODO: consider using two fields: display name and searchable
            # name. to avoid the issues with accents
            w.update_document(source=meta['source'],
                              cached=unicode(d.cached),
                              hash=d.hash(),
                              title=meta['title'],
                              author=u' ; '.join(meta['author']),
                              year=meta['year'],
                              notes=meta['notes'],
                              text=d.text(),
                              mtime=d.modified,
                              added=d.added,
                              tags=u' '.join(meta['tags']))
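# update() assumes a create() helper that builds the Whoosh index. A minimal
# sketch, with field names mirroring the keys passed to w.update_document()
# above; the exact field types in skid's schema are an assumption.
from whoosh.index import create_in
from whoosh.fields import Schema, ID, TEXT, KEYWORD, DATETIME

def create():
    DIRECTORY.mkdir()
    schema = Schema(source=ID(stored=True),
                    cached=ID(stored=True, unique=True),   # 'cached should be unique.'
                    hash=ID(stored=True),
                    title=TEXT(stored=True),
                    author=TEXT(stored=True),
                    year=TEXT(stored=True),
                    notes=TEXT(stored=True),
                    text=TEXT(stored=True),
                    mtime=DATETIME(stored=True),           # compared against d.modified above
                    added=DATETIME(stored=True),
                    tags=KEYWORD(stored=True))
    create_in(DIRECTORY, schema, NAME)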
def main(filename):
    ix = defaultdict(list)
    docs = []   # documents with authors annotated
    for cached in config.CACHE.glob('*.pdf'):
        d = Document(cached)
        d.meta = d.parse_notes()
        authors = d.meta['author']
        if authors:
            docs.append(d)
            for x in authors:
                ix[x].append(d)

    hits = defaultdict(list)

    def hit(m):
        name = m.group(1)
        link = '%s' % hit.id
        hits[name].append(link)
        hit.id += 1
        return r'<a name="{link}" style="background-color: red; color: white;">{name}</a>'.format(name=name, link=link)

    hit.id = 0

    if filename.startswith('http'):
        from arsenal.download import urlread
        [_, _, content] = urlread(filename)
    else:
        content = open(filename).read()

    out = re.sub('(%s)' % '|'.join(sorted(ix.keys(), key=lambda x: (len(x), x))),
                 hit,
                 content.decode('ascii', 'ignore'))

    stuff = '<br/>'.join('%s: %s' % (name, ' '.join('<a href="#%s">%s</a>' % (l, l)
                                                    for l in links))
                         for name, links in sorted(hits.items()))

    sty = ('border: thin solid #000; width: 300px; top: 10px; right: 10px; '
           'position: absolute; z-index: 100; background-color: white; padding: 10px;')

    stuff = '<div style="%s">%s</div>' % (sty, stuff)

    out = re.sub('(<body.*?>)', r'\1 %s' % stuff, out)

    if os.path.exists(filename):
        with open('skid-' + filename, 'wb') as f:
            f.write(out)

    browser(out)
def authors_set():
    from skid import config
    from skid.add import Document, SkidError
    A = defaultdict(set)
    for filename in config.CACHE.glob('*.pdf'):
        try:
            d = Document(filename)
            meta = d.parse_notes()
            authors = meta['author']
        except SkidError:
            # throws SkidError if notes file doesn't exist, which will happen if
            # we're in the middle of adding a file.
            continue
        if authors:
            for x in authors:
                A[simplify(x)].add(x)
    return A
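# Example use of authors_set(): report simplified keys that cover more than
# one raw spelling (a hypothetical line of output would look like
# "smith -> ['J. Smith', 'Smith']").
for key, variants in sorted(authors_set().items()):
    if len(variants) > 1:
        print(key, '->', sorted(variants))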
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print
                print red % ('#' + '_' * len(ff))
                print red % ('#' + ff)
                print
                print ('%s: %s' % (yellow % 'meta', meta['title'])).encode('utf8')
                print ('%s: %s' % (yellow % 'meta', ' ; '.join(meta['author']))).encode('utf8')
                print
            yield (meta, d, pdfminer(filename))
def data(verbose=True):
    """
    Get a list of skid pdfs which have authors annotated.
    """
    for filename in iterview(CACHE.glob('*.pdf')):
        d = Document(filename)
        meta = d.parse_notes()
        if meta['author']:
            if verbose:
                ff = ' file://' + filename
                print()
                print(colors.red % ('#' + '_' * len(ff)))
                print(colors.red % ('#' + ff))
                print()
                print('%s: %s' % (colors.yellow % 'meta', meta['title']))
                print('%s: %s' % (colors.yellow % 'meta', ' ; '.join(meta['author'])))
                print()
            try:
                yield (meta, d, pdfminer(filename))
            except Exception:
                # XXX: silently skips examples which cause pdfminer to throw an
                # exception.
                pass
""" Quick fix to add an approximation (mtime) to date added before we tracked it. """ from skid import config from skid.add import Document from datetime import datetime for f in config.CACHE.files(): d = Document(f) mtime = str(datetime.fromtimestamp((f + '.d').mtime)) # won't overwrite d.store('data/date-added', mtime, overwrite=False)
def main():
    if len(sys.argv) <= 1:
        print ', '.join(sorted(CMDS))
        return

    cmd = sys.argv.pop(1)

    if cmd in (SEARCH, LS, SIMILAR, KEY):

        p = ArgumentParser()
        p.add_argument('query', nargs='*')
        p.add_argument('--limit', type=int, default=0,   #config.LIMIT,
                       help='query limit (use 0 for no limit)')
        p.add_argument('--show', default='', help='display options')
        p.add_argument('--hide', default='', help='display options')
        p.add_argument('--pager', choices=('none', 'less', 'emacs'), default='less',
                       help='pager for results')
        p.add_argument('--format', choices=('standard', 'org'), default='standard',
                       help='output format')
        p.add_argument('--by', choices=('relevance', 'modified', 'added'),
                       default='relevance', help='Sort results by')
        p.add_argument('--top', action='store_true', help='Only show top hit.')
        p.add_argument('--no-open', action='store_false', help='do not open top hit')

        args = p.parse_args()

        query = ' '.join(args.query)
        limit = args.limit if args.limit > 0 else None

        if args.top:
            args.pager = 'none'
            limit = 1

        if cmd == SEARCH:
            results = index.search(query)

        elif cmd == KEY:
            # Supports bibtex key search, e.g. 'bottou12counterfactual'
            #
            # Example key
            #
            #   'bottou12counterfactual'
            #     -> 'author:bottou year:2012 title:counterfactual'
            #
            #  - should be greedy e.g. act like '--top'
            #
            #  - bash completion for keys should be easy to implement and useful.
            #
            p = bibkey(query)
            if p:
                # TODO: this version doesn't search for papers where author is first-author
                q = ' '.join('%s:%s' % (k, v) for (k, v)
                             in zip(['author', 'year', 'title'], p) if v)
                print q
                results = index.search(q)
            else:
                results = []

        elif cmd == SIMILAR:
            results = Document(query).similar(limit=limit)

        elif cmd == LS:
            results = ls(query)

        else:
            assert False, 'Unrecognized command %s' % cmd

        # convert results to list and convert Whoosh.searching.Hit to skid.Document
        results = list(map(todoc, results))

        # sort documents according to '--by' criteria
        sortwith = {'relevance': score, 'modified': modified, 'added': added}[args.by]
        if cmd == LS and args.by == 'relevance':
            sortwith = added
        results.sort(key=sortwith, reverse=True)

        nresults = len(results)

        # limit number of search results
        results = results[:limit]

        if args.format == 'org':
            fmt = org
        else:
            fmt = display

        # process display options
        show = {'author', 'title', 'link', 'link:notes'}   # defaults
        show.update(x.strip() for x in args.show.split(','))
        for x in (x.strip() for x in args.hide.split(',')):
            if x in show:
                show.remove(x)

        with pager(args.pager):
            if limit and len(results) >= limit:
                if args.format == 'org':
                    print '# showing top %s of %s results' % (min(limit, nresults), nresults)
                else:
                    print yellow % 'showing top %s of %s results' % (min(limit, nresults), nresults)
            fmt(results, show=show)

        if args.top:
            assert len(results) <= 1
            if not results:
                print red % 'Nothing found'
                return
            [top] = results
            # open cached document and user notes
            #os.system('gnome-open %s' % top.cached)
            if args.no_open:
                from subprocess import Popen
                Popen(['gnome-open', top.cached])
            #os.system('$EDITOR %s' % top.cached + '.d/notes.org')

    elif cmd == ADD:
        p = ArgumentParser()
        p.add_argument('source')
        args = p.parse_args()
        add(args.source)

    elif cmd == RM:
        p = ArgumentParser()
        p.add_argument('cached')
        args = p.parse_args()
        rm(args.cached)

    elif cmd == UPDATE:
        update()

    elif cmd == PUSH:
        push()

    elif cmd == AUTHORS:
        authors()

    elif cmd == TAGS:
        tags()

    elif cmd == LEXICON:
        p = ArgumentParser()
        p.add_argument('field')
        args = p.parse_args()
        lexicon(args.field)

    elif cmd == TITLE:
        # doesn't require adding the document, just finds the title.
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        extract_title(args.pdf, extra=args.extra)

    else:
        print ', '.join(sorted(CMDS))
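# The KEY branch above assumes a bibkey() helper that splits a bibtex-style
# key into (author, year, title). A minimal sketch matching the documented
# example 'bottou12counterfactual'; skid's real parser may be more permissive,
# and the century cutoff for two-digit years is an assumption.
import re

def bibkey(query):
    m = re.match(r'^([a-z]+)(\d{2})([a-z]\w*)$', query.strip().lower())
    if m is None:
        return None
    author, yy, title = m.groups()
    # expand a two-digit year, e.g. '12' -> '2012'.
    year = ('19' if int(yy) > 50 else '20') + yy
    return (author, year, title)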
from skid import config
from skid.add import Document
from skid.utils.gscholar import query
from arsenal.terminal import colors
from random import shuffle

files = config.CACHE.files()
shuffle(files)

for f in files:
    if not f.endswith('.pdf'):
        continue
    d = Document(f)
    meta = d.parse_notes()
    print(colors.green % ('file://' + d.cached))
    print(colors.yellow % meta['title'])
    print(colors.yellow % ' ; '.join(meta['author']))
    results = query(meta['title'])
    print(len(results), 'results')
    for x in results:
        print(x)
    break
def main():
    if len(sys.argv) <= 1:
        print(', '.join(sorted(cmd.ALL)))
        return

    command = sys.argv.pop(1)

    if command in (cmd.search, cmd.ls, cmd.similar, cmd.key):

        p = ArgumentParser()
        p.add_argument('query', nargs='*')
        p.add_argument('--limit', type=int, default=0,   #config.LIMIT,
                       help='query limit (use 0 for no limit)')
        p.add_argument('--show', default='', help='display options')
        p.add_argument('--hide', default='', help='display options')
        # TODO: pager temporarily disabled because of transition to python3
        p.add_argument('--pager', choices=('none', 'less', 'emacs'), default='less',
                       help='pager for results')
        p.add_argument('--format', choices=('standard', 'org'), default='standard',
                       help='output format')
        p.add_argument('--by', choices=('relevance', 'modified', 'added'),
                       default='relevance', help='Sort results by')
        p.add_argument('--top', action='store_true', help='Only show top hit.')
        p.add_argument('--no-open', action='store_false', help='do not open top hit')
        p.add_argument('--note', action='store_true', help='Open note for top hit in editor.')

        args = p.parse_args()

        query = ' '.join(args.query)
        limit = args.limit if args.limit > 0 else None

        if args.top:
            args.pager = 'none'
            limit = 1

        if command == cmd.search:
            results = index.search(query)

        elif command == cmd.key:
            # Supports bibtex key search, e.g. 'bottou12counterfactual'
            #
            # Example key
            #
            #   'bottou12counterfactual'
            #     -> 'author:bottou year:2012 title:counterfactual'
            #
            #  - should be greedy e.g. act like '--top'
            #
            #  - bash completion for keys should be easy to implement and useful.
            #
            p = bibkey(query)
            if p:
                # TODO: this version doesn't search for papers where author is first-author
                q = ' '.join('%s:%s' % (k, v) for (k, v)
                             in zip(['author', 'year', 'title'], p) if v)
                print(q)
                results = index.search(q)
            else:
                results = []

        elif command == cmd.similar:
            results = Document(query).similar(limit=limit)

        elif command == cmd.ls:
            results = ls(query)

        else:
            assert False, 'Unrecognized command %s' % command

        # convert results to list and convert Whoosh.searching.Hit to skid.Document
        results = list(map(todoc, results))

        # sort documents according to '--by' criteria
        sortwith = {'relevance': score, 'modified': modified, 'added': added}[args.by]
        if command == cmd.ls and args.by == 'relevance':
            sortwith = added
        results.sort(key=sortwith, reverse=True)

        nresults = len(results)

        # limit number of search results
        results = results[:limit]

        if args.format == 'org':
            fmt = org
        else:
            fmt = display

        # process display options
        show = {'author', 'title', 'link', 'link:notes'}   # defaults
        show.update(x.strip() for x in args.show.split(','))
        for x in (x.strip() for x in args.hide.split(',')):
            if x in show:
                show.remove(x)

        with pager(args.pager):
            if limit and len(results) >= limit:
                if args.format == 'org':
                    print('# showing top %s of %s results' % (min(limit, nresults), nresults))
                else:
                    print(colors.yellow % 'showing top %s of %s results' % (min(limit, nresults), nresults))
            fmt(results, show=show)

        if args.top:
            assert len(results) <= 1
            if not results:
                print(colors.red % 'Nothing found')
                return
            [top] = results
            # open top hit
            if args.no_open:
                if args.note:
                    # open user's note in editor
                    os.system('$EDITOR %s' % top.cached + '.d/notes.org')
                else:
                    from subprocess import Popen
                    # open cached document
                    # TODO: read from config file
                    Popen(['xdg-open', top.cached])

    elif command == cmd.add:
        p = ArgumentParser()
        p.add_argument('source')
        p.add_argument('--name')
        args = p.parse_args()
        add(args.source, dest=args.name)

    elif command == cmd.rm:
        p = ArgumentParser()
        p.add_argument('cached')
        args = p.parse_args()
        rm(args.cached)

    elif command == cmd.update:
        index.update()

    elif command == cmd.authors:
        authors()

    elif command == cmd.tags:
        tags()

    elif command == cmd.drop:
        print(colors.yellow % 'Dropping search index... To build a fresh one run\n$ skid update')
        index.drop()

    elif command == cmd.lexicon:
        p = ArgumentParser()
        p.add_argument('field')
        args = p.parse_args()
        lexicon(args.field)

    elif command == cmd.title:
        # doesn't require adding the document, just finds the title.
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        extract_title(args.pdf, extra=args.extra)

    elif command == cmd.scholar:
        from skid.add import gscholar_bib
        from skid.pdfhacks.pdfmill import extract_title
        p = ArgumentParser()
        p.add_argument('pdf')
        p.add_argument('--no-extra', action='store_false', dest='extra')
        args = p.parse_args()
        # run google scholar search based on extracted title.
        title = extract_title(args.pdf, extra=args.extra)
        gscholar_bib(title=title)

    else:
        print(', '.join(sorted(cmd.ALL)))
def ls(q, **kwargs):
    "List recent files."
    for f in config.CACHE.files():
        if q in f:
            yield Document(f)
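# Example: list cached documents whose path contains a query string (the
# query 'counterfactual' here is hypothetical; the match is a plain substring
# test against the cached path).
for doc in ls('counterfactual'):
    print(doc.cached)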