def papers_from_svm(recent_days=None):
    """Return SVM-recommended papers for the logged-in user.

    Papers already saved in the user's library are excluded. If recent_days
    is given, only papers published within that many days are kept.
    Returns [] when nobody is logged in or no model exists for the user.
    """
    if not g.user:
        return []
    uid = session["user_id"]
    if uid not in user_sim:
        return []
    # fetch the user's library so already-saved papers can be excluded
    rows = query_db("""select * from library where user_id = ?""", [uid])
    in_library = {strip_version(r["paper_id"]) for r in rows}
    recommended = [db[p] for p in user_sim[uid] if p not in in_library]
    if recent_days is not None:
        # restrict to the most recent papers only
        window = recent_days * 24 * 60 * 60
        now = int(time.time())  # in seconds
        recommended = [p for p in recommended if now - p["time_published"] < window]
    return recommended
def review():
    """Toggle a paper in the logged-in user's library.

    Reads form field 'pid' (paper id, may include a version suffix).
    Returns "ON" if the paper was added, "OFF" if it was removed, and
    "NO" on any failure (not logged in, malformed id, unknown paper).

    Fixes: removed a leftover debug print of the library record, and the
    redundant second strip_version() call on an already-stripped pid.
    """
    # make sure user is logged in
    if not g.user:
        return "NO"  # fail... (not logged in). JS should prevent from us getting here.
    idvv = request.form["pid"]  # includes version
    if not isvalidid(idvv):
        return "NO"  # fail, malformed id. weird.
    pid = strip_version(idvv)
    if pid not in db:
        return "NO"  # we don't know this paper. wat
    uid = session["user_id"]  # id of logged in user
    # check this user already has this paper in library
    record = query_db(
        """select * from library where user_id = ? and paper_id = ?""",
        [uid, pid],
        one=True,
    )
    if record:
        # record exists, erase it.
        g.db.execute(
            """delete from library where user_id = ? and paper_id = ?""", [uid, pid]
        )
        g.db.commit()
        return "OFF"
    # record does not exist, add it. pid is already version-stripped.
    g.db.execute(
        """insert into library (paper_id, user_id, update_time) values (?, ?, ?)""",
        [pid, uid, int(time.time())],
    )
    g.db.commit()
    return "ON"
def papers_from_library():
    """Return the logged-in user's saved papers, most recently updated first.

    Returns [] when nobody is logged in. Library entries whose id is not
    present in db are skipped.
    """
    if not g.user:
        return []
    # user is logged in, lets fetch their saved library data
    uid = session["user_id"]
    rows = query_db("""select * from library where user_id = ?""", [uid])
    saved_ids = (strip_version(r["paper_id"]) for r in rows)
    papers = [db[p] for p in saved_ids if p in db]
    papers.sort(key=lambda p: p["updated"], reverse=True)
    return papers
def papers_from_library():
    """Return the logged-in user's saved papers, most recently updated first.

    Returns [] when nobody is logged in.

    Fix: library entries that reference paper ids no longer present in db
    are skipped instead of raising KeyError (matches the guarded variant of
    this function elsewhere in the codebase).
    """
    out = []
    if g.user:
        # user is logged in, lets fetch their saved library data
        uid = session['user_id']
        user_library = query_db('''select * from library where user_id = ?''', [uid])
        libids = [strip_version(x['paper_id']) for x in user_library]
        # guard against stale library entries pointing at papers missing from db
        out = [db[x] for x in libids if x in db]
        out = sorted(out, key=lambda k: k['updated'], reverse=True)
    return out
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
    """Serialize up to n papers from ps into JSON-friendly dicts.

    Papers already present in the logged-in user's library get
    in_library=1; abstracts and thumbnail paths are optional.
    """
    libids = set()
    if g.user:
        # logged in: collect raw ids of the user's saved papers
        uid = session["user_id"]
        user_library = query_db("""select * from library where user_id = ?""", [uid])
        libids = {strip_version(x["paper_id"]) for x in user_library}

    def _mdy(iso):
        # render an ISO timestamp as 'month/day/year'
        t = dateutil.parser.parse(iso)
        return "%s/%s/%s" % (t.month, t.day, t.year)

    ret = []
    for idx in range(min(len(ps), n)):
        p = ps[idx]
        idvv = "%sv%d" % (p["_rawid"], p["_version"])
        struct = {
            "title": p["title"],
            "pid": idvv,
            "rawpid": p["_rawid"],
            "category": p["arxiv_primary_category"]["term"],
            "authors": [a["name"] for a in p["authors"]],
            "link": p["link"],
            "in_library": 1 if p["_rawid"] in libids else 0,
        }
        if send_abstracts:
            struct["abstract"] = p["summary"]
        if send_images:
            struct["img"] = "/static/thumbs/" + idvv + ".pdf.jpg"
        struct["tags"] = [t["term"] for t in p["tags"]]
        # render time information nicely
        struct["published_time"] = _mdy(p["updated"])
        struct["originally_published_time"] = _mdy(p["published"])
        # fetch amount of discussion on this paper
        struct["num_discussion"] = comments.count({"pid": p["_rawid"]})
        # arxiv comments from the authors (when they submit the paper)
        cc = p.get("arxiv_comment", "")
        if len(cc) > 100:
            cc = cc[:100] + "..."  # crop very long comments
        struct["comment"] = cc
        ret.append(struct)
    return ret
def friends():
    """Render the friends tab: papers saved by people the user follows.

    Papers are ranked by how many followed users saved them, filtered by a
    'timefilter' query arg (day/3days/week/month/year, default week), and
    capped at 100.

    Fix: skip library ids that are no longer present in db — previously
    [db[x] for x in keys] raised KeyError on a stale entry.
    """
    ttstr = request.args.get("timefilter", "week")  # default is week
    legend = {"day": 1, "3days": 3, "week": 7, "month": 30, "year": 365}
    tt = legend.get(ttstr, 7)
    papers = []
    pid_to_users = {}
    if g.user:
        # gather all the people we are following
        username = get_username(session["user_id"])
        edges = list(follow_collection.find({"who": username}))
        # fetch all papers in all of their libraries, and count the top ones
        counts = {}
        for edict in edges:
            whom = edict["whom"]
            uid = get_user_id(whom)
            user_library = query_db(
                """select * from library where user_id = ?""", [uid]
            )
            libids = [strip_version(x["paper_id"]) for x in user_library]
            for lid in libids:
                counts.setdefault(lid, []).append(whom)
        keys = list(counts.keys())
        keys.sort(key=lambda k: len(counts[k]), reverse=True)  # descending by count
        # skip stale ids not present in db (would otherwise KeyError)
        papers = [db[x] for x in keys if x in db]
        # finally filter by date
        curtime = int(time.time())  # in seconds
        papers = [
            x for x in papers if curtime - x["time_published"] < tt * 24 * 60 * 60
        ]
        # trim at like 100
        if len(papers) > 100:
            papers = papers[:100]
        # trim counts as well correspondingly
        pid_to_users = {p["_rawid"]: counts.get(p["_rawid"], []) for p in papers}
    if not g.user:
        msg = "You must be logged in and follow some people to enjoy this tab."
    elif len(papers) == 0:
        msg = "No friend papers present. Try to extend the time range, or add friends by clicking on your account name (top, right)"
    else:
        msg = "Papers in your friend's libraries:"
    ctx = default_context(
        papers, render_format="friends", pid_to_users=pid_to_users, msg=msg
    )
    return render_template("main.html", **ctx)
def papers_similar(pid):
    """Return papers similar to pid (which may include a version suffix).

    Falls back to any other stored version of the same paper when the exact
    versioned id has no precomputed similarities; returns just the paper
    itself when no similarity data exists, and [] for unknown papers.
    """
    rawpid = strip_version(pid)
    # unknown paper: nothing to recommend
    if rawpid not in db:
        return []
    if pid in sim_dict:
        # simplest case: exact versioned id has precomputed neighbors
        return [db[strip_version(k)] for k in sim_dict[pid]]
    # We don't have this specific version — could be a stale URL pointing at
    # e.g. v1 while we only hold v2 now. Use any version of this paper we do
    # have on file.
    candidates = [k for k in sim_dict if rawpid in k]
    if candidates:
        return [db[strip_version(k)] for k in sim_dict[candidates[0]]]
    # no similarities for it for some reason: return just the paper
    return [db[rawpid]]
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
    """Serialize the first min(len(ps), n) papers of ps into plain dicts
    suitable for JSON encoding; saved papers get in_library=1."""
    libids = set()
    if g.user:
        # logged in: collect raw ids of the user's saved papers
        uid = session['user_id']
        user_library = query_db('''select * from library where user_id = ?''', [uid])
        libids = {strip_version(x['paper_id']) for x in user_library}

    def _mdy(iso):
        # 'month/day/year' rendering of an ISO timestamp
        t = dateutil.parser.parse(iso)
        return '%s/%s/%s' % (t.month, t.day, t.year)

    ret = []
    for idx in range(min(len(ps), n)):
        p = ps[idx]
        idvv = '%sv%d' % (p['_rawid'], p['_version'])
        struct = {
            'title': p['title'],
            'pid': idvv,
            'rawpid': p['_rawid'],
            'category': p['arxiv_primary_category']['term'],
            'authors': [a['name'] for a in p['authors']],
            'link': p['link'],
            'in_library': 1 if p['_rawid'] in libids else 0,
        }
        if send_abstracts:
            struct['abstract'] = p['summary']
        if send_images:
            struct['img'] = '/static/thumbs/' + idvv + '.pdf.jpg'
        struct['tags'] = [t['term'] for t in p['tags']]
        # render time information nicely
        struct['published_time'] = _mdy(p['updated'])
        struct['originally_published_time'] = _mdy(p['published'])
        # fetch amount of discussion on this paper
        struct['num_discussion'] = comments.count({'pid': p['_rawid']})
        # arxiv comments from the authors (when they submit the paper)
        cc = p.get('arxiv_comment', '')
        if len(cc) > 100:
            cc = cc[:100] + '...'  # crop very long comments
        struct['comment'] = cc
        ret.append(struct)
    return ret
def friends():
    """Render the friends tab: papers saved by people the user follows.

    Papers are ranked by how many followed users saved them, filtered by a
    'timefilter' query arg (day/3days/week/month/year, default week), and
    capped at 100.

    Fix: skip library ids that are no longer present in db — previously
    [db[x] for x in keys] raised KeyError on a stale entry.
    """
    ttstr = request.args.get('timefilter', 'week')  # default is week
    legend = {'day':1, '3days':3, 'week':7, 'month':30, 'year':365}
    tt = legend.get(ttstr, 7)
    papers = []
    pid_to_users = {}
    if g.user:
        # gather all the people we are following
        username = get_username(session['user_id'])
        edges = list(follow_collection.find({ 'who':username }))
        # fetch all papers in all of their libraries, and count the top ones
        counts = {}
        for edict in edges:
            whom = edict['whom']
            uid = get_user_id(whom)
            user_library = query_db('''select * from library where user_id = ?''', [uid])
            libids = [strip_version(x['paper_id']) for x in user_library]
            for lid in libids:
                counts.setdefault(lid, []).append(whom)
        keys = list(counts.keys())
        keys.sort(key=lambda k: len(counts[k]), reverse=True)  # descending by count
        # skip stale ids not present in db (would otherwise KeyError)
        papers = [db[x] for x in keys if x in db]
        # finally filter by date
        curtime = int(time.time())  # in seconds
        papers = [x for x in papers if curtime - x['time_published'] < tt*24*60*60]
        # trim at like 100
        if len(papers) > 100:
            papers = papers[:100]
        # trim counts as well correspondingly
        pid_to_users = { p['_rawid'] : counts.get(p['_rawid'], []) for p in papers }
    if not g.user:
        msg = "You must be logged in and follow some people to enjoy this tab."
    elif len(papers) == 0:
        msg = "No friend papers present. Try to extend the time range, or add friends by clicking on your account name (top, right)"
    else:
        msg = "Papers in your friend's libraries:"
    ctx = default_context(papers, render_format='friends', pid_to_users=pid_to_users, msg=msg)
    return render_template('main.html', **ctx)
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
    """Turn up to n paper records from ps into JSON-serializable dicts,
    flagging papers already in the logged-in user's library."""
    libids = set()
    if g.user:
        # user is logged in, lets fetch their saved library data
        uid = session['user_id']
        user_library = query_db('''select * from library where user_id = ?''', [uid])
        libids = {strip_version(x['paper_id']) for x in user_library}

    def _as_mdy(iso):
        # ISO timestamp -> 'month/day/year'
        ts = dateutil.parser.parse(iso)
        return '%s/%s/%s' % (ts.month, ts.day, ts.year)

    out = []
    for k in range(min(len(ps), n)):
        p = ps[k]
        idvv = '%sv%d' % (p['_rawid'], p['_version'])
        struct = {
            'title': p['title'],
            'pid': idvv,
            'rawpid': p['_rawid'],
            'category': p['arxiv_primary_category']['term'],
            'authors': [a['name'] for a in p['authors']],
            'link': p['link'],
            'in_library': 1 if p['_rawid'] in libids else 0,
        }
        if send_abstracts:
            struct['abstract'] = p['summary']
        if send_images:
            struct['img'] = '/static/thumbs/' + idvv + '.pdf.jpg'
        struct['tags'] = [t['term'] for t in p['tags']]
        # render time information nicely
        struct['published_time'] = _as_mdy(p['updated'])
        struct['originally_published_time'] = _as_mdy(p['published'])
        # fetch amount of discussion on this paper
        struct['num_discussion'] = comments.count({ 'pid': p['_rawid'] })
        # arxiv comments from the authors (when they submit the paper)
        cc = p.get('arxiv_comment', '')
        if len(cc) > 100:
            cc = cc[:100] + '...'  # crop very long comments
        struct['comment'] = cc
        out.append(struct)
    return out
def papers_from_svm(recent_days=None):
    """SVM-based recommendations for the logged-in user, excluding papers
    already saved in their library; optionally restricted to recent ones.

    Returns [] when nobody is logged in or no model exists for the user.
    """
    if not g.user:
        return []
    uid = session['user_id']
    if uid not in user_sim:
        return []
    # fetch the user's library so already-saved papers can be excluded
    rows = query_db('''select * from library where user_id = ?''', [uid])
    saved = {strip_version(r['paper_id']) for r in rows}
    recs = [db[p] for p in user_sim[uid] if p not in saved]
    if recent_days is not None:
        # keep only papers published within the requested window
        now = int(time.time())  # in seconds
        window = recent_days * 24 * 60 * 60
        recs = [p for p in recs if now - p['time_published'] < window]
    return recs
def papers_similar(pid):
    """Stub: similarity lookup is disabled; always returns an empty list.

    Fix: removed the unused local (the original computed strip_version(pid)
    and discarded the result).
    """
    return []
def get_similar(pid):
    """Return the precomputed similar-paper ids for a versioned pid.

    Returns a fresh list (callers may mutate it), or [] when no
    similarities are stored for this exact id.

    Fixes: removed the unused local rawpid = strip_version(pid); replaced
    the identity comprehension with list().
    """
    if pid in sim_dict:
        return list(sim_dict[pid])
    return []
# NOTE(review): this chunk starts mid-function — the line below is the tail of
# query_db, whose def appears earlier in the file.
    return (rv[0] if rv else None) if one else rv

# -----------------------------------------------------------------------------
# script body: train one linear SVM per user over the tfidf matrix to produce
# per-user paper recommendations

# fetch all users
users = query_db('''select * from user''')
print('number of users: ', len(users))

# load the tfidf matrix and meta
meta = pickle.load(open(Config.meta_path, 'rb'))
out = pickle.load(open(Config.tfidf_path, 'rb'))
X = out['X']
X = X.todense().astype(np.float32)  # densify; float32 to cut memory/train time

# map version-stripped paper id -> row index in X
xtoi = {strip_version(x): i for x, i in meta['ptoi'].items()}

user_sim = {}  # user_id -> recommendations (filled in by the loop below)
for ii, u in enumerate(users):
    print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib] # raw pids without version
    # keep only library papers that exist in the tfidf matrix
    posix = [xtoi[p] for p in pids if p in xtoi]
    if not posix: continue # empty library for this user maybe?
    print(pids)
    # binary target vector: 1 marks this user's library papers
    # (loop body continues beyond this chunk)
    y = np.zeros(X.shape[0])
# NOTE(review): this chunk starts mid-function — the two lines below are the
# tail of query_db, whose def appears earlier in the file.
    rv = cur.fetchall()
    return (rv[0] if rv else None) if one else rv

# -----------------------------------------------------------------------------
# script body: train one linear SVM per user over the tfidf matrix to produce
# per-user paper recommendations

# fetch all users
users = query_db('''select * from user''')
print('number of users: ', len(users))

# load the tfidf matrix and meta
meta = pickle.load(open(Config.meta_path, 'rb'))
out = pickle.load(open(Config.tfidf_path, 'rb'))
X = out['X']
X = X.todense()  # densify the sparse tfidf matrix for SVM training

# map version-stripped paper id -> row index in X
xtoi = { strip_version(x):i for x,i in meta['ptoi'].items() }

user_sim = {}  # user_id -> recommendations (filled in by the loop below)
for ii,u in enumerate(users):
    print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib] # raw pids without version
    # keep only library papers that exist in the tfidf matrix
    posix = [xtoi[p] for p in pids if p in xtoi]
    if not posix: continue # empty library for this user maybe?
    print(pids)
    # binary target vector: 1 marks this user's library papers
    # (SVM fitting continues beyond this chunk)
    y = np.zeros(X.shape[0])
    for ix in posix: y[ix] = 1
def get_similar(pid):
    """Return ids of nearest-neighbor papers for pid (version stripped).

    Raises ValueError if the stripped pid is not in the pids list.
    """
    raw = strip_version(pid)
    row = pids.index(raw)
    _, neighbor_idx = nbrs.kneighbors([X[row]])
    return [pids[j] for j in neighbor_idx[0]]
def create_html(args, db, pcates):
    """Write an HTML report of the papers in db to args.report_path.

    The report has a contents table followed by per-paper summaries; when
    args.printable_format_a4 == 1 both sections are paginated every
    args.number_break_contents / args.number_break_summary entries.

    Fixes: the output file is now opened with a context manager (it leaked
    on any exception), and the summary-loop page closers tested
    i == len(db_list), which is never true for enumerate indices — they now
    use len(db_list) - 1, matching the contents loop, so the final
    </center></page> is actually emitted.
    """
    db_list = list(db.keys())
    sort_by = date_sort_by(args.date_sort_by).split('_')[0]
    fname = '{d}-{ds}-{n}-{c}.html'.format(d=args.report_date, ds=sort_by, n=len(db), c='+'.join(pcates))
    path = os.path.join(args.report_path, fname)
    with open(path, 'w') as html:
        html.write('<html><head>')
        html.write('<title>arXiv {ds} {d}</title>'.format(ds=sort_by, d=args.report_date))
        html.write('<style type="text/css">')
        css = add_css()
        html.write(css)
        html.write('</style></head><body>')
        html.write('<a id="top"></a>')
        # make contents list
        for i, e in enumerate(db_list):
            # open a new page/table at each page break (A4) or once (non-A4)
            if (i % args.number_break_contents == 0) and (args.printable_format_a4 == 1):
                html.write(
                    '<page size="A4"><center><h1>Report arXiv {ds} {d}</h1>'.
                    format(ds=sort_by, d=args.report_date))
                html.write('<table width="750px">')
                html.write(
                    '<tr><th>N</th><th>ID</th><th>Title</th><th>P.C.</th></tr>')
            elif (i == 0) and (args.printable_format_a4 == 0):
                html.write('<page><center><h1>Report arXiv {ds} {d}</h1>'.format(
                    ds=sort_by, d=args.report_date))
                html.write('<table width="750px">')
                html.write(
                    '<tr><th>N</th><th>ID</th><th>Title</th><th>P.C.</th></tr>')
            html.write('<tr><td>{}</td>'.format(i + 1))
            html.write('<td align="center"><a href="{link}">{id}</a></td>'.format(
                link=strip_version(db[e]['id']), id=e))
            html.write('<td><a href="#{aid}">{t}</a></td>'.format(
                aid=e, t=db[e]['title']))
            html.write('<td>{}</td></tr>'.format(
                db[e]['arxiv_primary_category']['term']))
            # close the table at each page break or on the last entry
            if (((i + 1) % args.number_break_contents == 0) or
                    (i == len(db_list) - 1)) and (args.printable_format_a4 == 1):
                html.write('</table>')
                html.write('</center></page>')
            elif (i == len(db_list) - 1) and (args.printable_format_a4 == 0):
                html.write('</table>')
                html.write('</center></page>')
        # make summary details
        for i, e in enumerate(db_list):
            if (i % args.number_break_summary == 0) and (args.printable_format_a4 == 1):
                _end = (i + args.number_break_summary) if len(db_list) > (
                    i + args.number_break_summary) else len(db_list)
                html.write(
                    '<page size="A4"><center><h3>Papers {s} - {e}</h3>'.format(
                        s=i + 1, e=_end))
            elif (i == 0) and (args.printable_format_a4 == 0):
                html.write('<page><center><h3>Papers {s} - {e}</h3>'.format(
                    s=i + 1, e=len(db_list)))
            html.write('<p><table width="750px">')
            # Number, ID, Tags
            html.write(
                '<tr><td style="padding: 0; border: none;"><table style="border: none;" width="750px">'
            )
            html.write(
                '<tr><td align="center" width="30px" style="border: none;"><a id="{aid}"><b>{n}</b></a></td>'
                .format(aid=e, n=i + 1))
            html.write(
                '<td align="center" width="250px" style="{s}"><a href="{link}">{id}</a></td>'
                .format(
                    s=
                    'border: none; border-right: 1px black solid; border-left: 1px black solid;',
                    link=strip_version(db[e]['id']),
                    id=strip_version(db[e]['id']),
                ))
            tags = [t['term'] for t in db[e]['tags']]
            html.write('<td width="450px" style="border: none;">{}</td>'.format(
                ' | '.join(tags)))
            html.write(
                '<td width="20px" style="border: none;"><a href="#top">top</a></td></tr>'
            )
            html.write('</table></td></tr>')
            # Authors
            authors = [a['name'] for a in db[e]['authors']]
            html.write('<tr><td>{}</td></tr>'.format(', '.join(authors)))
            # Title
            html.write('<tr><td>{}</td></tr>'.format(db[e]['title']))
            # Summary
            html.write('<tr><td>{}</td></tr>'.format(db[e]['summary']))
            html.write('</table></p>')
            # close the page at each break or on the last entry (was
            # i == len(db_list), which never fires for 0-based indices)
            if (((i + 1) % args.number_break_summary == 0) or
                    (i == len(db_list) - 1)) and (args.printable_format_a4 == 1):
                html.write('</center></page>')
            elif (i == len(db_list) - 1) and (args.printable_format_a4 == 0):
                html.write('</center></page>')
        html.write('</body></html>')
    print('saved {}'.format(fname))
def get_tags(doc_id):
    """Return the list of tag terms stored for paper doc_id in db."""
    return [tag['term'] for tag in db[doc_id]['tags']]


# evaluation script: for each paper with a text file on disk, check how many
# of its recommended similar papers share at least one arXiv tag with it.
top_n = 10  # assumed number of recommendations per query — TODO confirm
# alternative similarity backends kept for reference:
# def get_similar(doc_id):
#     return get_similar_t(doc_id)
#     return d2v_get_similar(doc_id)
#     return [ doc for doc, _ in model.docvecs.most_similar(doc_id)[:top_n]]

files = set(os.listdir(Config.txt_dir))
total_sim = 0  # recommendations sharing >= 1 tag with their query
file_num = 0   # number of query papers evaluated
for i, f in enumerate(files):
    doc_id = f.split('.pdf')[0]  # filename like '<id>.pdf.txt' -> versioned id
    # print(doc_id)
    query_tags = set(get_tags(strip_version(doc_id)))
    similar_doc = get_similar(doc_id)
    # if not similar_doc: continue
    file_num += 1
    for doc in similar_doc:
        # print(db[doc[0].split('v')[0]]['tags']['term'])
        doc_tags = get_tags(strip_version(doc))
        inter = set(doc_tags) & query_tags
        if inter:
            total_sim += 1
        # else:
        #     print(doc_tags, query_tags)
# fraction of recommendations that share a tag with their query paper
print(f'{total_sim}/{file_num*top_n}: {total_sim/file_num/top_n}')
def get_similar(pid, top_n=10):
    """Return ids of the top_n most similar papers to pid.

    pid may include a version suffix, which is stripped before lookup.
    Looks up the precomputed ranking matrix IX, where IX[rank][col] gives
    the row index of the rank-th neighbor of the paper in column col; rank 0
    appears to be the query itself and is skipped — TODO confirm.
    Raises ValueError if the stripped pid is not in pids.

    Generalized: the hard-coded range(1, 11) is now the backward-compatible
    top_n parameter (default 10 preserves old behavior).
    """
    pid = strip_version(pid)
    col = pids.index(pid)
    return [pids[IX[rank][col]] for rank in range(1, top_n + 1)]
# NOTE(review): this fragment begins inside enclosing loops that are above this
# chunk — 'user', 'collection', 'lib' and 'num_recommendations' are bound by
# code not visible here; indentation below is reconstructed.
print('lib', lib)
pids = [x['paperId'] for x in lib]  # NOTE(review): key is 'paperId' here, unlike 'paper_id' elsewhere — confirm schema
# keep only papers present in the tfidf matrix (xtoi: id -> row index)
posix = [xtoi[p] for p in pids if p in xtoi]
if not posix: continue # empty library for this user maybe?
print(pids)
print('posix:', posix)
# binary target vector: 1 marks papers in this collection
y = np.zeros(X.shape[0])
for ix in posix: y[ix] = 1
# linear SVM separating this collection's papers from everything else
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
clf.fit(X, y)
# rank all papers by decreasing decision value
s = clf.decision_function(X)
sortix = np.argsort(-s)
sortix = sortix[:min(num_recommendations, len(
    sortix))]  # crop paper recommendations to save space
user_sim[user][collection] = [
    strip_version(meta['pids'][ix]) for ix in list(sortix)
]
print(user_sim)
print('writing', Config.user_sim_path)
safe_pickle_dump(user_sim, Config.user_sim_path)