Пример #1
0
def papers_from_svm(recent_days=None):
    """Return the SVM-recommended papers for the logged-in user.

    Papers already present in the user's library are left out.  When
    ``recent_days`` is given, only papers whose ``time_published`` lies
    within that many days of the current time are kept.

    Returns an empty list when nobody is logged in or when ``user_sim``
    has no entry for the user.
    """
    if not g.user:
        return []

    user_id = session["user_id"]
    if user_id not in user_sim:
        return []

    # collect what the user already saved so it can be excluded below
    saved = set()
    for row in query_db("""select * from library where user_id = ?""", [user_id]):
        saved.add(strip_version(row["paper_id"]))

    recommended = []
    for paper_id in user_sim[user_id]:
        if paper_id in saved:
            continue
        recommended.append(db[paper_id])

    if recent_days is None:
        return recommended

    # keep only the most recent papers
    now = int(time.time())  # in seconds
    return [
        paper
        for paper in recommended
        if now - paper["time_published"] < recent_days * 24 * 60 * 60
    ]
Пример #2
0
def review():
    """Toggle a paper in the logged-in user's library.

    Reads the versioned paper id from the ``pid`` form field.

    Returns:
        "NO" when nobody is logged in, the id is malformed, or the paper is
        not in ``db``; "OFF" when an existing library row was deleted; "ON"
        when a new library row was inserted.
    """

    # make sure user is logged in
    if not g.user:
        return "NO"  # fail... (not logged in). JS should prevent us from getting here.

    idvv = request.form["pid"]  # includes version
    if not isvalidid(idvv):
        return "NO"  # fail, malformed id
    pid = strip_version(idvv)
    if pid not in db:
        return "NO"  # we don't know this paper

    uid = session["user_id"]  # id of logged in user

    # check whether this user already has this paper in the library
    record = query_db(
        """select * from library where user_id = ? and paper_id = ?""",
        [uid, pid],
        one=True,
    )

    if record:
        # record exists, erase it
        g.db.execute(
            """delete from library where user_id = ? and paper_id = ?""", [uid, pid]
        )
        ret = "OFF"
    else:
        # record does not exist, add it; use the same stripped id as the
        # lookup and delete above so all three statements agree on the key
        g.db.execute(
            """insert into library (paper_id, user_id, update_time) values (?, ?, ?)""",
            [pid, uid, int(time.time())],
        )
        ret = "ON"

    g.db.commit()
    return ret
Пример #3
0
def papers_from_library():
    """Return the logged-in user's saved papers, newest ``updated`` first.

    Library rows whose paper id is not present in ``db`` are ignored.
    Returns an empty list when nobody is logged in.
    """
    if not g.user:
        return []

    # user is logged in, fetch their saved library rows
    rows = query_db(
        """select * from library where user_id = ?""", [session["user_id"]]
    )

    papers = []
    for row in rows:
        raw_id = strip_version(row["paper_id"])
        if raw_id in db:
            papers.append(db[raw_id])

    papers.sort(key=lambda paper: paper["updated"], reverse=True)
    return papers
Пример #4
0
def papers_from_library():
  """Return the logged-in user's saved papers, most recently updated first.

  Returns:
    A list of paper records from ``db`` sorted by their ``updated`` field in
    descending order; an empty list when nobody is logged in.
  """
  out = []
  if g.user:
    # user is logged in, lets fetch their saved library data
    uid = session['user_id']
    user_library = query_db('''select * from library where user_id = ?''', [uid])
    libids = [strip_version(x['paper_id']) for x in user_library]
    # a library row may reference a paper that is not in db; skip it
    # instead of failing the whole request with a KeyError
    out = [db[x] for x in libids if x in db]
    out = sorted(out, key=lambda k: k['updated'], reverse=True)
  return out
Пример #5
0
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
    """Encode at most ``n`` leading papers of ``ps`` as plain dicts.

    Each dict carries title, ids, category, authors, link, an ``in_library``
    flag (1/0) for the logged-in user, tags, formatted dates, the number of
    ``comments`` entries for the paper and the (cropped) arXiv comment.
    ``abstract`` and ``img`` are included only when ``send_abstracts`` /
    ``send_images`` are truthy.
    """

    def _mdy(stamp):
        # render a timestamp string as month/day/year
        when = dateutil.parser.parse(stamp)
        return "%s/%s/%s" % (when.month, when.day, when.year)

    saved_ids = set()
    if g.user:
        # user is logged in, fetch the ids of their saved papers
        rows = query_db(
            """select * from library where user_id = ?""", [session["user_id"]]
        )
        saved_ids = {strip_version(row["paper_id"]) for row in rows}

    encoded = []
    for idx in range(min(len(ps), n)):
        paper = ps[idx]
        raw_id = paper["_rawid"]
        versioned_id = "%sv%d" % (raw_id, paper["_version"])

        entry = {
            "title": paper["title"],
            "pid": versioned_id,
            "rawpid": raw_id,
            "category": paper["arxiv_primary_category"]["term"],
            "authors": [author["name"] for author in paper["authors"]],
            "link": paper["link"],
            "in_library": int(raw_id in saved_ids),
        }
        if send_abstracts:
            entry["abstract"] = paper["summary"]
        if send_images:
            entry["img"] = "/static/thumbs/" + versioned_id + ".pdf.jpg"
        entry["tags"] = [tag["term"] for tag in paper["tags"]]

        # "published_time" is derived from the update stamp,
        # "originally_published_time" from the first publication stamp
        entry["published_time"] = _mdy(paper["updated"])
        entry["originally_published_time"] = _mdy(paper["published"])

        # amount of discussion on this paper
        entry["num_discussion"] = comments.count({"pid": raw_id})

        # arxiv comment supplied by the authors; crop very long ones
        note = paper.get("arxiv_comment", "")
        if len(note) > 100:
            note = note[:100] + "..."
        entry["comment"] = note

        encoded.append(entry)
    return encoded
Пример #6
0
def friends():
    """Render the "friends" view: papers saved by people the user follows.

    The ``timefilter`` query argument ("day", "3days", "week", "month",
    "year"; default and fallback is a week) limits papers by their
    ``time_published``.  Papers are ordered by how many followed users saved
    them and capped at 100.  Renders ``main.html`` with a context built by
    ``default_context``.
    """

    ttstr = request.args.get("timefilter", "week")  # default is week
    legend = {"day": 1, "3days": 3, "week": 7, "month": 30, "year": 365}
    tt = legend.get(ttstr, 7)

    papers = []
    pid_to_users = {}
    if g.user:
        # gather all the people we are following
        username = get_username(session["user_id"])
        # map raw paper id -> list of followed users who saved it
        counts = {}
        for edict in follow_collection.find({"who": username}):
            whom = edict["whom"]
            uid = get_user_id(whom)
            user_library = query_db(
                """select * from library where user_id = ?""", [uid]
            )
            for row in user_library:
                counts.setdefault(strip_version(row["paper_id"]), []).append(whom)

        # descending by number of friends who saved the paper
        keys = sorted(counts, key=lambda k: len(counts[k]), reverse=True)
        # a friend's library may reference a paper that is not in db;
        # skip it instead of failing the whole page with a KeyError
        papers = [db[x] for x in keys if x in db]
        # finally filter by date
        curtime = int(time.time())  # in seconds
        papers = [
            x for x in papers if curtime - x["time_published"] < tt * 24 * 60 * 60
        ]
        # trim at 100 (slicing is a no-op for shorter lists)
        papers = papers[:100]
        # trim counts as well correspondingly
        pid_to_users = {p["_rawid"]: counts.get(p["_rawid"], []) for p in papers}

    if not g.user:
        msg = "You must be logged in and follow some people to enjoy this tab."
    elif not papers:
        msg = "No friend papers present. Try to extend the time range, or add friends by clicking on your account name (top, right)"
    else:
        msg = "Papers in your friend's libraries:"

    ctx = default_context(
        papers, render_format="friends", pid_to_users=pid_to_users, msg=msg
    )
    return render_template("main.html", **ctx)
Пример #7
0
def papers_similar(pid):
    """Return the papers similar to ``pid`` (a paper id that includes a version).

    Returns:
        An empty list when the paper is not in ``db``; the records listed in
        ``sim_dict`` for this exact version when available; otherwise the
        records listed for another version of the same paper; otherwise a
        one-element list holding just the paper itself.
    """
    rawpid = strip_version(pid)

    # check if we have this paper at all, otherwise return empty list
    if rawpid not in db:
        return []

    # simplest case: we have similarities for this specific version
    if pid in sim_dict:
        return [db[strip_version(k)] for k in sim_dict[pid]]

    # We don't have this specific version. It could be a stale URL that points
    # to e.g. v1 of a paper while only v2 is on file now, so fall back to the
    # first other version of the same paper found in sim_dict.  Compare the
    # version-stripped ids: a plain substring test would also match a
    # different paper whose id merely contains rawpid.
    for other in sim_dict:
        if strip_version(other) == rawpid:
            return [db[strip_version(k)] for k in sim_dict[other]]

    # we don't have similarities for it for some reason: return just the paper
    return [db[rawpid]]
Пример #8
0
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
    """Turn the first ``min(len(ps), n)`` papers into JSON-friendly dicts.

    ``in_library`` is 1 for papers saved by the logged-in user and 0
    otherwise.  ``abstract`` / ``img`` are emitted only when
    ``send_abstracts`` / ``send_images`` are truthy.
    """
    if g.user:
        # user is logged in, fetch their saved library data
        user_library = query_db('''select * from library where user_id = ?''',
                                [session['user_id']])
        libids = set(strip_version(row['paper_id']) for row in user_library)
    else:
        libids = set()

    def short_date(raw):
        # month/day/year rendering of a timestamp string
        parsed = dateutil.parser.parse(raw)
        return '%s/%s/%s' % (parsed.month, parsed.day, parsed.year)

    def encode_one(p):
        rawid = p['_rawid']
        idvv = '%sv%d' % (rawid, p['_version'])
        struct = {'title': p['title'], 'pid': idvv, 'rawpid': rawid}
        struct['category'] = p['arxiv_primary_category']['term']
        struct['authors'] = [a['name'] for a in p['authors']]
        struct['link'] = p['link']
        struct['in_library'] = 1 if rawid in libids else 0
        if send_abstracts:
            struct['abstract'] = p['summary']
        if send_images:
            struct['img'] = '/static/thumbs/' + idvv + '.pdf.jpg'
        struct['tags'] = [t['term'] for t in p['tags']]

        # render time information nicely
        struct['published_time'] = short_date(p['updated'])
        struct['originally_published_time'] = short_date(p['published'])

        # amount of discussion on this paper
        struct['num_discussion'] = comments.count({'pid': rawid})

        # arxiv comment from the authors, cropped when very long
        cc = p.get('arxiv_comment', '')
        struct['comment'] = cc[:100] + '...' if len(cc) > 100 else cc
        return struct

    return [encode_one(ps[i]) for i in range(min(len(ps), n))]
def friends():
    """Render main.html with the papers found in followed users' libraries.

    The ``timefilter`` query argument selects the age window in days
    (day=1, 3days=3, week=7, month=30, year=365; unknown values fall back to
    7).  Papers are sorted by the number of followed users that saved them
    and limited to 100.
    """

    ttstr = request.args.get('timefilter', 'week')  # default is week
    legend = {'day': 1, '3days': 3, 'week': 7, 'month': 30, 'year': 365}
    tt = legend.get(ttstr, 7)

    papers = []
    pid_to_users = {}
    if g.user:
        # gather all the people we are following
        username = get_username(session['user_id'])
        edges = list(follow_collection.find({'who': username}))
        # fetch all papers in all of their libraries, and count the top ones
        counts = {}
        for edict in edges:
            whom = edict['whom']
            uid = get_user_id(whom)
            user_library = query_db('''select * from library where user_id = ?''', [uid])
            for x in user_library:
                lid = strip_version(x['paper_id'])
                if lid not in counts:
                    counts[lid] = []
                counts[lid].append(whom)

        keys = list(counts)
        keys.sort(key=lambda k: len(counts[k]), reverse=True)  # descending by count
        # library rows can point at papers that are not in db; ignore those
        # rather than raising KeyError and breaking the whole tab
        papers = [db[x] for x in keys if x in db]
        # finally filter by date
        curtime = int(time.time())  # in seconds
        papers = [x for x in papers if curtime - x['time_published'] < tt*24*60*60]
        # trim at 100
        if len(papers) > 100:
            papers = papers[:100]
        # trim counts as well correspondingly
        pid_to_users = {p['_rawid']: counts.get(p['_rawid'], []) for p in papers}

    if not g.user:
        msg = "You must be logged in and follow some people to enjoy this tab."
    else:
        if len(papers) == 0:
            msg = "No friend papers present. Try to extend the time range, or add friends by clicking on your account name (top, right)"
        else:
            msg = "Papers in your friend's libraries:"

    ctx = default_context(papers, render_format='friends', pid_to_users=pid_to_users, msg=msg)
    return render_template('main.html', **ctx)
Пример #10
0
def encode_json(ps, n=10, send_images=True, send_abstracts=True):
  """Build the list of per-paper dicts for the first min(len(ps), n) papers.

  The 'in_library' flag reflects the library of the logged-in user (always 0
  when nobody is logged in).  'abstract' and 'img' are optional and governed
  by send_abstracts / send_images.
  """

  in_library = set()
  if g.user:
    # user is logged in, collect the raw ids of their saved papers
    for row in query_db('''select * from library where user_id = ?''', [session['user_id']]):
      in_library.add(strip_version(row['paper_id']))

  result = []
  count = min(len(ps), n)
  pos = 0
  while pos < count:
    paper = ps[pos]
    pos += 1

    rawid = paper['_rawid']
    versioned = '%sv%d' % (rawid, paper['_version'])
    item = {}
    item['title'] = paper['title']
    item['pid'] = versioned
    item['rawpid'] = rawid
    item['category'] = paper['arxiv_primary_category']['term']
    item['authors'] = [author['name'] for author in paper['authors']]
    item['link'] = paper['link']
    item['in_library'] = 1 if rawid in in_library else 0
    if send_abstracts:
      item['abstract'] = paper['summary']
    if send_images:
      item['img'] = '/static/thumbs/' + versioned + '.pdf.jpg'
    item['tags'] = [tag['term'] for tag in paper['tags']]

    # dates rendered as month/day/year
    updated = dateutil.parser.parse(paper['updated'])
    item['published_time'] = '%s/%s/%s' % (updated.month, updated.day, updated.year)
    published = dateutil.parser.parse(paper['published'])
    item['originally_published_time'] = '%s/%s/%s' % (published.month, published.day, published.year)

    # amount of discussion on this paper
    item['num_discussion'] = comments.count({ 'pid': rawid })

    # arxiv comment from the authors; very long ones are cropped
    remark = paper.get('arxiv_comment', '')
    if len(remark) > 100:
      remark = remark[:100] + '...'
    item['comment'] = remark

    result.append(item)
  return result
def papers_from_svm(recent_days=None):
  """List the papers in user_sim for the logged-in user, minus saved ones.

  With recent_days set, papers whose time_published is recent_days days or
  more before now are dropped.  Returns [] when nobody is logged in or the
  user has no entry in user_sim.
  """
  if not g.user:
    return []

  uid = session['user_id']
  if uid not in user_sim:
    return []

  # papers already in the user's library are excluded from the result
  library_rows = query_db('''select * from library where user_id = ?''', [uid])
  owned = {strip_version(row['paper_id']) for row in library_rows}
  candidates = [db[pid] for pid in user_sim[uid] if pid not in owned]

  if recent_days is None:
    return candidates

  # keep only the most recent papers
  now = int(time.time()) # in seconds
  return [paper for paper in candidates
          if now - paper['time_published'] < recent_days*24*60*60]
Пример #12
0
def papers_similar(pid):
    """Placeholder implementation: always returns an empty list.

    ``strip_version(pid)`` is still invoked, but its result is discarded.
    """
    strip_version(pid)
    return []
Пример #13
0
def get_similar(pid):
    """Return a list copy of ``sim_dict[pid]``, or ``[]`` when ``pid`` is absent.

    The lookup uses ``pid`` exactly as given; ``strip_version(pid)`` is
    called but its result is not used.
    """
    strip_version(pid)
    if pid not in sim_dict:
        return []
    # simplest case: similarities are stored for this exact id
    return list(sim_dict[pid])
Пример #14
0
    return (rv[0] if rv else None) if one else rv


# -----------------------------------------------------------------------------

# fetch all users
users = query_db('''select * from user''')
print('number of users: ', len(users))

# load the tfidf matrix and meta
# NOTE(review): pickle.load can execute arbitrary code from the file; these
# paths must only ever point at trusted, locally generated data.
# The context managers close the files instead of leaking the handles.
with open(Config.meta_path, 'rb') as meta_file:
    meta = pickle.load(meta_file)
with open(Config.tfidf_path, 'rb') as tfidf_file:
    out = pickle.load(tfidf_file)
X = out['X']
X = X.todense().astype(np.float32)

# map version-stripped paper id -> index from meta['ptoi']
xtoi = {strip_version(x): i for x, i in meta['ptoi'].items()}

user_sim = {}
for ii, u in enumerate(users):
    print("%d/%d building an SVM for %s" %
          (ii, len(users), u['username'].encode('utf-8')))
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    # indices of the library papers that are known to the tfidf meta
    posix = [xtoi[p] for p in pids if p in xtoi]

    if not posix:
        continue  # empty library for this user maybe?

    print(pids)
    # one label slot per row of X
    y = np.zeros(X.shape[0])
  rv = cur.fetchall()
  return (rv[0] if rv else None) if one else rv

# -----------------------------------------------------------------------------

# fetch all users
users = query_db('''select * from user''')
print('number of users: ', len(users))

# load the tfidf matrix and meta
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load trusted, locally generated pickles here.
# Opening through `with` closes the files instead of leaking the handles.
with open(Config.meta_path, 'rb') as meta_file:
  meta = pickle.load(meta_file)
with open(Config.tfidf_path, 'rb') as tfidf_file:
  out = pickle.load(tfidf_file)
X = out['X']
X = X.todense()

# map version-stripped paper id -> index from meta['ptoi']
xtoi = { strip_version(x):i for x,i in meta['ptoi'].items() }

user_sim = {}
for ii,u in enumerate(users):
  print("%d/%d building an SVM for %s" % (ii, len(users), u['username'].encode('utf-8')))
  uid = u['user_id']
  lib = query_db('''select * from library where user_id = ?''', [uid])
  pids = [x['paper_id'] for x in lib] # raw pids without version
  # indices of the library papers that are known to the tfidf meta
  posix = [xtoi[p] for p in pids if p in xtoi]

  if not posix:
    continue # empty library for this user maybe?

  print(pids)
  # label vector: 1 for papers in the user's library, 0 elsewhere
  y = np.zeros(X.shape[0])
  y[posix] = 1
def get_similar(pid):
    """Return the ids of the neighbours ``nbrs`` reports for paper ``pid``.

    The version is stripped from ``pid`` and its position in ``pids`` selects
    the row of ``X`` used as the query.  Raises ValueError (from
    ``list.index``) when the stripped id is not in ``pids``.
    """
    row = pids.index(strip_version(pid))
    # kneighbors returns (distances, indices); only the indices are needed
    _, neighbor_rows = nbrs.kneighbors([X[row]])
    return [pids[j] for j in neighbor_rows[0]]
Пример #17
0
def create_html(args, db, pcates):
    """Write an HTML report of the papers in ``db`` and print its file name.

    The report consists of a contents table (number, id link, title, primary
    category) followed by one details table per paper (id, tags, authors,
    title, summary).  When ``args.printable_format_a4`` is 1 both sections
    are split into ``<page size="A4">`` chunks of
    ``args.number_break_contents`` / ``args.number_break_summary`` entries;
    when it is 0 each section is a single ``<page>``.

    Args:
        args: options object; reads ``date_sort_by``, ``report_date``,
            ``report_path``, ``printable_format_a4``,
            ``number_break_contents`` and ``number_break_summary``.
        db: mapping of paper id to a record with the keys ``id``, ``title``,
            ``summary``, ``authors``, ``tags`` and
            ``arxiv_primary_category``.
        pcates: iterable of strings joined with ``+`` into the file name.
    """
    db_list = list(db.keys())
    total = len(db_list)
    last = total - 1  # index of the final entry
    a4 = args.printable_format_a4
    sort_by = date_sort_by(args.date_sort_by).split('_')[0]
    fname = '{d}-{ds}-{n}-{c}.html'.format(d=args.report_date,
                                           ds=sort_by,
                                           n=len(db),
                                           c='+'.join(pcates))
    path = os.path.join(args.report_path, fname)

    # the context manager closes the file even when a record is malformed
    with open(path, 'w') as out:
        write = out.write

        write('<html><head>')
        write('<title>arXiv {ds} {d}</title>'.format(ds=sort_by,
                                                     d=args.report_date))
        write('<style type="text/css">')
        write(add_css())
        write('</style></head><body>')
        write('<a id="top"></a>')

        # make contents list
        for i, e in enumerate(db_list):
            paper = db[e]
            if (i % args.number_break_contents == 0) and (a4 == 1):
                # start of a new A4 page
                write('<page size="A4"><center><h1>Report arXiv {ds} {d}</h1>'.
                      format(ds=sort_by, d=args.report_date))
                write('<table width="750px">')
                write('<tr><th>N</th><th>ID</th><th>Title</th><th>P.C.</th></tr>')
            elif (i == 0) and (a4 == 0):
                # single unpaged section
                write('<page><center><h1>Report arXiv {ds} {d}</h1>'.format(
                    ds=sort_by, d=args.report_date))
                write('<table width="750px">')
                write('<tr><th>N</th><th>ID</th><th>Title</th><th>P.C.</th></tr>')
            write('<tr><td>{}</td>'.format(i + 1))
            write('<td align="center"><a href="{link}">{id}</a></td>'.format(
                link=strip_version(paper['id']), id=e))
            write('<td><a href="#{aid}">{t}</a></td>'.format(
                aid=e, t=paper['title']))
            write('<td>{}</td></tr>'.format(
                paper['arxiv_primary_category']['term']))
            if (((i + 1) % args.number_break_contents == 0) or
                    (i == last)) and (a4 == 1):
                write('</table>')
                write('</center></page>')
            elif (i == last) and (a4 == 0):
                write('</table>')
                write('</center></page>')

        # make summary details
        for i, e in enumerate(db_list):
            paper = db[e]
            if (i % args.number_break_summary == 0) and (a4 == 1):
                # last paper number shown on this A4 page
                _end = min(total, i + args.number_break_summary)
                write('<page size="A4"><center><h3>Papers {s} - {e}</h3>'.format(
                    s=i + 1, e=_end))
            elif (i == 0) and (a4 == 0):
                write('<page><center><h3>Papers {s} - {e}</h3>'.format(
                    s=i + 1, e=total))
            write('<p><table width="750px">')
            # Number, ID, Tags
            write(
                '<tr><td style="padding: 0; border: none;"><table style="border: none;" width="750px">'
            )
            write(
                '<tr><td align="center" width="30px" style="border: none;"><a id="{aid}"><b>{n}</b></a></td>'
                .format(aid=e, n=i + 1))
            write(
                '<td align="center" width="250px" style="{s}"><a href="{link}">{id}</a></td>'
                .format(
                    s='border: none; border-right: 1px black solid; border-left: 1px black solid;',
                    link=strip_version(paper['id']),
                    id=strip_version(paper['id']),
                ))
            tags = [t['term'] for t in paper['tags']]
            write('<td width="450px" style="border: none;">{}</td>'.format(
                ' | '.join(tags)))
            write(
                '<td width="20px" style="border: none;"><a href="#top">top</a></td></tr>'
            )
            write('</table></td></tr>')
            # Authors
            authors = [a['name'] for a in paper['authors']]
            write('<tr><td>{}</td></tr>'.format(', '.join(authors)))
            # Title
            write('<tr><td>{}</td></tr>'.format(paper['title']))
            # Summary
            write('<tr><td>{}</td></tr>'.format(paper['summary']))
            write('</table></p>')
            # Close the page after the final entry as well.  The index of the
            # final entry is len(db_list) - 1; comparing against len(db_list)
            # never matched, which left the last (or only) page unclosed.
            if (((i + 1) % args.number_break_summary == 0) or
                    (i == last)) and (a4 == 1):
                write('</center></page>')
            elif (i == last) and (a4 == 0):
                write('</center></page>')

        write('</body></html>')

    print('saved {}'.format(fname))
def get_tags(doc_id):
    """Return the ``term`` of every tag stored for paper ``doc_id`` in ``db``."""
    terms = []
    for tag in db[doc_id]['tags']:
        terms.append(tag['term'])
    return terms


top_n = 10
# NOTE: get_similar(doc_id) is not defined in this section; it has to be
# supplied elsewhere (earlier revisions delegated to get_similar_t /
# d2v_get_similar / model.docvecs.most_similar).

# For every text file, count how many of its similar documents share at least
# one tag with it.
files = set(os.listdir(Config.txt_dir))
total_sim = 0
file_num = 0
for f in files:
    doc_id = f.split('.pdf')[0]
    query_tags = set(get_tags(strip_version(doc_id)))
    similar_doc = get_similar(doc_id)
    file_num += 1
    for doc in similar_doc:
        doc_tags = get_tags(strip_version(doc))
        inter = set(doc_tags) & query_tags
        if inter:
            total_sim += 1

# The denominator assumes top_n similar documents per file.  Guard the ratio
# so an empty text directory reports 0.0 instead of raising ZeroDivisionError.
ratio = total_sim / file_num / top_n if file_num else 0.0
print(f'{total_sim}/{file_num*top_n}: {ratio}')
Пример #19
0
def get_similar(pid):
    """Return ten paper ids read from ``IX`` for the paper ``pid``.

    The version is stripped from ``pid`` and its position in ``pids`` selects
    the column of ``IX``; rows 1 through 10 of that column supply the indices
    into ``pids``.  Raises ValueError (from ``list.index``) when the stripped
    id is not in ``pids``.
    """
    column = pids.index(strip_version(pid))
    similar = []
    for rank in range(1, 11):
        similar.append(pids[IX[rank][column]])
    return similar
Пример #20
0
        print('lib', lib)
        pids = [x['paperId'] for x in lib]
        posix = [xtoi[p] for p in pids if p in xtoi]

        if not posix:
            continue  # empty library for this user maybe?

        print(pids)
        print('posix:', posix)
        y = np.zeros(X.shape[0])
        for ix in posix:
            y[ix] = 1

        clf = svm.LinearSVC(class_weight='balanced',
                            verbose=False,
                            max_iter=10000,
                            tol=1e-6,
                            C=0.1)
        clf.fit(X, y)
        s = clf.decision_function(X)

        sortix = np.argsort(-s)
        sortix = sortix[:min(num_recommendations, len(
            sortix))]  # crop paper recommendations to save space
        user_sim[user][collection] = [
            strip_version(meta['pids'][ix]) for ix in list(sortix)
        ]
        print(user_sim)
print('writing', Config.user_sim_path)
safe_pickle_dump(user_sim, Config.user_sim_path)