def add_tab():
    """Add a new tab to the current user.

    Expects a JSON request body of the form::

        {
            name: 'string'      # name of the tab
            priority: integer   # bigger, better. default to be 0
        }

    Returns ``{'success': 1}`` once the tab is saved, or an
    ``{'error': ...}`` dict when the body is malformed or a tab with the
    same name already exists.
    """
    try:
        data = json.loads(request.data)
        name = data['name']
        priority = int(data.get('priority', 0))
    except (AttributeError, KeyError, OverflowError, TypeError, ValueError):
        # Invalid JSON, a body that is not an object, a missing 'name', or
        # a priority that cannot be converted to an integer.
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not isinstance(name, basestring):
        return {'error': 'illegal format'}

    username = current_user.username
    doc = get_user(username)
    if any(tab['name'] == name for tab in doc['tab']):
        return {'error': 'tab with this name already exists!'}
    doc['tab'].append({'name': name, 'priority': priority, 'tags': []})
    get_mongo('user').save(doc)
    log_info('user {0} add tab {1}'.format(username, name))
    return {'success': 1}
def add_tag():
    """Add a tag to one of the current user's tabs.

    Expects a JSON request body of the form::

        {
            name: 'tagname',
            tab: 'tabname'
        }

    Returns ``{'tabs': [...]}`` with the user's tabs after the change, or
    an ``{'error': ...}`` dict when the body is malformed or the tab does
    not exist.
    """
    try:
        data = json.loads(request.data)
        tagname = data['name']
        tabname = data['tab']
    except (KeyError, TypeError, ValueError):
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not (isinstance(tagname, basestring) and
            isinstance(tabname, basestring)):
        return {'error': 'illegal format'}

    username = current_user.username
    u = get_user(username)
    for tab in u['tab']:
        if tabname != tab['name']:
            continue
        # De-duplicate while keeping the existing order; going through
        # set() would reshuffle the user's tags on every call.
        seen = set()
        tags = []
        for tag in tab['tags'] + [tagname]:
            if tag not in seen:
                seen.add(tag)
                tags.append(tag)
        tab['tags'] = tags
        get_mongo('user').save(u)
        log_info('user {0} add tag {1} to tab {2}'.format(
            username, tagname, tabname))
        return {'tabs': u['tab']}
    return {'error': 'no such tab'}
def del_tag():
    """Remove a tag from one of the current user's tabs.

    GET /del_tag?name=xxx&tab=xxx

    Returns ``{'tabs': [...]}`` with the user's tabs after the change, or
    an ``{'error': ...}`` dict when a parameter is missing, the tab does
    not exist, or the tag is not in the tab.
    """
    try:
        tagname = request.args['name']
        tabname = request.args['tab']
    except KeyError:
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not (isinstance(tagname, basestring) and
            isinstance(tabname, basestring)):
        return {'error': 'illegal format'}

    username = current_user.username
    u = get_user(username)
    for tab in u['tab']:
        if tabname != tab['name']:
            continue
        try:
            tab['tags'].remove(tagname)
        except ValueError:
            return {'error': 'tag {0} not in tab {1}'.format(tagname,
                                                             tabname)}
        get_mongo('user').save(u)
        log_info('user {0} del tag {1} in tab {2}'.format(
            username, tagname, tabname))
        return {'tabs': u['tab']}
    return {'error': 'no such tab'}
def set_tag():
    """Replace the tag list of one of the current user's tabs.

    Expects a JSON request body of the form::

        {
            name: ['tagname'],
            tab: 'tabname'
        }

    Returns ``{'tabs': [...]}`` with the user's tabs after the change, or
    an ``{'error': ...}`` dict when the body is malformed or the tab does
    not exist.
    """
    try:
        data = json.loads(request.data)
        tags = data['name']
        tabname = data['tab']
    except (KeyError, TypeError, ValueError):
        return {'error': 'illegal format'}
    # Explicit checks rather than `assert`, which is stripped under -O.
    if not (isinstance(tabname, basestring) and
            isinstance(tags, list) and
            all(isinstance(tag, basestring) for tag in tags)):
        return {'error': 'illegal format'}

    username = current_user.username
    u = get_user(username)
    for tab in u['tab']:
        if tabname != tab['name']:
            continue
        tab['tags'] = tags
        get_mongo('user').save(u)
        log_info('user {0} set tag to {1} on tab {2}'.format(
            username, tags, tabname))
        return {'tabs': u['tab']}
    return {'error': 'no such tab'}
def get_tab_article():
    """Get all articles under a tab.

    GET /get_tab_article?tab=tabname

    Looks up the tab among the current user's tabs and queries the 'item'
    collection for items carrying any of the tab's tags that are either of
    the general fetcher type or of the user fetcher type with
    ``other.user_id`` equal to the current user's name.

    Returns ``{'data': ...}`` with the result of ``parse_article``, or an
    ``{'error': ...}`` dict when the parameter is missing or the tab does
    not exist.
    """
    try:
        tabname = request.args['tab']
    except KeyError:
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not isinstance(tabname, basestring):
        return {'error': 'illegal format'}

    username = current_user.username
    user = get_user(username)
    # First tab with a matching name, or None.
    tab = next((t for t in user['tab'] if t['name'] == tabname), None)
    if tab is None:
        return {'error': 'no such tab'}

    itemdb = get_mongo('item')
    rst = list(itemdb.find(
        {'tag': {'$in': tab['tags']},
         '$or': [
             {'fetcher_type': FETCHER_TYPE_GENERAL},
             {'fetcher_type': FETCHER_TYPE_USER,
              'other.user_id': username},
         ]},
        {'_id': 0}))
    return {'data': parse_article(rst)}
def main():
    """Train the text tagger on the labelled items and dump the model.

    Reads every document of the 'item' collection, skips items without
    tags, and keeps only labels whose occurrence count lies strictly
    between 0.15% and 10% of all label occurrences. The shuffled data is
    truncated to ``MAX_DATA_SIZE`` entries, filtered with
    ``filter_data_label``, fitted, and the tagger is dumped to
    ``ukconfig.tagger_path``.
    """
    label_cnt = defaultdict(int)
    db = get_mongo('item')
    data = []
    for item in db.find():
        labels = item['tag']
        if not labels:
            continue
        for label in labels:
            label_cnt[label] += 1
        # Deserialize only items that are actually used for training.
        desc = ItemDescBase.deserialize(item['desc'])
        data.append((desc.render_content(), labels))

    total_cnt = sum(label_cnt.values())
    # Single-argument print() behaves the same as the Python 2 print
    # statement, so the function uses one spelling throughout.
    print(total_cnt)
    available_labels = set(
        label for label, cnt in label_cnt.items()
        if total_cnt * 0.0015 < cnt < total_cnt * 0.1)
    print('remaining labels:  {0}'.format(len(available_labels)))
    print("#documents: {}".format(len(data)))
    print("training ...\n")

    random.shuffle(data)
    data = data[:MAX_DATA_SIZE]
    data = filter_data_label(data, available_labels)
    tagger = TextTagger(nr_min_word_count=3)
    tagger.fit(data)

    print("writing model...\n")
    tagger.dump(ukconfig.tagger_path)
def rebuild(self):
    """Clear the index and re-add every paper of the 'paper' collection.

    Papers that have no stored text get it extracted from their pdf with
    ``pdf2text``; the extracted text is written back to the paper
    document. Papers without a pdf, or whose pdf cannot be converted, are
    logged and skipped.
    """
    self.indexer.clear()
    db = get_mongo('paper')
    itr = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
    for res in itr:
        text = res.get('text')
        if not text:
            log_info("About to add text for paper {0}".format(res['_id']))
            # Look the pdf up outside the conversion try-block so that a
            # KeyError raised inside pdf2text is not reported as "No pdf".
            try:
                data = res['pdf']
            except KeyError:
                log_err("No pdf in pid={0},title={1}".format(
                    res['_id'], res['title']))
                continue
            try:
                text = pdf2text(data)
            except Exception:
                log_exc("Exception in pdf2text")
                # Nothing was extracted: do not overwrite the stored text
                # with an empty value and do not index an empty document.
                continue
            db.update({'_id': res['_id']}, {'$set': {'text': text}})
        doc = {'text': text, 'title': res['title'], 'id': res['_id']}
        self._do_add_paper(doc)
    self.indexer.flush()
def get_paper_list(name):
    """Return ``(_id, title)`` pairs of the papers by author `name`.

    The name is lower-cased before querying, and only papers that have a
    'pdf' field are returned.
    """
    papers = get_mongo('paper')
    criteria = {'author': name.lower(), 'pdf': {'$exists': True}}
    found = papers.find(criteria, {'title': 1})
    return [(paper["_id"], paper["title"]) for paper in found]
def download():
    """Send the stored pdf of a paper as a file download.

    Reads `pid` from the request values, increments the paper's
    ``download_cnt`` and returns the pdf as an attachment named after the
    beautified paper title.

    Responds with an empty body and status 400 when `pid` is missing or
    not an integer, and with status 404 when the paper does not exist or
    has no pdf.
    """
    try:
        pid = long(request.values.get('pid'))
    except (TypeError, ValueError):
        return make_response(''), 400
    db = get_mongo('paper')
    doc = db.find_and_modify(query={'_id': pid},
                             update={'$inc': {'download_cnt': 1}},
                             fields={'pdf': 1, 'title': 1})
    # Check for a miss before touching the document: find_and_modify
    # returns None when nothing matched.
    if not doc or 'pdf' not in doc:
        return make_response(''), 404

    title = title_beautify(doc['title'])
    resp = make_response(doc['pdf'])
    resp.headers['Content-Type'] = 'application/pdf'
    # NOTE: chrome doesn't work with comma in filename
    # TODO deal with unicode name!
    resp.headers['Content-Disposition'] = \
        'attachment; filename="{0}.pdf"'.format(title)
    return resp
def download():
    """Send the stored pdf of a paper as a file download.

    Reads `pid` from the request values, increments the paper's
    ``download_cnt`` and returns the pdf as an attachment named after the
    beautified paper title.

    Responds with an empty body and status 400 when `pid` is missing or
    not an integer, and with status 404 when the paper does not exist or
    has no pdf.
    """
    try:
        pid = long(request.values.get('pid'))
    except (TypeError, ValueError):
        return make_response(''), 400
    db = get_mongo('paper')
    doc = db.find_and_modify(
        query={'_id': pid},
        update={'$inc': {'download_cnt': 1}},
        fields={'pdf': 1, 'title': 1})
    # Check for a miss before touching the document: find_and_modify
    # returns None when nothing matched.
    if not doc or 'pdf' not in doc:
        return make_response(''), 404

    title = title_beautify(doc['title'])
    resp = make_response(doc['pdf'])
    resp.headers['Content-Type'] = 'application/pdf'
    # NOTE: chrome doesn't work with comma in filename
    # TODO deal with unicode name!
    resp.headers['Content-Disposition'] = \
        'attachment; filename="{0}.pdf"'.format(title)
    return resp
def init_title_for_similar_search():
    """Register every paper title as a similar-search candidate.

    Does nothing when ``all_titles`` already has entries. Otherwise each
    paper of the 'paper' collection is passed to
    ``add_title_for_similar_search`` as a ``(title, _id)`` tuple.
    """
    if len(all_titles):
        return
    papers = get_mongo('paper').find({}, {'title': 1})
    for paper in papers:
        entry = (paper['title'], paper['_id'])
        add_title_for_similar_search(entry)
def register():
    """User registration api.

    Request values:
        username: string
        password: string

    Returns ``{'success': 1}`` when the user was created, or an
    ``{'error': ...}`` dict when a value is missing, either value is
    shorter than 3 characters, or the username is already taken.

    TODO: the original author marked this api as unfinished (XXX TODO).
    """
    try:
        username = request.values['username']
        password = request.values['password']
    except KeyError:
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not (isinstance(username, basestring) and
            isinstance(password, basestring)):
        return {'error': 'illegal format'}

    if len(username) < 3 or len(password) < 3:
        return {'error': 'length of username and password must'
                         ' be at least 3 characters'}
    if get_user(username):
        return {"error": "user {0} already exists".format(username)}

    # SECURITY NOTE(review): the password is stored as plaintext. Hashing
    # it here must be done together with whatever code verifies logins,
    # so it is flagged rather than changed in this function alone.
    get_mongo('user').insert({
        'username': username,
        'password': password,
        'tab': [],
    })
    # Never write the password to the log.
    log_info('new user: {0}'.format(username))
    return {'success': 1}
def _do_new_item(self, desc, initial_tag, create_time=None, other=None):
    """helper function for implementing :meth:`new_item`

    Checks the argument types, declares the tags, allocates an id from the
    'item' global counter, makes sure the query indexes exist, runs the
    prefilter on the new document and inserts it with the description
    serialized to binary.

    :param desc: an :class:`ItemDescBase` instance; a deep copy is stored
    :param initial_tag: list of strings, converted to unicode
    :param create_time: time struct; defaults to the current local time
    :param other: None or a dict, stored under 'other'
    :return: id of the inserted item
    """
    assert isinstance(desc, ItemDescBase), \
        'bad desc: {!r}'.format(type(desc))
    assert isinstance(initial_tag, list) and \
        all(isinstance(tag, basestring) for tag in initial_tag), \
        'bad initial_tag: {!r}'.format(initial_tag)
    assert other is None or isinstance(other, dict), \
        'bad other arg: {!r}'.format(other)

    tags = [unicode(tag) for tag in initial_tag]
    declare_tag(tags)

    if create_time is None:
        create_time = time.localtime()

    collection = get_mongo('item')
    item_id = global_counter('item')
    for indexed_field in ('fetcher_type', 'fetcher_name', 'tag',
                          'creation_time'):
        collection.ensure_index(indexed_field)

    doc = {
        '_id': item_id,
        'fetcher_type': self.fetcher_type,
        'fetcher_name': self.fetcher_name,
        'desc': deepcopy(desc),
        'tag': tags,
        'other': other,
        'creation_time': datetime.fromtimestamp(time.mktime(create_time)),
    }
    # The prefilter sees the description object; it is serialized only
    # afterwards, right before the insert.
    prefilter.apply(self, doc)
    doc['desc'] = Binary(doc['desc'].serialize())
    collection.insert(doc)
    return item_id
def search_startswith(query):
    """Return the papers whose title starts with `query`.

    The query is matched literally as a title prefix. Matches whose title
    is at a Levenshtein distance of 10 or more from the query are dropped.

    Returns a list of paper documents restricted to
    ``SEARCH_RETURN_FIELDS``.
    """
    # Local import: keeps this block self-contained.
    import re

    db = get_mongo('paper')
    # Escape the query so regex metacharacters in a title ('(', '+', '?',
    # ...) are matched literally instead of altering or breaking the
    # pattern.
    pattern = '^' + re.escape(query)
    found = db.find({'title': {'$regex': pattern}}, SEARCH_RETURN_FIELDS)
    return [k for k in found if levenshtein(k['title'], query) < 10]
def do_addhtml(data, pid):
    """Convert pdf `data` to html and store it on paper `pid`.

    Stores the page count under 'page' and the converted pages, as a list
    of binaries for indices 0..npage inclusive, under 'html'.
    """
    # convert to html
    converter = PDF2Html(data, filename=None)
    try:
        npage = converter.get_npages()
        htmls = [Binary(converter.get(x)) for x in range(npage + 1)]
    finally:
        # Always clean up the converter, even when the conversion fails.
        converter.clean()

    db = get_mongo('paper')
    db.update({'_id': pid}, {'$set': {'page': npage, 'html': htmls}})
    log_info("Add html for pdf {0}, page={1}".format(pid, npage))
def authenticate(self, username, password):
    """Check `username` and `password` against the 'user' collection.

    Sets ``self._authenticated`` to True when the stored password equals
    `password`. Otherwise ``self.error`` is set to an ``{'error': ...}``
    dict saying that the user does not exist or the password is wrong.
    """
    users = get_mongo('user')
    matches = list(users.find({'username': username}, {"_id": 0}).limit(1))
    if len(matches) == 0:
        self.error = {'error': 'no such user'}
        return
    assert len(matches) == 1, 'More than one user!'

    stored = matches[0]
    if stored['password'] == password:
        self._authenticated = True
        return
    self.error = {'error': 'wrong password'}
def do_compress(data, pid):
    """Compress pdf `data` and store it as the pdf of paper `pid`.

    this *must* succeed adding the pdf: when compression fails, the
    uncompressed data is stored instead.

    Returns the data that was stored.
    """
    try:
        # compress
        data = pdf_compress(data)
    except Exception:
        # Best effort by design, but no longer silent, and no longer
        # swallowing KeyboardInterrupt/SystemExit like a bare except.
        log_info("Compress failed for pdf {0}, keep original data".format(
            pid))

    db = get_mongo('paper')
    db.update({'_id': pid}, {'$set': {'pdf': Binary(data)}})
    log_info("Updated pdf {0}: size={1}".format(
        pid, parse_file_size(len(data))))
    return data
def do_compress(data, pid):
    """Compress pdf `data` and store it as the pdf of paper `pid`.

    this *must* succeed adding the pdf: when compression fails, the
    uncompressed data is stored instead.

    Returns the data that was stored.
    """
    try:
        # compress
        data = pdf_compress(data)
    except Exception:
        # Best effort by design, but no longer silent, and no longer
        # swallowing KeyboardInterrupt/SystemExit like a bare except.
        log_info("Compress failed for pdf {0}, keep original data".format(
            pid))

    db = get_mongo('paper')
    db.update({'_id': pid}, {'$set': {'pdf': Binary(data)}})
    log_info("Updated pdf {0}: size={1}".format(pid,
                                                parse_file_size(len(data))))
    return data
def mark():
    """Return the stored votes of the paper given by request value `pid`.

    The result is the paper document restricted to 'upvote' and
    'downvote', an empty dict when no such paper exists, or an error dict
    when `pid` is not a valid integer.
    """
    try:
        pid = long(request.values.get('pid'))
    except Exception:
        return {'status': 'error', 'reason': 'invalid request'}

    wanted = {'upvote': 1, 'downvote': 1}
    votes = get_mongo('paper').find_one({'_id': pid}, wanted)
    log_info("Return marks of pdf {0}".format(pid))
    return {} if votes is None else votes
def mark():
    """Return the stored votes of the paper given by request value `pid`.

    The result is the paper document restricted to "upvote" and
    "downvote", an empty dict when no such paper exists, or an error dict
    when `pid` is not a valid integer.
    """
    try:
        pid = long(request.values.get("pid"))
    except Exception:
        return {"status": "error", "reason": "invalid request"}

    wanted = {"upvote": 1, "downvote": 1}
    votes = get_mongo("paper").find_one({"_id": pid}, wanted)
    log_info("Return marks of pdf {0}".format(pid))
    return {} if votes is None else votes
def similar_search(query):
    """Return the one paper whose title is most similar to `query`.

    The query is stripped and lower-cased, then compared by Levenshtein
    distance with the title (first element) of every entry in
    ``all_titles``. Only candidates at a distance below 3 are considered.

    Returns the paper document restricted to ``SEARCH_RETURN_FIELDS``, or
    None when no candidate is close enough.
    """
    query = query.strip().lower()
    ret = []
    for cand in all_titles:
        dist = levenshtein(query, cand[0])
        if dist < 3:
            ret.append((cand, dist))
    if not ret:
        return None

    # Most similar means the *smallest* edit distance; max() used to pick
    # the least similar of the accepted candidates.
    best_cand, _ = min(ret, key=operator.itemgetter(1))
    db = get_mongo('paper')
    # The second element of a candidate is used as the paper id.
    return db.find_one({'_id': best_cand[1]}, SEARCH_RETURN_FIELDS)
def do_mark():
    """Record one vote on the paper given by request value `pid`.

    A `mark` of 1 increments the paper's 'upvote' counter; any other
    integer increments 'downvote'. Returns a status dict.
    """
    try:
        pid = long(request.values.get('pid'))
        mark = int(request.values.get('mark'))
    except Exception:
        return {'status': 'error', 'reason': 'invalid request'}

    counter = 'upvote' if mark == 1 else 'downvote'
    get_mongo('paper').update({'_id': pid}, {'$inc': {counter: 1}})
    log_info("Add mark to pdf {0}".format(pid))
    return {'status': 'ok'}
def get_comment():
    """Return one page of comments of the paper given by `pid`.

    Request value `page` selects the slice: 10 comments starting at offset
    ``page * 10``. The result is the paper document restricted to that
    slice of 'comments' plus 'cmt_count', an empty dict when no such
    paper exists, or an error dict when `pid` or `page` is not a valid
    integer.
    """
    try:
        pid = long(request.values.get('pid'))
        page = int(request.values.get('page'))
    except Exception:
        return {'status': 'error', 'reason': 'invalid request'}

    offset = page*10
    projection = {'comments': {'$slice': [offset, 10]}, 'cmt_count': 1}
    found = get_mongo('paper').find_one({'_id': pid}, projection)
    log_info("Return 10 comments of paper {0}".format(pid))
    if found is not None:
        return found
    return {}
def do_mark():
    """Record one vote on the paper given by request value `pid`.

    A `mark` of 1 increments the paper's "upvote" counter; any other
    integer increments "downvote". Returns a status dict.
    """
    try:
        pid = long(request.values.get("pid"))
        mark = int(request.values.get("mark"))
    except Exception:
        return {"status": "error", "reason": "invalid request"}

    counter = "upvote" if mark == 1 else "downvote"
    get_mongo("paper").update({"_id": pid}, {"$inc": {counter: 1}})
    log_info("Add mark to pdf {0}".format(pid))
    return {"status": "ok"}
def do_comment():
    """Append a user's comment to the paper given by request value `pid`.

    Request values: `pid` (integer), `uid` and `cmt` (the comment text).
    The comment is pushed onto the paper's 'comments' list and 'cmt_count'
    is incremented.

    Returns ``{'status': 'ok'}``, or an error dict when `pid` is not a
    valid integer or `cmt` is missing.
    """
    try:
        pid = long(request.values.get('pid'))
    except (TypeError, ValueError):
        return {'status': 'error', 'reason': 'invalid request'}
    uid = request.values.get('uid')
    comment = request.values.get('cmt')
    # .get() never raises, so a missing comment has to be checked
    # explicitly; otherwise a null comment would be stored.
    if comment is None:
        return {'status': 'error', 'reason': 'invalid request'}

    db = get_mongo('paper')
    # Push and count in a single update so that 'cmt_count' cannot drift
    # from the length of 'comments' if a second write were to fail.
    db.update({'_id': pid},
              {'$push': {'comments': {'cmt': comment, 'uid': uid}},
               '$inc': {'cmt_count': 1}})
    log_info("Add {0}'s comment to pdf {1}".format(uid, pid))
    return {'status': 'ok'}
def test():
    """Run the user fetcher task for `uid` and return all items.

    Every datetime value of an item is converted to its string form and
    every binary value is replaced by the rendered title of the item
    description it deserializes to.
    """
    uid = request.values.get('uid')
    if not uid:
        return {'error': 'please visit with uid=1'}

    fetch = get_user_fetcher_celery_task()
    fetch(uid)

    items = list(get_mongo('item').find())
    for item in items:
        # Only values of existing keys are replaced, so the dict does not
        # change size while it is being iterated.
        for key, value in item.iteritems():
            if isinstance(value, datetime):
                item[key] = str(value)
            elif isinstance(value, Binary):
                item[key] = ItemDescBase.deserialize(value).render_title()
    return {'data': items}
def handle_content_query(query):
    """Run a content search and return the matching paper documents.

    Each hit of ``sp_searcher.search`` is resolved to its paper document
    (restricted to ``SEARCH_RETURN_FIELDS``) and extended with the hit's
    'content' and 'weight'. The order of the hits is kept.

    Raises Exception when a hit refers to a paper that is not in the
    'paper' collection.
    """
    log_info("Get content query: {0}".format(query))
    hits = sp_searcher.search(query)
    papers = get_mongo('paper')

    ret = []
    for hit in hits:
        pid = long(hit['_id'])
        # XXX should find use '$in' and then do sorting
        doc = papers.find_one({'_id': pid}, SEARCH_RETURN_FIELDS)
        if not doc:
            raise Exception("Impossible! Mongo doesn't have this paper in index: {0}".format(pid))
        doc['content'] = hit['content']
        doc['weight'] = hit['weight']
        ret.append(doc)
    return ret
def html():
    """Return a dict of {pagenum: 'html'} for the requested pages.

    Request values: `pid` (integer) and `page`, a comma-separated list of
    page indices. Each index must lie between 0 and the paper's stored
    'page' value, inclusive.

    Returns ``{'status': 'ok', 'htmls': {...}}`` or an error dict with a
    'reason'.
    """
    try:
        pid = long(request.values.get('pid'))
        page_str = request.values.get('page')
        pages = [int(p) for p in page_str.split(',')]
    except (AttributeError, TypeError, ValueError):
        # Missing pid/page, or values that are not integers.
        return {'status': 'error', 'reason': 'invalid request'}

    db = get_mongo('paper')
    doc = db.find_one({'_id': pid}, {'page': 1, 'html': 1})
    # find_one returns None for an unknown pid, and a paper may not have
    # 'page'/'html' stored; both used to crash below.
    if doc is None:
        return {'status': 'error', 'reason': 'no such item'}
    if 'page' not in doc or 'html' not in doc:
        return {'status': 'error', 'reason': 'html not available'}

    if max(pages) > doc['page'] or min(pages) < 0:
        return {'status': 'error', 'reason': 'invalid page index'}
    res = {p: doc['html'][p] for p in pages}
    return {'status': 'ok', 'htmls': res}
def handle_content_query(query):
    """Run a content search and return the matching paper documents.

    Each hit of ``sp_searcher.search`` is resolved to its paper document
    (restricted to ``SEARCH_RETURN_FIELDS``) and extended with the hit's
    'content' and 'weight'. The order of the hits is kept.

    Raises Exception when a hit refers to a paper that is not in the
    'paper' collection.
    """
    log_info("Get content query: {0}".format(query))
    hits = sp_searcher.search(query)
    papers = get_mongo('paper')

    ret = []
    for hit in hits:
        pid = long(hit['_id'])
        # XXX should find use '$in' and then do sorting
        doc = papers.find_one({'_id': pid}, SEARCH_RETURN_FIELDS)
        if not doc:
            message = "Impossible! Mongo doesn't have this paper in index: {0}"
            raise Exception(message.format(pid))
        doc['content'] = hit['content']
        doc['weight'] = hit['weight']
        ret.append(doc)
    return ret
def do_buildindex(ctx, data, pid):
    """Extract the text of pdf `data`, store it and index the paper.

    The text is saved on paper `pid`; the document handed to
    ``contentsearch.do_add_paper`` carries the text, ``ctx.title`` and the
    id, plus 'citecnt' and 'author' when ``ctx.meta`` provides them. An
    explicit 'citecnt' in the meta takes precedence over the length of
    'citedby'.
    """
    text = pdf2text(data)
    get_mongo('paper').update({'_id': pid}, {'$set': {'text': text}})

    doc = {'text': text, 'title': ctx.title, 'id': pid}

    cited_by = ctx.meta.get('citedby')
    if cited_by:
        doc['citecnt'] = len(cited_by)
    explicit_cnt = ctx.meta.get('citecnt')
    if explicit_cnt:
        doc['citecnt'] = explicit_cnt

    author = ctx.meta.get('author')
    if author:
        doc['author'] = author

    contentsearch.do_add_paper(doc)
def get_comment():
    """Return one page of comments of the paper given by `pid`.

    Request value `page` selects the slice: 10 comments starting at offset
    ``page * 10``. The result is the paper document restricted to that
    slice of 'comments' plus 'cmt_count', an empty dict when no such
    paper exists, or an error dict when `pid` or `page` is not a valid
    integer.
    """
    try:
        pid = long(request.values.get('pid'))
        page = int(request.values.get('page'))
    except Exception:
        return {'status': 'error', 'reason': 'invalid request'}

    offset = page * 10
    projection = {'comments': {'$slice': [offset, 10]}, 'cmt_count': 1}
    found = get_mongo('paper').find_one({'_id': pid}, projection)
    log_info("Return 10 comments of paper {0}".format(pid))
    if found is not None:
        return found
    return {}
def del_tab():
    """Delete a tab of the current user.

    Expects a JSON request body of the form::

        {
            name: 'tabname'     # name of the tab
        }

    A tab name that does not exist is ignored. Returns ``{'success': 1}``,
    or an ``{'error': ...}`` dict when the body is malformed.
    """
    try:
        data = json.loads(request.data)
        name = data['name']
    except (KeyError, TypeError, ValueError):
        return {'error': 'illegal format'}
    # Explicit check rather than `assert`, which is stripped under -O.
    if not isinstance(name, basestring):
        return {'error': 'illegal format'}

    db = get_mongo('user')
    db.update({'username': current_user.username},
              {'$pull': {'tab': {'name': name}}})
    return {'success': 1}
def html():
    """Return a dict of {pagenum: 'html'} for the requested pages.

    Request values: `pid` (integer) and `page`, a comma-separated list of
    page indices. Each index must lie between 0 and the paper's stored
    'page' value, inclusive.

    Returns ``{'status': 'ok', 'htmls': {...}}`` or an error dict with a
    'reason'.
    """
    try:
        pid = long(request.values.get('pid'))
        page_str = request.values.get('page')
        pages = [int(p) for p in page_str.split(',')]
    except (AttributeError, TypeError, ValueError):
        # Missing pid/page, or values that are not integers.
        return {'status': 'error', 'reason': 'invalid request'}

    db = get_mongo('paper')
    doc = db.find_one({'_id': pid}, {'page': 1, 'html': 1})
    # find_one returns None for an unknown pid, and a paper may not have
    # 'page'/'html' stored; both used to crash below.
    if doc is None:
        return {'status': 'error', 'reason': 'no such item'}
    if 'page' not in doc or 'html' not in doc:
        return {'status': 'error', 'reason': 'html not available'}

    if max(pages) > doc['page'] or min(pages) < 0:
        return {'status': 'error', 'reason': 'invalid page index'}
    res = {p: doc['html'][p] for p in pages}
    return {'status': 'ok', 'htmls': res}
def do_comment():
    """Append a user's comment to the paper given by request value `pid`.

    Request values: `pid` (integer), `uid` and `cmt` (the comment text).
    The comment is pushed onto the paper's 'comments' list and 'cmt_count'
    is incremented.

    Returns ``{'status': 'ok'}``, or an error dict when `pid` is not a
    valid integer or `cmt` is missing.
    """
    try:
        pid = long(request.values.get('pid'))
    except (TypeError, ValueError):
        return {'status': 'error', 'reason': 'invalid request'}
    uid = request.values.get('uid')
    comment = request.values.get('cmt')
    # .get() never raises, so a missing comment has to be checked
    # explicitly; otherwise a null comment would be stored.
    if comment is None:
        return {'status': 'error', 'reason': 'invalid request'}

    db = get_mongo('paper')
    # Push and count in a single update so that 'cmt_count' cannot drift
    # from the length of 'comments' if a second write were to fail.
    db.update({'_id': pid}, {
        '$push': {'comments': {'cmt': comment, 'uid': uid}},
        '$inc': {'cmt_count': 1},
    })
    log_info("Add {0}'s comment to pdf {1}".format(uid, pid))
    return {'status': 'ok'}
def available():
    """Report the progress of the paper given by request value `pid`.

    When ``progress_dict`` has an entry for the paper, that entry is
    returned as 'progress'. Otherwise the paper is looked up in the
    database: it is reported as 'done' when it has a truthy 'page' value
    and as 'failed' when it does not.

    Returns a dict with 'status' ('ok' or 'error'); error dicts carry a
    'reason'.
    """
    try:
        pid = long(request.values.get('pid'))
    except (TypeError, ValueError):
        # Missing or non-integer pid.
        return {'status': 'error', 'reason': 'Invalid Request'}

    prgs = progress_dict.get(pid)
    if prgs is not None:
        return {'status': 'ok', 'progress': prgs}

    db = get_mongo('paper')
    doc = db.find_one({'_id': pid}, {'page': 1})
    if not doc:
        return {'status': 'error', 'reason': 'no such item'}
    doc['progress'] = 'done' if doc.get('page') else 'failed'
    doc['status'] = 'ok'
    return doc
def start_download(dl_candidates, ctx, pid):
    """Download the pdf of paper `pid` from the best available candidate.

    `dl_candidates` is a sequence of ``(parser, search_result)`` pairs;
    they are tried in order of descending ``parser.priority``. The first
    download that yields data is stored on the paper together with its
    page url and the parser name, then handed to ``postprocess``.

    The paper's entry in ``progress_dict`` is removed when the function
    ends, whether a download succeeded, none did, or an error was raised.
    """
    try:
        dl_candidates = sorted(dl_candidates,
                               key=lambda x: x[0].priority,
                               reverse=True)
        updater = Updater(pid)
        for (parser, sr) in dl_candidates:
            data = parser.download(sr, updater)
            if not data:
                continue
            db = get_mongo('paper')
            try:
                db.update({'_id': pid},
                          {'$set': {
                              'pdf': Binary(data),
                              'page_url': sr.url,
                              'source': parser.name
                          }})
            except Exception:
                # Saving is best effort; post-processing still runs on
                # the downloaded data.
                log_exc("Save pdf data error")
            postprocess(data, ctx, pid)
            return
    finally:
        # Runs on every exit path so a failed download or post-processing
        # step cannot leave a stale progress entry behind.
        progress_dict.pop(pid, None)
def do_buildindex(ctx, data, pid):
    """Extract the text of pdf `data`, store it and index the paper.

    The text is saved on paper `pid`; the document handed to
    ``contentsearch.do_add_paper`` carries the text, ``ctx.title`` and the
    id, plus 'citecnt' and 'author' when ``ctx.meta`` provides them. An
    explicit 'citecnt' in the meta takes precedence over the length of
    'citedby'.
    """
    text = pdf2text(data)
    get_mongo('paper').update({'_id': pid}, {'$set': {'text': text}})

    doc = {'text': text, 'title': ctx.title, 'id': pid}

    cited_by = ctx.meta.get('citedby')
    if cited_by:
        doc['citecnt'] = len(cited_by)
    explicit_cnt = ctx.meta.get('citecnt')
    if explicit_cnt:
        doc['citecnt'] = explicit_cnt

    author = ctx.meta.get('author')
    if author:
        doc['author'] = author

    contentsearch.do_add_paper(doc)
def rebuild(self):
    """Clear the index and re-add every paper of the 'paper' collection.

    Papers that have no stored text get it extracted from their pdf with
    ``pdf2text``; the extracted text is written back to the paper
    document. Papers without a pdf, or whose pdf cannot be converted, are
    logged and skipped.
    """
    self.indexer.clear()
    db = get_mongo('paper')
    itr = db.find({}, {'pdf': 1, 'title': 1, 'text': 1})
    for res in itr:
        text = res.get('text')
        if not text:
            log_info("About to add text for paper {0}".format(res['_id']))
            # Look the pdf up outside the conversion try-block so that a
            # KeyError raised inside pdf2text is not reported as "No pdf".
            try:
                data = res['pdf']
            except KeyError:
                log_err("No pdf in pid={0},title={1}".format(
                    res['_id'], res['title']))
                continue
            try:
                text = pdf2text(data)
            except Exception:
                log_exc("Exception in pdf2text")
                # Nothing was extracted: do not overwrite the stored text
                # with an empty value and do not index an empty document.
                continue
            db.update({'_id': res['_id']}, {'$set': {'text': text}})
        doc = {'text': text, 'title': res['title'], 'id': res['_id']}
        self._do_add_paper(doc)
    self.indexer.flush()
def dump(pid, output):
    """Write the pdf and html pages of paper `pid` to a directory.

    `output` is resolved relative to the directory of this file and is
    created when missing. The pdf is written to ``<title>.pdf``; when the
    paper has a 'page' count, html pages 0..page inclusive are written to
    ``<title>.html.<i>``.

    Raises IndexError when no paper with this id exists.
    """
    out_dir = os.path.join(os.path.dirname(__file__), output)
    pid = long(pid)
    db = get_mongo('paper')
    doc = list(db.find({'_id': pid}).limit(1))[0]
    pdf = doc['pdf']
    title = doc['title']

    # Create the directory only when needed, so that real failures such
    # as missing permissions are no longer silently ignored.
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # Binary mode: the payload is raw pdf/html bytes. `with` closes the
    # file even when the write fails.
    with open(os.path.join(out_dir, title + '.pdf'), 'wb') as fout:
        fout.write(pdf)

    npage = doc.get('page')
    if npage:
        for i in range(npage + 1):
            page_path = os.path.join(out_dir,
                                     title + '.html.{0}'.format(i))
            with open(page_path, 'wb') as fout:
                fout.write(doc['html'][i])
def start_download(dl_candidates, ctx, pid):
    """Download the pdf of paper `pid` from the best available candidate.

    `dl_candidates` is a sequence of ``(parser, search_result)`` pairs;
    they are tried in order of descending ``parser.priority``. The first
    download that yields data is stored on the paper together with its
    page url and the parser name, then handed to ``postprocess``.

    The paper's entry in ``progress_dict`` is removed when the function
    ends, whether a download succeeded, none did, or an error was raised.
    """
    try:
        dl_candidates = sorted(
            dl_candidates, key=lambda x: x[0].priority, reverse=True)
        updater = Updater(pid)
        for (parser, sr) in dl_candidates:
            data = parser.download(sr, updater)
            if not data:
                continue
            db = get_mongo('paper')
            try:
                db.update({'_id': pid}, {
                    '$set': {
                        'pdf': Binary(data),
                        'page_url': sr.url,
                        'source': parser.name
                    }
                })
            except Exception:
                # Saving is best effort; post-processing still runs on
                # the downloaded data.
                log_exc("Save pdf data error")
            postprocess(data, ctx, pid)
            return
    finally:
        # Runs on every exit path so a failed download or post-processing
        # step cannot leave a stale progress entry behind.
        progress_dict.pop(pid, None)
def handl_author_query(q):
    """Return the papers whose 'author' field matches `q`, as a list.

    Documents are restricted to ``SEARCH_RETURN_FIELDS``.
    NOTE(review): `q` is matched as given, without case normalization;
    confirm that callers normalize it.
    """
    matches = get_mongo('paper').find({'author': q}, SEARCH_RETURN_FIELDS)
    return list(matches)
#!./exec-in-virtualenv.sh
# -*- coding: UTF-8 -*-
# File: tolower.py
# Date: Tue Jun 10 04:03:22 2014 +0000
# Author: Yuxin Wu <*****@*****.**>

# One-off maintenance script: lower-cases every entry of the 'author'
# list of each paper. Papers without an 'author' field are printed
# (id and title) and left untouched.

from pdfprocess import do_addhtml
from ukdbconn import get_mongo

db = get_mongo('paper')
# To process a single paper while debugging:
# itr = db.find({'_id': 67L})
itr = db.find({}, {'author': 1, 'title': 1})
for paper in itr:
    try:
        data = paper['author']
    except KeyError:
        # Only a missing 'author' field is expected here. .get() keeps
        # the report itself from failing on a paper without a title.
        print(u'{0} {1}'.format(paper['_id'], paper.get('title')))
        continue
    pid = paper['_id']
    db.update({'_id': pid},
              {'$set': {'author': [x.lower() for x in data]}})
def search_regex(regex):
    """Return the papers whose title matches the regular expression.

    `regex` is passed to the database as a ``$regex`` condition on
    'title'. Returns a list of paper documents restricted to
    ``SEARCH_RETURN_FIELDS``.
    """
    db = get_mongo('paper')
    # The body used to reference an undefined name `query` instead of the
    # `regex` parameter.
    res = list(db.find({'title': {'$regex': '{0}'.format(regex)}},
                       SEARCH_RETURN_FIELDS))
    return res
def search_exact(query):
    """Return the papers whose title equals `query`, as a list.

    Documents are restricted to ``SEARCH_RETURN_FIELDS``.
    """
    matches = get_mongo('paper').find({'title': query}, SEARCH_RETURN_FIELDS)
    return list(matches)