def transform(r):
    # Flag whether a PDF page is available for this record.
    if r.get('page'):
        r['haspdf'] = 1
    else:
        r['haspdf'] = 0
    # Collapse the citation list into a count and drop the raw list.
    try:
        r['citecnt'] = len(r['citedby'])
        del r['citedby']
    except KeyError:
        r['citecnt'] = 0
    if 'author' in r:
        r['author'] = [title_beautify(x) for x in r['author']]
    r['title'] = title_beautify(r['title'])
    return r
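# A minimal usage sketch for transform(), assuming a record shaped like a
# raw mongo document. The sample fields and the title_beautify stub below
# are illustrative assumptions, not the project's real helper.
def title_beautify(s):          # hypothetical stand-in
    return s.strip().title()

record = {
    'title': 'intriguing properties of neural networks',
    'author': ['christian szegedy'],
    'page': 'http://example.com/paper',   # truthy, so haspdf becomes 1
    'citedby': [101, 102, 103],           # collapsed to citecnt == 3
}
print transform(record)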
def download():
    pid = long(request.values.get('pid'))
    agent = str(request.user_agent)
    db = get_mongo('paper')
    # Atomically bump the download counter while fetching the pdf.
    doc = db.find_and_modify(query={'_id': pid},
                             update={'$inc': {'download_cnt': 1}},
                             fields={'pdf': 1, 'title': 1})
    # Check for a missing paper before touching the document.
    if not doc:
        return make_response(''), 404
    title = title_beautify(doc['title'])
    data = doc['pdf']
    resp = make_response(data)
    resp.headers['Content-Type'] = 'application/pdf'
    # chrome doesn't work with comma in filename
    #if agent.find('Chrom') != -1:
        #title = title.replace(',', ' ')
    # TODO deal with unicode name!
    resp.headers['Content-Disposition'] = \
        'attachment; filename="{0}.pdf"'.format(title)
    return resp
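# download() reads Flask's request and builds a response, so it presumably
# sits behind a URL rule. A hedged wiring sketch; the app object and the
# '/download' rule are assumptions, not the project's actual routing.
from flask import Flask, request, make_response

app = Flask(__name__)
app.add_url_rule('/download', 'download', download, methods=['GET', 'POST'])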
def __init__(self, query):
    query = title_beautify(query)
    self.query = query
    self.success = False
    self.title = query
    self.existing = None
    self.meta = {}
def get_title(self):
    if self.title is not None:
        # An empty string is the "lookup already failed" sentinel.
        if not self.title:
            return None
        else:
            return self.title
    try:
        self.title = title_beautify(self._do_get_title())
        return self.title
    except Exception:
        self.title = ""
        return None
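# The "" sentinel means a failed lookup is cached and never retried.
# A small illustration with a hypothetical stub class; FakeFetcher and
# its simulated failure are assumptions for demonstration only.
class FakeFetcher(object):
    def __init__(self):
        self.title = None
        self.calls = 0
    def _do_get_title(self):
        self.calls += 1
        raise ValueError('simulated network error')

FakeFetcher.get_title = get_title   # borrow the method above

f = FakeFetcher()
assert f.get_title() is None   # tries _do_get_title, fails, caches ""
assert f.get_title() is None   # served from the sentinel, no retry
assert f.calls == 1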
def call(query):
    res = func(query.lower())
    for k in res:
        k['title'] = title_beautify(k['title'])
    return res
def similar_search(query):
    ret = []
    query = query.strip().lower()
    for cand in all_titles:
        dist = levenshtein(query, cand[0])
        if dist < 3:
            ret.append((cand, dist))
    if not ret:
        return None
    # pick the closest candidate: the smallest edit distance wins
    res = min(ret, key=operator.itemgetter(1))
    db = get_mongo('paper')
    res = db.find_one({'_id': res[0][1]}, SEARCH_RETURN_FIELDS)
    return res

def add_title_for_similar_search(cand):
    """ cand = (title, id) """
    all_titles.append((cand[0].strip().lower(), cand[1]))

def init_title_for_similar_search():
    # lazily build the in-memory title list from the paper collection
    if len(all_titles) > 0:
        return
    db = get_mongo('paper')
    itr = db.find({}, {'title': 1})
    for cand in itr:
        add_title_for_similar_search((cand['title'], cand['_id']))

init_title_for_similar_search()

if __name__ == '__main__':
    print search_exact(title_beautify('Intriguing properties of neural networks'))
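# The selection step is the part worth sanity-checking: among all titles
# within edit distance 3, the closest one should win. A self-contained
# sketch of just that core; the toy edit-distance function and in-memory
# title list are stand-ins for the real levenshtein() and all_titles.
import operator

def toy_levenshtein(a, b):
    # classic dynamic-programming edit distance
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

titles = [('intriguing properties of neural networks', 1),
          ('intriguing properties of neural network', 2)]
query = 'intriguing properties of neural netwrks'
cands = [(c, toy_levenshtein(query, c[0])) for c in titles]
cands = [c for c in cands if c[1] < 3]
print min(cands, key=operator.itemgetter(1))   # closest title wins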
def search(ctx):
    query = ctx.query.lower()
    ret = {}
    ret['ctx_update'] = {}
    srs = []
    r = requests.get(GOOGLE_SCHOLAR_URL.format(query))
    text = r.text.encode('utf-8')
    #with open('/tmp/b.html', 'r') as f:
        #text = f.read()

    def find_citecnt(dom):
        # the citation count lives in the first link of the 'gs_fl' footer
        try:
            find = dom.findAll(attrs={'class': 'gs_ri'})[0]
            find = find.findAll(attrs={'class': 'gs_fl'})[0]
            find = find.findAll('a')[0].text
            cnt = re.search('[0-9]+', find).group()
            return int(cnt)
        except Exception:
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'gs_r'})
    title_updated = None
    for rst in results:
        try:
            h3 = rst.findAll('h3')[0]
            real_title = h3.get_text()
            real_title = filter_title_fileformat(real_title)
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            if not title_updated and tc[1]:
                title_updated = ensure_unicode(title_beautify(real_title))
                while True:
                    # fix things like '[citation][c] Title'
                    new_title = re.sub(r'^\[[^\]]*\]', '', title_updated).strip()
                    if new_title == title_updated:
                        break
                    title_updated = new_title
                log_info(u"Title updated: {0}".format(title_updated))
                ret['ctx_update']['title'] = title_updated
            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt
            try:
                url = str(h3.find('a').get('href'))
                srs.append(SearchResult(None, url))
            except Exception:
                pass
            findpdf = rst.findAll(attrs={'class': 'gs_ggs'})
            if findpdf:
                pdflink = findpdf[0].find('a').get('href')
                srs.append(SearchResult('directpdf', str(pdflink)))
        except Exception as e:
            log_exc("Search item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret
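# The citation-count extraction can be sanity-checked offline. The snippet
# below is hand-written HTML mimicking the class names the parser expects
# ('gs_r', 'gs_ri', 'gs_fl'); it is not a real Google Scholar response.
import re
from bs4 import BeautifulSoup

snippet = '''
<div class="gs_r">
  <div class="gs_ri">
    <h3><a href="http://example.com/p.pdf">Some Paper Title</a></h3>
    <div class="gs_fl"><a href="#">Cited by 1337</a></div>
  </div>
</div>
'''
rst = BeautifulSoup(snippet, 'html.parser').findAll(attrs={'class': 'gs_r'})[0]
# replicate the find_citecnt() lookup chain
fl = rst.findAll(attrs={'class': 'gs_ri'})[0].findAll(attrs={'class': 'gs_fl'})[0]
print int(re.search('[0-9]+', fl.findAll('a')[0].text).group())   # -> 1337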
def handle_title_query(query):
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # prefix search
    res = search_startswith(query)   # and the idf is large
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res

    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # search and gather all the result items
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # try searching the database again with the updated title
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # don't retry a non-repeatable fetcher
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    parser_used.add(parser.name)
                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found existing paper in db")
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None

    # save data, return data, and start downloading in the background
    try:
        pid = new_paper(ctx)
        ret = [{'_id': pid,
                'title': ctx.title,
                'view_cnt': 1,
                'download_cnt': 0}]
        ret[0].update(ctx.meta)
        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
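# The fan-out over searchers is the trickiest moving part: apply_async plus
# a timed get() keeps one slow searcher from stalling the whole query. A
# stripped-down sketch of just that pattern; fake_a/fake_b and the 5-second
# timeout stand in for searcher_run and ukconfig.PYTHON_POOL_TIMEOUT.
from multiprocessing import Pool, TimeoutError

def fake_a(query):
    return {'results': ['a:' + query], 'ctx_update': {}}

def fake_b(query):
    return {'results': ['b:' + query], 'ctx_update': {'citecnt': 42}}

if __name__ == '__main__':
    pool = Pool()
    jobs = [pool.apply_async(f, ('deep learning',)) for f in (fake_a, fake_b)]
    collected = []
    for job in jobs:
        try:
            s = job.get(5)      # per-searcher timeout
        except TimeoutError:
            continue            # drop searchers that are too slow
        collected.extend(s['results'])
    pool.close()
    pool.join()
    print collected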