def collections(self, req, form):
    """Render the statistics page of one collection.

    The page is only produced when the user is allowed to perform the
    'runwebstatadmin' action and, if the requested collection is
    restricted, the VIEWRESTRCOLL action on that collection.  Otherwise
    the "not authorized" page is returned.
    """
    argd = wash_urlargd(form, {
        'collection': (str, "All"),
        'timespan': (str, "this month"),
        's_date': (str, ""),
        'f_date': (str, ""),
        'format': (str, "flot"),
        'ln': (str, CFG_SITE_LANG),
    })
    ln = argd['ln']

    # Language suffix for links; empty when the site default language is used.
    ln_suffix = ''
    if ln != CFG_SITE_LANG:
        ln_suffix = '?ln=' + ln

    user_info = collect_user_info(req)

    # First gate: the statistics pages themselves.
    (auth_code, auth_msg) = acc_authorize_action(user_info, 'runwebstatadmin')
    if auth_code:
        return page_not_authorized(
            req,
            navtrail=self.navtrail % {'ln_link': ln_suffix},
            navmenuid='collections',
            text=auth_msg,
            ln=ln)

    # Second gate: the collection, when it is a restricted one.
    if collection_restricted_p(argd['collection']):
        (auth_code_coll, auth_msg_coll) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=argd['collection'])
        if auth_code_coll:
            return page_not_authorized(req,
                                       navmenuid='collections',
                                       text=auth_msg_coll,
                                       ln=ln)

    page_title = "Statistics of %s" % argd['collection']
    page_body = perform_display_stats_per_coll(argd, req, ln=ln)
    page_navtrail = """<a class="navtrail" href="%s/stats/%s">Statistics</a>""" % \
                    (CFG_SITE_URL, ln_suffix)
    return page(title=page_title,
                body=page_body,
                navtrail=page_navtrail,
                description="CDS, Statistics, Collection %s" % argd['collection'],
                keywords="CDS, statistics, %s" % argd['collection'],
                req=req,
                lastupdated=__lastupdated__,
                navmenuid='collections',
                language=ln)
def restricted_p(self):
    """Predicate telling whether this collection is restricted.

    Return 1 when collection_restricted_p() reports the collection
    name (self.name) as restricted, and None otherwise, i.e. when the
    collection is public.
    """
    if not collection_restricted_p(self.name):
        return None
    return 1
def __call__(self, req, form):
    """Serve the search-engine output for the record self.recid.

    The washed URL arguments are completed with the record id and, when
    set, the output format held by this object.  Requests with uid -1
    and users lacking VIEWRESTRCOLL on the record's restricted primary
    collection get the "not authorized" page.
    """
    argd = wash_search_urlargd(form)
    argd['recid'] = self.recid
    if self.format is not None:
        argd['of'] = self.format
    req.argd = argd

    uid = getUid(req)
    user_info = collect_user_info(req)

    if uid == -1:
        return page_not_authorized(
            req, "../",
            text="You are not authorized to view this record.",
            navmenuid='search')

    if uid > 0:
        preferences = get_user_preferences(uid)
        # fetch user rg preference only if not overridden via URL
        if not form.has_key('rg'):
            try:
                argd['rg'] = int(preferences['websearch_group_records'])
            except (KeyError, ValueError):
                pass

    # Cap the group size unless the user holds the 'runbibedit' right.
    if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS:
        if acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0:
        if argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0:
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    primary_collection = guess_primary_collection_of_a_record(self.recid)
    if collection_restricted_p(primary_collection):
        (auth_code, dummy) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=primary_collection)
        if auth_code:
            return page_not_authorized(
                req, "../",
                text="You are not authorized to view this record.",
                navmenuid='search')

    # Keep all the arguments, they might be reused in the
    # record page itself to derivate other queries
    req.argd = argd

    # mod_python does not like to return [] in case when of=id:
    result = perform_request_search(req, **argd)
    if isinstance(result, intbitset):
        return result.fastdump()
    if result == []:
        return str(result)
    return result
def authenticate(self, req, form):
    """Restricted search results pages.

    For every requested collection that is restricted, the user must
    hold VIEWRESTRCOLL: guests are redirected to the login page,
    other users get the "not authorized" page.  When all checks pass
    the search is performed and its output returned.
    """
    argd = wash_search_urlargd(form)
    user_info = collect_user_info(req)

    for coll in argd['c'] + [argd['cc']]:
        if not collection_restricted_p(coll):
            continue
        (auth_code, auth_msg) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=coll)
        if not auth_code:
            continue
        if user_info['email'] == 'guest':
            # Guests get a chance to log in and come back to this URL.
            cookie = mail_cookie_create_authorize_action(
                VIEWRESTRCOLL, {'collection': coll})
            login_args = make_canonical_urlargd(
                {'action': cookie,
                 'ln': argd['ln'],
                 'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
                {})
            target = CFG_SITE_SECURE_URL + '/youraccount/login' + login_args
            return redirect_to_url(req, target, norobot=True)
        return page_not_authorized(req, "../",
                                   text=auth_msg,
                                   navmenuid='search')

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0:
        if argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0:
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    # Keep all the arguments, they might be reused in the
    # search_engine itself to derivate other queries
    req.argd = argd

    uid = getUid(req)
    if uid > 0:
        preferences = get_user_preferences(uid)
        # fetch user rg preference only if not overridden via URL
        if not form.has_key('rg'):
            try:
                argd['rg'] = int(preferences['websearch_group_records'])
            except (KeyError, ValueError):
                pass

    # mod_python does not like to return [] in case when of=id:
    result = perform_request_search(req, **argd)
    if isinstance(result, intbitset):
        return result.fastdump()
    if result == []:
        return str(result)
    return result
def authenticate(self, req, form):
    """Restricted search results pages.

    Each requested collection that is restricted requires the
    VIEWRESTRCOLL right: a guest lacking it is sent to the login page,
    any other user lacking it receives the "not authorized" page.
    Otherwise the search output is returned.
    """
    argd = wash_search_urlargd(form)
    user_info = collect_user_info(req)

    requested_collections = argd['c'] + [argd['cc']]
    for coll in requested_collections:
        if not collection_restricted_p(coll):
            continue
        (auth_code, auth_msg) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=coll)
        if not auth_code:
            continue
        if user_info['email'] != 'guest':
            return page_not_authorized(req, "../",
                                       text=auth_msg,
                                       navmenuid='search')
        # Guest: redirect to the login form, remembering where to return.
        cookie = mail_cookie_create_authorize_action(
            VIEWRESTRCOLL, {'collection': coll})
        query_string = make_canonical_urlargd(
            {'action': cookie,
             'ln': argd['ln'],
             'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
            {})
        target = CFG_SITE_SECURE_URL + '/youraccount/login' + query_string
        return redirect_to_url(req, target, norobot=True)

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0:
        if argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0:
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    # Keep all the arguments, they might be reused in the
    # search_engine itself to derivate other queries
    req.argd = argd

    uid = getUid(req)
    if uid > 0:
        user_prefs = get_user_preferences(uid)
        # fetch user rg preference only if not overridden via URL
        if not form.has_key('rg'):
            try:
                argd['rg'] = int(user_prefs['websearch_group_records'])
            except (KeyError, ValueError):
                pass

    # mod_python does not like to return [] in case when of=id:
    output = perform_request_search(req, **argd)
    if isinstance(output, intbitset):
        return output.fastdump()
    if output == []:
        return str(output)
    return output
def collections(self, req, form):
    """Render the statistics page of the collection given as 'coll'.

    Access requires the 'runwebstatadmin' action and, for a restricted
    collection, the VIEWRESTRCOLL action on it; otherwise the
    "not authorized" page is returned.
    """
    argd = wash_urlargd(form, {
        'coll': (str, "All"),
        'ln': (str, CFG_SITE_LANG),
    })
    ln = argd['ln']

    # Language suffix for links; empty when the site default language is used.
    ln_suffix = ''
    if ln != CFG_SITE_LANG:
        ln_suffix = '?ln=' + ln

    user_info = collect_user_info(req)

    (auth_code, auth_msg) = acc_authorize_action(user_info, 'runwebstatadmin')
    if auth_code:
        return page_not_authorized(
            req,
            navtrail=self.navtrail % {'ln_link': ln_suffix},
            navmenuid='collections',
            text=auth_msg,
            ln=ln)

    if collection_restricted_p(argd['coll']):
        (auth_code_coll, auth_msg_coll) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=argd['coll'])
        if auth_code_coll:
            return page_not_authorized(req,
                                       navmenuid='collections',
                                       text=auth_msg_coll,
                                       ln=ln)

    page_title = "Statistics of %s" % argd['coll']
    page_body = perform_display_stats_per_coll(argd['coll'], req, ln=ln)
    page_navtrail = """<a class="navtrail" href="%s/stats/%s">Statistics</a>""" % \
                    (CFG_SITE_URL, ln_suffix)
    return page(title=page_title,
                body=page_body,
                navtrail=page_navtrail,
                description="CDS, Statistics, Collection %s" % argd['coll'],
                keywords="CDS, statistics, %s" % argd['coll'],
                req=req,
                lastupdated=__lastupdated__,
                navmenuid='collections',
                language=ln)
def search(self, read_cache=True, **kwparams):
    """
    Returns records corresponding to the given search query.

    See docstring of invenio.search_engine.perform_request_search()
    for an overview of available parameters.

    When no output format ('of') is given, MARCXML ("xm") is requested
    and the result is parsed with self._parse_results(); otherwise the
    raw output is returned (a list of record ids for of="id").  Remote
    queries are stored in self.cached_queries and served from there
    when read_cache is true.

    @param read_cache: if False, ignore previously cached remote results
    @raise InvenioConnectorAuthError: if authentication fails
    """
    parse_results = False
    of = kwparams.get('of', "")
    if of == "":
        parse_results = True
        of = "xm"
        kwparams['of'] = of
    params = urllib.urlencode(kwparams, doseq=1)
    cache_key = params + str(parse_results)

    # Are we running locally? If so, better directly access the
    # search engine directly
    if self.local and of != 't':
        # See if user tries to search any restricted collection
        c = kwparams.get('c', "")
        if c != "":
            # Treat tuples like lists: testing a whole tuple as a single
            # collection name would silently skip the restriction check.
            if isinstance(c, (list, tuple)):
                colls = c
            else:
                colls = [c]
            for collection in colls:
                if collection_restricted_p(collection):
                    if self.user:
                        self._check_credentials()
                        continue
                    raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")
        # Always fetch ids locally, then format them as requested.
        kwparams['of'] = 'id'
        results = perform_request_search(**kwparams)
        if of.lower() != 'id':
            results = format_records(results, of)
    else:
        if read_cache and cache_key in self.cached_queries:
            return self.cached_queries[cache_key]
        if self.user:
            results = self.browser.open(self.server_url + "/search?" + params)
        else:
            results = urllib2.urlopen(self.server_url + "/search?" + params)
        if 'youraccount/login' in results.geturl():
            # Current user not able to search collection
            raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")

    if parse_results:
        # FIXME: we should not try to parse if results is string
        parsed_records = self._parse_results(results, self.cached_records)
        self.cached_queries[cache_key] = parsed_records
        return parsed_records

    # pylint: disable=E1103
    # The whole point of the following code is to make sure we can
    # handle two types of variable.
    try:
        res = results.read()
    except AttributeError:
        res = results
    # pylint: enable=E1103

    if of == "id":
        try:
            if isinstance(res, str):
                # Transform to list
                res = [int(recid.strip()) for recid in
                       res.strip("[]").split(",") if recid.strip() != ""]
            res.reverse()
        except (ValueError, AttributeError):
            res = []
    self.cached_queries[cache_key] = res
    return res
def __call__(self, req, form):
    """RSS 2.0 feed service.

    After checking access to the requested collections, the feed is
    served from the file cache under CFG_CACHEDIR/rss when the cached
    copy is younger than CFG_WEBSEARCH_RSS_TTL minutes.  Otherwise the
    search is run, the feed is written to the client and, if the cache
    holds fewer than CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS files,
    stored in the cache.
    """
    import errno

    # Keep only interesting parameters for the search
    default_params = websearch_templates.rss_default_urlargd
    # We need to keep 'jrec' and 'rg' here in order to have
    # 'multi-page' RSS. These parameters are not kept be default
    # as we don't want to consider them when building RSS links
    # from search and browse pages.
    # NOTE(review): this updates the dictionary owned by
    # websearch_templates in place; left as is because build_rss_url()
    # may rely on it to keep 'jrec'/'rg' in the URLs -- confirm.
    default_params.update({'jrec': (int, 1),
                           'rg': (int, CFG_WEBSEARCH_INSTANT_BROWSE_RSS)})
    argd = wash_urlargd(form, default_params)
    user_info = collect_user_info(req)

    for coll in argd['c'] + [argd['cc']]:
        if collection_restricted_p(coll):
            (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll)
            if auth_code and user_info['email'] == 'guest':
                cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection': coll})
                target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                         make_canonical_urlargd({'action': cookie,
                                                 'ln': argd['ln'],
                                                 'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
                                                {})
                return redirect_to_url(req, target, norobot=True)
            elif auth_code:
                return page_not_authorized(req, "../",
                                           text=auth_msg,
                                           navmenuid='search')

    # Create a standard filename with these parameters
    current_url = websearch_templates.build_rss_url(argd)
    cache_filename = current_url.split('/')[-1]
    cache_path = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename)

    req.content_type = "application/rss+xml"
    req.send_http_header()

    # Try to read from cache.  Only a missing/unreadable/stale cache
    # file means "cache miss"; errors while writing to the client are
    # not swallowed, so a half-sent feed is never followed by a second
    # one.  An explicit comparison is used (not assert) so that expiry
    # still works when Python runs with -O.
    cached_rss = None
    try:
        last_update_time = datetime.datetime.fromtimestamp(
            os.stat(os.path.abspath(cache_path)).st_mtime)
        expiry_time = last_update_time + \
                      datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL)
        if datetime.datetime.now() < expiry_time:
            filedesc = open(cache_path, "r")
            try:
                cached_rss = filedesc.read()
            finally:
                filedesc.close()
    except (IOError, OSError, ValueError):
        cached_rss = None

    if cached_rss is not None:
        req.write(cached_rss)
        return

    # do it live and cache
    previous_url = None
    if argd['jrec'] > 1:
        prev_jrec = max(1, argd['jrec'] - argd['rg'])
        previous_url = websearch_templates.build_rss_url(argd, jrec=prev_jrec)

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and (argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
        if acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    req.argd = argd
    recIDs = perform_request_search(req, of="id",
                                    c=argd['c'], cc=argd['cc'],
                                    p=argd['p'], f=argd['f'],
                                    p1=argd['p1'], f1=argd['f1'],
                                    m1=argd['m1'], op1=argd['op1'],
                                    p2=argd['p2'], f2=argd['f2'],
                                    m2=argd['m2'], op2=argd['op2'],
                                    p3=argd['p3'], f3=argd['f3'],
                                    m3=argd['m3'], wl=argd['wl'])
    nb_found = len(recIDs)

    next_url = None
    if nb_found >= argd['jrec'] + argd['rg']:
        next_url = websearch_templates.build_rss_url(
            argd, jrec=(argd['jrec'] + argd['rg']))

    first_url = websearch_templates.build_rss_url(argd, jrec=1)
    # The last page never starts before the first record, even when
    # fewer than 'rg' records were found.
    last_url = websearch_templates.build_rss_url(
        argd, jrec=max(1, nb_found - argd['rg'] + 1))

    # Walk the hit list backwards, starting 'jrec' items from its end.
    recIDs = recIDs[-argd['jrec']:(-argd['rg'] - argd['jrec']):-1]

    rss_prologue = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                   websearch_templates.tmpl_xml_rss_prologue(current_url=current_url,
                                                             previous_url=previous_url,
                                                             next_url=next_url,
                                                             first_url=first_url,
                                                             last_url=last_url,
                                                             nb_found=nb_found,
                                                             jrec=argd['jrec'],
                                                             rg=argd['rg'],
                                                             cc=argd['cc']) + '\n'
    req.write(rss_prologue)
    # req is handed to format_records(); the returned string is kept
    # for the cache file written below.
    rss_body = format_records(recIDs,
                              of='xr',
                              ln=argd['ln'],
                              user_info=user_info,
                              record_separator="\n",
                              req=req, epilogue="\n")
    rss_epilogue = websearch_templates.tmpl_xml_rss_epilogue() + '\n'
    req.write(rss_epilogue)

    # update cache
    dirname = "%s/rss" % (CFG_CACHEDIR)
    mymkdir(dirname)
    try:
        # Remove the file just in case it already existed
        # so that a bit of space is created
        os.remove(cache_path)
    except OSError:
        pass

    # Check if there's enough space to cache the request.
    if len(os.listdir(dirname)) < CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS:
        try:
            os.umask(0o022)
            cache_file = open(cache_path, "w")
            try:
                cache_file.write(rss_prologue + rss_body + rss_epilogue)
            finally:
                cache_file.close()
        except IOError as err:
            # URL was too long: never mind, don't cache.  Anything else
            # is re-raised unchanged.
            if err.errno != errno.ENAMETOOLONG:
                raise
def __call__(self, req, form):
    """ Perform a search.

    POST requests are rejected.  Access to every involved restricted
    collection is checked before the search runs: guests are redirected
    to the login page, other unauthorized users get the
    "not authorized" page.
    """
    argd = wash_search_urlargd(form)
    _ = gettext_set_language(argd['ln'])

    if req.method == 'POST':
        raise apache.SERVER_RETURN(apache.HTTP_METHOD_NOT_ALLOWED)

    def redirect_guest_to_login(restricted_coll):
        # Build the login URL carrying an authorization cookie for the
        # collection and the URL to come back to.
        cookie = mail_cookie_create_authorize_action(
            VIEWRESTRCOLL, {'collection': restricted_coll})
        target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                 make_canonical_urlargd(
                     {'action': cookie,
                      'ln': argd['ln'],
                      'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
                     {})
        return redirect_to_url(req, target, norobot=True)

    uid = getUid(req)
    user_info = collect_user_info(req)
    if uid == -1:
        return page_not_authorized(
            req, "../",
            text=_("You are not authorized to view this area."),
            navmenuid='search')
    if uid > 0:
        preferences = get_user_preferences(uid)
        # fetch user rg preference only if not overridden via URL
        if not form.has_key('rg'):
            try:
                argd['rg'] = int(preferences['websearch_group_records'])
            except (KeyError, ValueError):
                pass

    # Cap the group size unless the user holds the 'runbibedit' right.
    if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS:
        if acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

    involved_collections = set(argd['c'])
    involved_collections.add(argd['cc'])

    # 'id'/'idb' and 'sysno'/'sysnb' are alternative spellings of
    # 'recid'/'recidb'.
    if argd['id'] > 0:
        argd['recid'] = argd['id']
    if argd['idb'] > 0:
        argd['recidb'] = argd['idb']
    if argd['sysno']:
        recid_from_sysno = find_record_from_sysno(argd['sysno'])
        if recid_from_sysno:
            argd['recid'] = recid_from_sysno
    if argd['sysnb']:
        recid_from_sysno = find_record_from_sysno(argd['sysnb'])
        if recid_from_sysno:
            argd['recidb'] = recid_from_sysno

    if argd['recid'] > 0:
        if argd['recidb'] > argd['recid']:
            # Hack to check if among the restricted collections
            # at least a record of the range is there and
            # then if the user is not authorized for that
            # collection.
            recids = intbitset(xrange(argd['recid'], argd['recidb']))
            restricted_collection_cache.recreate_cache_if_needed()
            for collname in restricted_collection_cache.cache:
                (auth_code, auth_msg) = acc_authorize_action(
                    user_info, VIEWRESTRCOLL, collection=collname)
                if not auth_code:
                    continue
                # NOTE(review): non-guest users are refused here without
                # looking at the record range -- confirm this is intended.
                if user_info['email'] != 'guest':
                    return page_not_authorized(req, "../",
                                               text=auth_msg,
                                               navmenuid='search')
                coll_recids = get_collection(collname).reclist
                if coll_recids & recids:
                    return redirect_guest_to_login(collname)
        else:
            involved_collections.add(
                guess_primary_collection_of_a_record(argd['recid']))

    # If any of the collection requires authentication, redirect
    # to the authentication form.
    for coll in involved_collections:
        if not collection_restricted_p(coll):
            continue
        (auth_code, auth_msg) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=coll)
        if not auth_code:
            continue
        if user_info['email'] == 'guest':
            return redirect_guest_to_login(coll)
        return page_not_authorized(req, "../",
                                   text=auth_msg,
                                   navmenuid='search')

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0:
        if argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0:
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    # Keep all the arguments, they might be reused in the
    # search_engine itself to derivate other queries
    req.argd = argd

    # mod_python does not like to return [] in case when of=id:
    result = perform_request_search(req, **argd)
    if result == []:
        return str(result)
    return result
def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
           rm="", of="", ot="", p1="", f1="", m1="", op1="",
           p2="", f2="", m2="", op2="", p3="", f3="", m3="",
           jrec=0, recid=-1, recidb=-1, d1="", d1y=0, d1m=0,
           d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", ap=0,
           read_cache=True):
    """
    Returns records corresponding to the given search query.

    When no output format ('of') is given, MARCXML ("xm") is requested
    and the result is parsed with self._parse_results(); otherwise the
    raw output is returned (a list of record ids for of="id").  Remote
    queries are stored in self.cached_queries and served from there
    when read_cache is true.

    @param read_cache: if False, ignore previously cached remote results
    @raise InvenioConnectorAuthError: if authentication fails
    """
    parse_results = False
    if of == "":
        parse_results = True
        of = "xm"

    params = {'p': p, 'f': f, 'c': c, 'rg': rg,
              'sf': sf, 'so': so, 'sp': sp,
              'rm': rm, 'of': of,
              'p1': p1, 'f1': f1, 'm1': m1, 'op1': op1,
              'p2': p2, 'f2': f2, 'm2': m2, 'op2': op2,
              'p3': p3, 'f3': f3, 'm3': m3, 'jrec': jrec,
              'd1': d1, 'd1y': d1y, 'd1m': d1m, 'd1d': d1d,
              'd2': d2, 'd2y': d2y, 'd2m': d2m, 'd2d': d2d,
              'dt': dt, 'ap': ap, 'recid': recid, 'recidb': recidb,
              'ot': ot}
    # -1 means "no record id given": keep it out of the query string.
    if recid == -1:
        del params['recid']
    if recidb == -1:
        del params['recidb']
    params = urllib.urlencode(params, doseq=1)
    cache_key = params + str(parse_results)

    # Are we running locally? If so, better directly access the
    # search engine directly
    if self.server_url in LOCAL_SITE_URLS and of != 't':
        # See if user tries to search any restricted collection
        if c != "":
            # Treat tuples like lists: testing a whole tuple as a single
            # collection name would silently skip the restriction check.
            if isinstance(c, (list, tuple)):
                colls = c
            else:
                colls = [c]
            for collection in colls:
                if collection_restricted_p(collection):
                    if self.user:
                        self._check_credentials()
                        continue
                    raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")
        # Always fetch ids locally, then format them as requested.
        # 'sp' must receive the sort pattern, not the sort order.
        results = perform_request_search(p=p, f=f, c=c, rg=rg, sf=sf, so=so, sp=sp, rm=rm,
                                         p1=p1, f1=f1, m1=m1, op1=op1,
                                         p2=p2, f2=f2, m2=m2, op2=op2,
                                         p3=p3, f3=f3, m3=m3, jrec=jrec,
                                         recid=recid, recidb=recidb, of='id', ot=ot,
                                         d1=d1, d1y=d1y, d1m=d1m, d1d=d1d,
                                         d2=d2, d2y=d2y, d2m=d2m, d2d=d2d, dt=dt, ap=ap)
        if of.lower() != 'id':
            results = format_records(results, of)
    else:
        if read_cache and cache_key in self.cached_queries:
            return self.cached_queries[cache_key]
        if self.user:
            results = self.browser.open(self.server_url + "/search?" + params)
        else:
            results = urllib2.urlopen(self.server_url + "/search?" + params)
        if 'youraccount/login' in results.geturl():
            # Current user not able to search collection
            raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")

    if parse_results:
        # FIXME: we should not try to parse if results is string
        parsed_records = self._parse_results(results, self.cached_records)
        self.cached_queries[cache_key] = parsed_records
        return parsed_records

    # pylint: disable=E1103
    # The whole point of the following code is to make sure we can
    # handle two types of variable.
    try:
        res = results.read()
    except AttributeError:
        res = results
    # pylint: enable=E1103

    if of == "id":
        try:
            if isinstance(res, str):
                # Transform to list
                res = [int(recid_str.strip()) for recid_str in
                       res.strip("[]").split(",") if recid_str.strip() != ""]
            res.reverse()
        except (ValueError, AttributeError):
            res = []
    self.cached_queries[cache_key] = res
    return res
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """
    Answer question given by context.

    Return (relevance, html_string) where relevance is a number
    from 0 to 100 indicating how relevant to the question the
    answer is (see
    C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for
    details), and html_string being a formatted answer.

    The answer lists the collections whose cached name words match the
    stemmed query words, leaving out restricted collections the user
    is not permitted to see.  (0, '') is returned when the service does
    not apply or nothing matches.
    """
    from invenio.search_engine import \
         get_permitted_restricted_collections, \
         get_coll_i18nname, \
         collection_restricted_p

    _ = gettext_set_language(ln)

    # stem search units. remove those with field
    # TODO: search in hosted collection names too
    # TODO: ignore unattached trees
    # TODO: use synonyms
    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
           (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')

    # Stemming
    words = [stem(unit[1], ln) for unit in search_units
             if unit[2] in ('', 'collection')]
    if not words:
        return (0, '')

    permitted_restricted_collections = \
        get_permitted_restricted_collections(user_info)
    cache = self.get_data_cache()

    # Number of query words matched by each candidate collection.
    matching_collections = {}
    for word in words:
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue
        for coll in cache.get(word.lower(), []):
            if collection_restricted_p(coll) and \
                   coll not in permitted_restricted_collections:
                # Skip restricted collection user do not have access
                continue
            matching_collections[coll] = matching_collections.get(coll, 0) + 1

    if not matching_collections:
        return (0, '')

    # Best score first; ties are broken by collection name.
    matching_collections_sorted = sorted(matching_collections.items(),
                                         key=lambda item: (item[1], item[0]),
                                         reverse=True)

    # Links keep the language of the request instead of forcing English.
    matching_collections_names = [
        (get_coll_i18nname(coll, ln, False),
         CFG_SITE_URL + '/collection/' + urllib.quote(coll, safe='') + '?ln=' + ln)
        for coll, score in matching_collections_sorted]

    best_score = matching_collections_sorted[0][1]
    best_coll_words = whitespace_re.split(matching_collections_sorted[0][0])

    relevance = min(100, max(0, (100 * float(2 * best_score) /
                                 float(len(best_coll_words) + len(words)) - 10)))

    query = p.lower()
    submit_i18n = _('submit')
    query_mentions_submit = ('submit' in query) or (submit_i18n in query)
    coll_mentions_submit = ('submit' in best_coll_words) or \
                           (submit_i18n in best_coll_words)
    if query_mentions_submit and not coll_mentions_submit:
        # User is probably looking for a submission. Decrease relevance
        relevance = max(0, relevance - 30)

    return (relevance,
            self.display_answer_helper(matching_collections_names, ln))
def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
           rm="", of="", ot="", p1="", f1="", m1="", op1="",
           p2="", f2="", m2="", op2="", p3="", f3="", m3="",
           jrec=0, recid=-1, recidb=-1, d1="", d1y=0, d1m=0,
           d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", ap=0,
           read_cache=True):
    """
    Returns records corresponding to the given search query.

    When no output format ('of') is given, MARCXML ("xm") is requested
    and the result is parsed with self._parse_results(); otherwise the
    raw output is returned (a list of record ids for of="id").  Remote
    queries are stored in self.cached_queries and served from there
    when read_cache is true.

    @param read_cache: if False, ignore previously cached remote results
    @raise InvenioConnectorAuthError: if authentication fails
    """
    parse_results = False
    if of == "":
        parse_results = True
        of = "xm"

    params = {
        'p': p, 'f': f, 'c': c, 'rg': rg,
        'sf': sf, 'so': so, 'sp': sp,
        'rm': rm, 'of': of,
        'p1': p1, 'f1': f1, 'm1': m1, 'op1': op1,
        'p2': p2, 'f2': f2, 'm2': m2, 'op2': op2,
        'p3': p3, 'f3': f3, 'm3': m3, 'jrec': jrec,
        'd1': d1, 'd1y': d1y, 'd1m': d1m, 'd1d': d1d,
        'd2': d2, 'd2y': d2y, 'd2m': d2m, 'd2d': d2d,
        'dt': dt, 'ap': ap, 'recid': recid, 'recidb': recidb,
        'ot': ot,
    }
    # -1 means "no record id given": keep it out of the query string.
    if recid == -1:
        del params['recid']
    if recidb == -1:
        del params['recidb']
    params = urllib.urlencode(params, doseq=1)
    cache_key = params + str(parse_results)

    # Are we running locally? If so, better directly access the
    # search engine directly
    if self.local and of != 't':
        # See if user tries to search any restricted collection
        if c != "":
            # Treat tuples like lists: testing a whole tuple as a single
            # collection name would silently skip the restriction check.
            if isinstance(c, (list, tuple)):
                colls = c
            else:
                colls = [c]
            for collection in colls:
                if collection_restricted_p(collection):
                    if self.user:
                        self._check_credentials()
                        continue
                    raise InvenioConnectorAuthError(
                        "You are trying to search a restricted collection. Please authenticate yourself.\n"
                    )
        # Always fetch ids locally, then format them as requested.
        # 'sp' must receive the sort pattern, not the sort order.
        results = perform_request_search(
            p=p, f=f, c=c, rg=rg, sf=sf, so=so, sp=sp, rm=rm,
            p1=p1, f1=f1, m1=m1, op1=op1,
            p2=p2, f2=f2, m2=m2, op2=op2,
            p3=p3, f3=f3, m3=m3, jrec=jrec,
            recid=recid, recidb=recidb, of='id', ot=ot,
            d1=d1, d1y=d1y, d1m=d1m, d1d=d1d,
            d2=d2, d2y=d2y, d2m=d2m, d2d=d2d, dt=dt, ap=ap)
        if of.lower() != 'id':
            results = format_records(results, of)
    else:
        if read_cache and cache_key in self.cached_queries:
            return self.cached_queries[cache_key]
        if self.user:
            results = self.browser.open(self.server_url + "/search?" + params)
        else:
            results = urllib2.urlopen(self.server_url + "/search?" + params)
        if 'youraccount/login' in results.geturl():
            # Current user not able to search collection
            raise InvenioConnectorAuthError(
                "You are trying to search a restricted collection. Please authenticate yourself.\n"
            )

    if parse_results:
        # FIXME: we should not try to parse if results is string
        parsed_records = self._parse_results(results, self.cached_records)
        self.cached_queries[cache_key] = parsed_records
        return parsed_records

    # pylint: disable=E1103
    # The whole point of the following code is to make sure we can
    # handle two types of variable.
    try:
        res = results.read()
    except AttributeError:
        res = results
    # pylint: enable=E1103

    if of == "id":
        try:
            if isinstance(res, str):
                # Transform to list
                res = [int(recid_str.strip()) for recid_str in
                       res.strip("[]").split(",") if recid_str.strip() != ""]
            res.reverse()
        except (ValueError, AttributeError):
            res = []
    self.cached_queries[cache_key] = res
    return res
def __call__(self, req, form):
    """ Perform a search.

    POST requests are rejected.  Access to every involved restricted
    collection is checked before the search runs: guests are redirected
    to the login page, other unauthorized users get the
    "not authorized" page.
    """
    argd = wash_search_urlargd(form)
    _ = gettext_set_language(argd['ln'])

    if req.method == 'POST':
        raise apache.SERVER_RETURN(apache.HTTP_METHOD_NOT_ALLOWED)

    def redirect_guest_to_login(restricted_coll):
        # Build the login URL carrying an authorization cookie for the
        # collection and the URL to come back to.
        cookie = mail_cookie_create_authorize_action(
            VIEWRESTRCOLL, {'collection': restricted_coll})
        target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                 make_canonical_urlargd(
                     {'action': cookie,
                      'ln': argd['ln'],
                      'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
                     {})
        return redirect_to_url(req, target, norobot=True)

    uid = getUid(req)
    user_info = collect_user_info(req)
    if uid == -1:
        return page_not_authorized(
            req, "../",
            text=_("You are not authorized to view this area."),
            navmenuid='search')
    if uid > 0:
        preferences = get_user_preferences(uid)
        # fetch user rg preference only if not overridden via URL
        if not form.has_key('rg'):
            try:
                argd['rg'] = int(preferences['websearch_group_records'])
            except (KeyError, ValueError):
                pass

    # Cap the group size unless the user holds the 'runbibedit' right.
    if argd['rg'] > CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS:
        if acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['rg'] = CFG_WEBSEARCH_MAX_RECORDS_IN_GROUPS

    involved_collections = set(argd['c'])
    involved_collections.add(argd['cc'])

    # 'id'/'idb' and 'sysno'/'sysnb' are alternative spellings of
    # 'recid'/'recidb'.
    if argd['id'] > 0:
        argd['recid'] = argd['id']
    if argd['idb'] > 0:
        argd['recidb'] = argd['idb']
    if argd['sysno']:
        recid_from_sysno = find_record_from_sysno(argd['sysno'])
        if recid_from_sysno:
            argd['recid'] = recid_from_sysno
    if argd['sysnb']:
        recid_from_sysno = find_record_from_sysno(argd['sysnb'])
        if recid_from_sysno:
            argd['recidb'] = recid_from_sysno

    if argd['recid'] > 0:
        if argd['recidb'] > argd['recid']:
            # Hack to check if among the restricted collections
            # at least a record of the range is there and
            # then if the user is not authorized for that
            # collection.
            recids = intbitset(xrange(argd['recid'], argd['recidb']))
            restricted_collection_cache.recreate_cache_if_needed()
            for collname in restricted_collection_cache.cache:
                (auth_code, auth_msg) = acc_authorize_action(
                    user_info, VIEWRESTRCOLL, collection=collname)
                if not auth_code:
                    continue
                # NOTE(review): non-guest users are refused here without
                # looking at the record range -- confirm this is intended.
                if user_info['email'] != 'guest':
                    return page_not_authorized(req, "../",
                                               text=auth_msg,
                                               navmenuid='search')
                coll_recids = get_collection(collname).reclist
                if coll_recids & recids:
                    return redirect_guest_to_login(collname)
        else:
            involved_collections.add(
                guess_primary_collection_of_a_record(argd['recid']))

    # If any of the collection requires authentication, redirect
    # to the authentication form.
    for coll in involved_collections:
        if not collection_restricted_p(coll):
            continue
        (auth_code, auth_msg) = acc_authorize_action(
            user_info, VIEWRESTRCOLL, collection=coll)
        if not auth_code:
            continue
        if user_info['email'] == 'guest':
            return redirect_guest_to_login(coll)
        return page_not_authorized(req, "../",
                                   text=auth_msg,
                                   navmenuid='search')

    # check if the user has rights to set a high wildcard limit
    # if not, reduce the limit set by user, with the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0:
        if argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0:
            if acc_authorize_action(req, 'runbibedit')[0] != 0:
                argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    # only superadmins can use verbose parameter for obtaining debug information
    if not isUserSuperAdmin(user_info):
        argd['verbose'] = 0

    # Keep all the arguments, they might be reused in the
    # search_engine itself to derivate other queries
    req.argd = argd

    # mod_python does not like to return [] in case when of=id:
    result = perform_request_search(req, **argd)
    if isinstance(result, intbitset):
        return result.fastdump()
    if result == []:
        return str(result)
    return result
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """
    Answer question given by context.

    Return (relevance, html_string) where relevance is a number
    from 0 to 100 indicating how relevant to the question the
    answer is (see
    C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for
    details), and html_string being a formatted answer.

    The answer lists the collections whose cached name words match the
    stemmed query words, leaving out restricted collections the user
    is not permitted to see.  (0, '') is returned when the service does
    not apply or nothing matches.
    """
    from invenio.search_engine import \
         get_permitted_restricted_collections, \
         get_coll_i18nname, \
         collection_restricted_p

    _ = gettext_set_language(ln)

    # stem search units. remove those with field
    # TODO: search in hosted collection names too
    # TODO: ignore unattached trees
    # TODO: use synonyms
    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
           (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')

    # Stemming
    words = [stem(unit[1], ln) for unit in search_units
             if unit[2] in ('', 'collection')]
    if not words:
        return (0, '')

    permitted_restricted_collections = \
        get_permitted_restricted_collections(user_info)
    cache = self.get_data_cache()

    # Number of query words matched by each candidate collection.
    matching_collections = {}
    for word in words:
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue
        for coll in cache.get(word.lower(), []):
            if collection_restricted_p(coll) and \
                   coll not in permitted_restricted_collections:
                # Skip restricted collection user do not have access
                continue
            matching_collections[coll] = matching_collections.get(coll, 0) + 1

    if not matching_collections:
        return (0, '')

    # Best score first; ties are broken by collection name.
    matching_collections_sorted = sorted(matching_collections.items(),
                                         key=lambda item: (item[1], item[0]),
                                         reverse=True)

    # Links keep the language of the request instead of forcing English.
    matching_collections_names = [
        (get_coll_i18nname(coll, ln, False),
         CFG_SITE_URL + '/collection/' + urllib.quote(coll, safe='') + '?ln=' + ln)
        for coll, score in matching_collections_sorted]

    best_score = matching_collections_sorted[0][1]
    best_coll_words = whitespace_re.split(matching_collections_sorted[0][0])

    relevance = min(100, max(0, (100 * float(2 * best_score) /
                                 float(len(best_coll_words) + len(words)) - 10)))

    query = p.lower()
    submit_i18n = _('submit')
    query_mentions_submit = ('submit' in query) or (submit_i18n in query)
    coll_mentions_submit = ('submit' in best_coll_words) or \
                           (submit_i18n in best_coll_words)
    if query_mentions_submit and not coll_mentions_submit:
        # User is probably looking for a submission. Decrease relevance
        relevance = max(0, relevance - 30)

    return (relevance,
            self.display_answer_helper(matching_collections_names, ln))
def __call__(self, req, form):
    """RSS 2.0 feed service.

    Wash the request arguments, check access to every requested
    collection, then either replay a cached copy of the feed (when the
    cache file is younger than CFG_WEBSEARCH_RSS_TTL minutes) or run the
    search, stream the feed to ``req`` and store it in the cache
    directory ``CFG_CACHEDIR/rss``.

    @param req: request object; the feed is written to it
    @param form: submitted form/URL arguments
    @return: the result of redirect_to_url() or page_not_authorized()
        when access to a restricted collection is denied, otherwise
        None (the feed has been written to ``req``)
    @raise IOError: when writing the cache file fails for a reason other
        than a too long file name
    """
    # Function-scope import: only needed to recognise "file name too long".
    import errno

    # Keep only interesting parameters for the search
    default_params = websearch_templates.rss_default_urlargd
    # We need to keep 'jrec' and 'rg' here in order to have
    # 'multi-page' RSS. These parameters are not kept by default
    # as we don't want to consider them when building RSS links
    # from search and browse pages.
    # NOTE(review): this updates the templates' dictionary in place, so
    # the extra keys stay there for later users. The build_rss_url()
    # calls below presumably depend on that to keep 'jrec' -- confirm
    # before replacing this with a copy.
    default_params.update({'jrec': (int, 1),
                           'rg': (int, CFG_WEBSEARCH_INSTANT_BROWSE_RSS)})
    argd = wash_urlargd(form, default_params)
    user_info = collect_user_info(req)

    for coll in argd['c'] + [argd['cc']]:
        if not collection_restricted_p(coll):
            continue
        (auth_code, auth_msg) = acc_authorize_action(user_info, VIEWRESTRCOLL, collection=coll)
        if auth_code and user_info['email'] == 'guest':
            # Guests are sent to the login page and come back afterwards
            cookie = mail_cookie_create_authorize_action(VIEWRESTRCOLL, {'collection': coll})
            target = CFG_SITE_SECURE_URL + '/youraccount/login' + \
                     make_canonical_urlargd({'action': cookie,
                                             'ln': argd['ln'],
                                             'referer': CFG_SITE_SECURE_URL + req.unparsed_uri},
                                            {})
            return redirect_to_url(req, target, norobot=True)
        elif auth_code:
            return page_not_authorized(req, "../",
                                       text=auth_msg,
                                       navmenuid='search')

    # Create a standard filename with these parameters
    current_url = websearch_templates.build_rss_url(argd)
    cache_filename = current_url.split('/')[-1]
    cache_dir = "%s/rss" % (CFG_CACHEDIR)
    cache_path = "%s/rss/%s.xml" % (CFG_CACHEDIR, cache_filename)

    req.content_type = "application/rss+xml"
    req.send_http_header()

    # Try to read from cache. A missing, unreadable or expired cache
    # file simply means the feed is generated live below. The freshness
    # test is an explicit comparison (not an assert) so that it still
    # works under "python -O", and the file is opened in a "with" block
    # so the descriptor is always released.
    cached_rss = None
    try:
        last_update_time = datetime.datetime.fromtimestamp(
            os.stat(os.path.abspath(cache_path)).st_mtime)
        expires_at = last_update_time + datetime.timedelta(minutes=CFG_WEBSEARCH_RSS_TTL)
        if datetime.datetime.now() < expires_at:
            with open(cache_path, "r") as filedesc:
                cached_rss = filedesc.read()
    except (IOError, OSError):
        cached_rss = None

    if cached_rss is not None:
        req.write(cached_rss)
        return

    # Do it live and cache
    previous_url = None
    if argd['jrec'] > 1:
        prev_jrec = max(1, argd['jrec'] - argd['rg'])
        previous_url = websearch_templates.build_rss_url(argd, jrec=prev_jrec)

    # Check if the user has rights to set a high wildcard limit;
    # if not, reduce the limit set by user to the default one
    if CFG_WEBSEARCH_WILDCARD_LIMIT > 0 and \
           (argd['wl'] > CFG_WEBSEARCH_WILDCARD_LIMIT or argd['wl'] == 0):
        if acc_authorize_action(req, 'runbibedit')[0] != 0:
            argd['wl'] = CFG_WEBSEARCH_WILDCARD_LIMIT

    req.argd = argd
    recIDs = perform_request_search(req, of="id",
                                    c=argd['c'], cc=argd['cc'],
                                    p=argd['p'], f=argd['f'],
                                    p1=argd['p1'], f1=argd['f1'],
                                    m1=argd['m1'], op1=argd['op1'],
                                    p2=argd['p2'], f2=argd['f2'],
                                    m2=argd['m2'], op2=argd['op2'],
                                    p3=argd['p3'], f3=argd['f3'],
                                    m3=argd['m3'], wl=argd['wl'])
    nb_found = len(recIDs)

    next_url = None
    if nb_found >= argd['jrec'] + argd['rg']:
        next_url = websearch_templates.build_rss_url(argd,
                                                     jrec=(argd['jrec'] + argd['rg']))

    first_url = websearch_templates.build_rss_url(argd, jrec=1)
    last_url = websearch_templates.build_rss_url(argd, jrec=nb_found - argd['rg'] + 1)

    # Walk the result list backwards from the end: at most 'rg' records,
    # starting 'jrec' positions from the end of the list.
    recIDs = recIDs[-argd['jrec']:(-argd['rg'] - argd['jrec']):-1]

    rss_prologue = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                   websearch_templates.tmpl_xml_rss_prologue(current_url=current_url,
                                                             previous_url=previous_url,
                                                             next_url=next_url,
                                                             first_url=first_url,
                                                             last_url=last_url,
                                                             nb_found=nb_found,
                                                             jrec=argd['jrec'],
                                                             rg=argd['rg'],
                                                             cc=argd['cc']) + '\n'
    req.write(rss_prologue)
    # NOTE(review): req is handed to format_records, so the records are
    # presumably streamed to the client by that call while the returned
    # text is kept for the cache -- confirm against format_records.
    rss_body = format_records(recIDs,
                              of='xr',
                              ln=argd['ln'],
                              user_info=user_info,
                              record_separator="\n",
                              req=req, epilogue="\n")
    rss_epilogue = websearch_templates.tmpl_xml_rss_epilogue() + '\n'
    req.write(rss_epilogue)

    # Update cache
    mymkdir(cache_dir)
    try:
        # Remove the file just in case it already existed
        # so that a bit of space is created
        os.remove(cache_path)
    except OSError:
        pass

    # Check if there's enough space to cache the request.
    if len(os.listdir(cache_dir)) < CFG_WEBSEARCH_RSS_MAX_CACHED_REQUESTS:
        try:
            # The umask is changed for the whole process and is not
            # restored afterwards (unchanged from the original code).
            os.umask(0o22)
            with open(cache_path, "w") as fd:
                fd.write(rss_prologue + rss_body + rss_epilogue)
        except IOError as v:
            # URL was too long for a file name. Never mind, don't cache.
            if v.errno != errno.ENAMETOOLONG:
                raise
def is_restricted(self):
    """Report whether this collection is restricted.

    Hands ``self.name`` to ``collection_restricted_p`` from
    ``invenio.search_engine`` and returns that function's result
    unchanged.
    """
    # The import is done at call time rather than at module level.
    from invenio.search_engine import collection_restricted_p as restricted_p
    restricted = restricted_p(self.name)
    return restricted