def show_node(req):
    """ opens administration window with content """
    # path after the leading slash, split into segments; p[0] selects the admin module
    p = req.path[1:].split("/")
    style = req.params.get("style", u"")
    user = current_user
    # template variables for the admin frame
    v = {}
    v["user"] = user
    v["guestuser"] = get_guest_user().login_name
    v["version"] = core.__version__
    v["content"] = show_content(req, p[0])
    v["navigation"] = adminNavigation()
    v["breadcrumbs"] = getMenuItemID(v["navigation"], req.path[1:])
    # static sub-header menu entries shown on every admin page
    spc = [Menu("sub_header_frontend", u"/"),
           Menu("sub_header_edit", u"/edit"),
           Menu("sub_header_logout", u"/logout")]
    if user.is_workflow_editor:
        spc.append(Menu("sub_header_workflow", u"../publish/"))
    v["spc"] = spc
    # NOTE(review): len(p) > 0 is always true for a str.split() result
    if len(p) > 0:
        if style == "":
            # default: render content inside the full admin frame template
            req.writeTAL("web/admin/frame.html", v, macro="frame")
        else:
            # any non-empty style: write the raw content only
            req.write(v["content"])
def create():
    """ Creates the sitemap files and the sitemap index files which are located at /web/root/

    Nodes readable by the guest user are grouped by node type; each group
    becomes one sitemap file (split into several files when a group exceeds
    the 50,000-URL limit of the sitemap protocol), and a sitemap index
    referencing all generated sitemaps is written last.
    """
    logging.getLogger('everything').info('Creating Sitemaps and Sitemap Index...')
    from core.users import get_guest_user
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
    hostname = config.get('host.name')
    root = q(Collections).one()
    guest_user = get_guest_user()
    all_nodes = root.all_children_by_query(q(Node)).filter_read_access(user=guest_user)
    sitemaps = []
    node_dict = {'collection': [],
                 'directory': [],
                 'document': [],
                 'dissertation': [],
                 'image': [],
                 'video': [],
                 'audio': [],
                 }
    for node in all_nodes:
        # Arkitekt had a guest field that is actually not visible
        if node.has_read_access(user=guest_user):
            for node_type in node_dict.keys():
                # FIX: use the node already loaded by the query instead of
                # re-fetching it twice per type via q(Node).get(node.id)
                if node_type in node.type:
                    node_dict[node_type].append((unicode(node.id), node.updatetime))
    # Reassign node_dict to a dict where empty values were removed
    node_dict = dict((k, v) for k, v in node_dict.iteritems() if v)
    # Sitemap can have at most 50k entries
    for key in node_dict.keys():
        if key in ('dissertation', 'document', 'image'):
            priority_level = '1.0'
        # FIX: the dict key is 'video', not 'videos'; the old comparison never
        # matched, so video nodes silently got the default priority 0.5
        elif key == 'video':
            priority_level = '0.8'
        else:
            priority_level = '0.5'
        # Create multiple sitemaps for node lists > 50k
        if len(node_dict[key]) > 50000:
            partitions = int(ceil((len(node_dict[key]) / 50000.)))
            for partition_number in range(partitions):
                sitemap = Sitemap(base_dir, ''.join(['sitemap-', str(key), str(partition_number), '.xml']), hostname)
                sitemaps.append(sitemap.name)
                sitemap.create_sitemap(node_dict[key][partition_number * 50000:(partition_number + 1) * 50000],
                                       priority_level)
        else:
            sitemap = Sitemap(base_dir, ''.join(['sitemap-', key, '.xml']), hostname)
            sitemaps.append(sitemap.name)
            sitemap.create_sitemap(node_dict[key], priority_level)
    siteindex = SitemapIndex(base_dir, 'sitemap-index.xml', hostname)
    # lastmod timestamp with a fixed +02:00 offset -- assumes CEST local time
    now = '+'.join([datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), '02:00'])
    siteindex.create_sitemap_index(sitemaps, now)
    logging.getLogger('everything').info('Generation of Sitemaps and SitemapIndex Complete')
def has_access_to_node_id(node_id, accesstype, user=None, ip=None, date=None):
    """Check whether `user` may access node `node_id` with `accesstype`.

    Missing arguments fall back to the guest user, the null IP address
    and the database's current date. Admin users always pass.
    """
    # XXX: the database-independent code could move to core.node
    from core import db
    from core.users import get_guest_user

    effective_user = get_guest_user() if user is None else user
    # administrators bypass rule evaluation entirely
    if effective_user.is_admin:
        return True
    effective_ip = IPv4Address("0.0.0.0") if ip is None else ip
    effective_date = func.current_date() if date is None else date
    # evaluate the DB permission procedure for this access type
    check_expr = access_funcs[accesstype](node_id, effective_user.group_ids,
                                          effective_ip, effective_date)
    return db.session.execute(select([check_expr])).scalar()
def GetRecord(req):
    """Handle the OAI-PMH 'GetRecord' verb.

    Writes the record XML for the node addressed by the 'identifier'
    request parameter in the format given by 'metadataPrefix', or an
    OAI error element on bad arguments or missing permissions.
    """
    if "identifier" in req.params:
        nid = identifier2id(req.params.get("identifier"))
        if nid is None:
            return writeError(req, "idDoesNotExist")
    else:
        # identifier is mandatory for GetRecord
        return writeError(req, "badArgument")
    metadataformat = req.params.get("metadataPrefix", None)
    if not checkMetaDataFormat(metadataformat):
        return writeError(req, "badArgument")
    node = q(Node).get(nid)
    if node is None:
        return writeError(req, "idDoesNotExist")
    # formats with a registered filter must accept this node, otherwise deny
    if metadataformat and (metadataformat.lower() in FORMAT_FILTERS.keys()) and not filterFormat(node, metadataformat.lower()):
        return writeError(req, "noPermission")
    if parentIsMedia(node):
        return writeError(req, "noPermission")
    # OAI output is public, so only nodes readable by the guest user are served
    if not node.has_read_access(user=get_guest_user()):
        return writeError(req, "noPermission")
    schema_name = node.getSchema()
    mask = get_oai_export_mask_for_schema_name_and_metadataformat(schema_name, metadataformat)
    req.write('<GetRecord>')
    writeRecord(req, node, metadataformat, mask=mask)
    req.write('</GetRecord>')
    if DEBUG:
        timetable_update(req, "leaving GetRecord")
def search_nodes(query, mapping_prefix='Z3950_search_'):
    """ Search nodes that match the query.
    'query' is a tree of QueryBoolNode and QueryMatchNode objects.
    Query root nodes are configured by a naming convention. The names of mappings that
    starting with the given 'mapping_prefix' must end with a node ID, which is then used
    as root node for the search based on that field mapping.

    :returns: list of matching node IDs (one search is run per configured root)
    """
    def get_root_for_mapping(mapping_node):
        # by convention, the mapping name ends with the root node id
        name = mapping_node.name
        node_id = name[len(mapping_prefix):]
        node = q(Node).get(node_id)
        return node

    mapping_nodes = q(Mapping).filter(Mapping.name.startswith(mapping_prefix))
    roots_and_mappings = [(get_root_for_mapping(m), m) for m in mapping_nodes]
    if not roots_and_mappings:
        logg.info('no mappings configured, skipping search')
        return []
    # FIX: a mapping whose root node does not exist yields None here; guard
    # the debug output so it cannot crash with an AttributeError on .id
    logg.debug('using mapping roots: %s',
               [(root.id if root is not None else None, m.id) for (root, m) in roots_and_mappings])
    # run one search per root node
    node_ids = []
    guest = get_guest_user()
    search_languages = get_service_search_languages()
    for root_node, mapping_node in roots_and_mappings:
        if root_node is None:
            logg.error("Configuration problem detected: Z39.50 search mapping '%s' found, but no matching root node",
                       mapping_node.name)
            continue
        # map query fields to node attributes
        field_mapping = {}
        for field in mapping_node.children:
            field_mapping[field.name] = field.getDescription().split(';')
        # XXX: this is redundant - why build an infix query string
        # XXX: just to parse it afterwards?
        # XXX: better: create search tree and apply it to a query instead of using node.search()
        query_string = query.build_query_string(field_mapping)
        # FIX: test for an unmappable query *before* parsing; the old code
        # passed None to parse_searchquery_old_style first
        if query_string is None:
            logg.info('unable to map query: [%r] using mapping %s', query, field_mapping)
            continue
        searchtree = search.parse_searchquery_old_style(query_string)
        logg.info('executing query for node %s: %s', root_node.id, query_string)
        for n in root_node.search(searchtree, search_languages).filter_read_access(user=guest):
            node_ids.append(n.id)
    # use a round-robin algorithm to merge the separate query results
    # in order to produce maximally diverse results in the first hits
    # return merge_ids_as_round_robin(node_ids)
    return node_ids
def ListMetadataFormats(req):
    """Handle the OAI-PMH 'ListMetadataFormats' verb.

    The supported formats come from the 'oai.formats' entry in mediatum.cfg;
    when an 'identifier' parameter is given, only the formats applicable to
    that node are listed.
    """
    if "set" in req.params:
        # 'set' is not a legal argument for this verb
        return writeError(req, "badArgument")
    # supported oai metadata formats are configured in section
    # oai.formats in the mediatum.cfg file
    d = config.getsubset('oai')
    formats = [x.strip() for x in d['formats'].split(',') if x.strip()]
    if "identifier" in req.params:
        # list only formats available for the given identifier
        try:
            nid = identifier2id(req.params.get("identifier"))
            if nid is None:
                return writeError(req, "idDoesNotExist")
            node = q(Node).get(nid)
        except (TypeError, KeyError):
            return writeError(req, "badArgument")
        if node is None:
            return writeError(req, "badArgument")
        # OAI output is public: the node must be readable by the guest user
        if not node.has_read_access(user=get_guest_user()):
            return writeError(req, "noPermission")
        formats = [x for x in formats if nodeHasOAIExportMask(node, x.lower())]
        formats = [x for x in formats if filterFormat(node, x.lower())]
    # write xml for metadata formats list
    req.write('\n <ListMetadataFormats>\n')
    for mdf in formats:
        try:
            req.write("""
  <metadataFormat>
   <metadataPrefix>%s</metadataPrefix>
   <schema>%s</schema>
   <metadataNamespace>%s</metadataNamespace>
  </metadataFormat>
  """ % (mdf, d["schema.%s" % mdf], d["namespace.%s" % mdf]))
        except:
            # best effort: a format without schema/namespace config is skipped
            logg.exception("%s: OAI error reading oai metadata format %s from config file", __file__, mdf)
    req.write('\n</ListMetadataFormats>')
    if DEBUG:
        timetable_update(req, "leaving ListMetadataFormats")
def getAccessRights(node):
    """ Get acccess rights for the public. The values returned descend from
        http://wiki.surffoundation.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights.
        This values are used by OpenAIRE portal.
    """
    try:
        # if node.get('updatetime') is empty, the method parse_date would raise an exception
        l_date = parse_date(node.get('updatetime'))
    except:
        l_date = date.now()
    guest_user = get_guest_user()
    if date.now() < l_date:
        # updatetime lies in the future -> treat as an embargo date
        return "embargoedAccess"
    elif node.has_read_access(user=guest_user):
        if node.has_data_access(user=guest_user):
            return "openAccess"
        else:
            # metadata is public but the files are not
            return "restrictedAccess"
    else:
        return "closedAccess"
def getLinks(self):
    """Build the sub-header navigation links for the current user.

    Returns a list of Link objects: logout (or login for the guest user)
    plus conditional links for frontend, edit, admin, workflow and
    password change depending on the user's permissions.
    """
    guest_user = get_guest_user()
    # default: a logged-in user gets a logout link
    l = [Link("/logout", t(self.language, "sub_header_logout_title"),
              t(self.language, "sub_header_logout"), icon="/img/logout.gif")]
    if self.user == guest_user:
        # guest: show a login link instead; use https when config.ssh is enabled
        if config.get("config.ssh") == "yes":
            host = config.get("host.name") or self.host
            l = [Link("https://" + host + "/login", t(self.language, "sub_header_login_title"),
                      t(self.language, "sub_header_login"), icon="/img/login.gif")]
        else:
            l = [Link("/login", t(self.language, "sub_header_login_title"),
                      t(self.language, "sub_header_login"), icon="/img/login.gif")]
    if self.is_workflow_area:
        # link back from the workflow area to the frontend
        l += [Link("/", t(self.language, "sub_header_frontend_title"),
                   t(self.language, "sub_header_frontend"), icon="/img/frontend.gif")]
    if self.user.is_editor:
        idstr = ""
        if self.id:
            idstr = "?id=" + unicode(self.id)
        # set edit-link to upload_dir if user comes from collections
        if not self.id or int(self.id) == get_collections_node().id:
            if self.user.upload_dir:
                idstr = "?id=" + unicode(self.user.upload_dir.id)
        l += [Link("/edit" + idstr, t(self.language, "sub_header_edit_title"),
                   t(self.language, "sub_header_edit"), icon="/img/edit.gif")]
    if self.user.is_admin:
        l += [Link("/admin", t(self.language, "sub_header_administration_title"),
                   t(self.language, "sub_header_administration"), icon="/img/admin.gif")]
    if self.user.is_workflow_editor:
        l += [Link("/publish/", t(self.language, "sub_header_workflow_title"),
                   t(self.language, "sub_header_workflow"), icon="/img/workflow.gif")]
    if self.user.can_change_password:
        l += [Link("/pwdchange", t(self.language, "sub_header_changepwd_title"),
                   t(self.language, "sub_header_changepwd"), "_parent", icon="/img/changepwd.gif")]
    return l
def build_accessfunc_arguments(user=None, ip=None, date=None, req=None):
    """Build the expected arguments for the DB permission procedures has_*_access_to_node()
    IP and date are returned unchanged when passed to this function.
    For missing arguments, default values are set from request information or current date.

    :returns: 3-tuple of group_ids, ip and date
        For admin users, it returns (None, None, None) which means: ignore all access checks.
        Users can test for this and skip permission checks completely.
    """
    from core.users import get_guest_user
    if user is None and ip is None:
        # derive user and IP from the current request
        if req is None:
            req = request
        from core.users import user_from_session
        user = user_from_session(req.session)
        # XXX: like in mysql version, what's the real solution?
        try:
            ip = IPv4Address(req.remote_addr)
        except AddressValueError:
            # malformed remote address: keep ip=None so only non-IP rules apply
            logg.warn("illegal IP address %s, refusing IP-based access", req.remote_addr)
            ip = None
    if user is None:
        user = get_guest_user()
    # admin sees everything ;)
    if user.is_admin:
        return (None, None, None)
    if ip is None:
        # unroutable placeholder address: matches no IP-based access rule
        ip = IPv4Address("0.0.0.0")
    if date is None:
        date = sqlfunc.current_date()
    return user.group_ids, ip, date
from core.database.postgres.node import t_noderelation
from core.database.postgres import alchemyext

# change this to True in your IPython notebook after running mediatumipython.py
IPYTHON_NOTEBOOK = False

# use default connection specified by mediatum config for ipython-sql magic
SQLMAGICS_CONNECTION_FACTORY = lambda: core.db.connectstr
# TODO: changing the connection string should be possible for the postgres connector, too

from core.users import get_guest_user
# best effort: an unconfigured database leaves guest_user unset
try:
    guest_user = get_guest_user()
except:
    guest_user = None

# we don't want to raise warnings for missing node classes, just stub them and be silent
# (temporarily raise the core.init log level while stubbing, then restore it)
_core_init_loglevel = logging.getLogger("core.init").level
logging.getLogger("core.init").setLevel(logging.ERROR)
initmodule.check_undefined_nodeclasses(stub_undefined_nodetypes=True)
logging.getLogger("core.init").setLevel(_core_init_loglevel)

# re-export frequently used ORM classes for interactive use
from core import db, Node, File, NodeToFile
from core import User, UserGroup, AuthenticatorInfo
from core import AccessRule, AccessRuleset, NodeToAccessRule, NodeToAccessRuleset
from core import Fts, Setting
from core import app
def getLinks(self):
    """Return the sub-header Link objects appropriate for self.user.

    Always contains a logout link (replaced by a login link for the guest
    user); edit/admin/workflow/password-change links are appended according
    to the user's flags.
    """
    guest_user = get_guest_user()
    # logged-in users get a logout link by default
    l = [
        Link("/logout", t(self.language, "sub_header_logout_title"),
             t(self.language, "sub_header_logout"), icon="/img/logout.gif")
    ]
    if self.user == guest_user:
        # guest user: replace with a login link, https if config.ssh is set
        if config.get("config.ssh") == "yes":
            host = config.get("host.name") or self.host
            l = [
                Link("https://" + host + "/login", t(self.language, "sub_header_login_title"),
                     t(self.language, "sub_header_login"), icon="/img/login.gif")
            ]
        else:
            l = [
                Link("/login", t(self.language, "sub_header_login_title"),
                     t(self.language, "sub_header_login"), icon="/img/login.gif")
            ]
    if self.is_workflow_area:
        # inside the workflow area, offer a link back to the frontend
        l += [
            Link("/", t(self.language, "sub_header_frontend_title"),
                 t(self.language, "sub_header_frontend"), icon="/img/frontend.gif")
        ]
    if self.user.is_editor:
        idstr = ""
        if self.id:
            idstr = "?id=" + unicode(self.id)
        # set edit-link to upload_dir if user comes from collections
        if not self.id or int(self.id) == get_collections_node().id:
            if self.user.upload_dir:
                idstr = "?id=" + unicode(self.user.upload_dir.id)
        l += [
            Link("/edit" + idstr, t(self.language, "sub_header_edit_title"),
                 t(self.language, "sub_header_edit"), icon="/img/edit.gif")
        ]
    if self.user.is_admin:
        l += [
            Link("/admin", t(self.language, "sub_header_administration_title"),
                 t(self.language, "sub_header_administration"), icon="/img/admin.gif")
        ]
    if self.user.is_workflow_editor:
        l += [
            Link("/publish/", t(self.language, "sub_header_workflow_title"),
                 t(self.language, "sub_header_workflow"), icon="/img/workflow.gif")
        ]
    if self.user.can_change_password:
        l += [
            Link("/pwdchange", t(self.language, "sub_header_changepwd_title"),
                 t(self.language, "sub_header_changepwd"), "_parent", icon="/img/changepwd.gif")
        ]
    return l
def get_node_data_struct(req, path, params, data, id, debug=True,
                         allchildren=False, singlenode=False, parents=False,
                         send_children=False, fetch_files=False, csv=False):
    """Build the service-response dict for node `id`.

    Authenticates via OAuth signature when a 'user' query parameter is given,
    otherwise acts as the guest user. Selects children/parents/descendants of
    the node according to the flags, applies search/type/attribute filters,
    sorting, offset/limit and read-access filtering, and returns a dict with
    the node list, sort metadata, timing information and response status.
    On failure returns the dict produced by _client_error_response().
    """
    res = _prepare_response()
    timetable = res["timetable"]
    # verify signature if a user is given, otherwise use guest user
    if params.get('user'):
        user = _handle_oauth(res, req.fullpath, params, timetable)
    else:
        user = get_guest_user()
    res['oauthuser'] = ''  # username supplied for authentication (login name) in query parameter user
    if user is not None:
        res['username'] = user.login_name
        res['userid'] = user.id
    else:
        res['userid'] = ''  # unique id for authenticated user if applicable (node.id for internal, dirid for dynamic users)
        res['username'] = ''  # name of the user, may be the name of the guest user or a personal name
    result_shortlist = []
    # query parameters
    typefilter = params.get('type', '')  # return only nodes of given type like dissertation/diss
    parent_type = params.get('parent_type', '')  # return only nodes that have only parents of given type like folder or collection
    # XXX: do we want version support?
    # send_versions = params.get('send_versions', '').lower()  # return also nodes that are older versions of other nodes
    # return only nodes that have an EXIF location that lies between the given lon,lat values
    exif_location_rect = params.get('exif_location_rect', '')
    mdt_name = params.get('mdt_name', '')
    attrreg = params.get('attrreg', '')
    searchquery = params.get('q', '')  # node query
    sortfield = params.get('sortfield', '')
    sortformat = params.get('sortformat', '')  # 'sissfi'
    limit = params.get("limit", DEFAULT_NODEQUERY_LIMIT)
    offset = params.get("start", 0)
    csv_allchildren = csv and allchildren
    # check node existence
    node = q(Node).get(id)
    if node is None:
        return _client_error_response(404, u"node not found")
    home = get_home_root_node()
    collections = get_collections_node()
    # check node access: node must be readable and live under home or collections
    if node.has_read_access(user=user) and (node.is_descendant_of(collections) or node.is_descendant_of(home)):
        pass
    else:
        return _client_error_response(403, u"forbidden")
    if mdt_name:
        mdt = q(Metadatatype).filter_by(name=mdt_name).count()
        if not mdt:
            return _client_error_response(404, u'no such metadata type: ' + mdt_name)
    # choose the base node set according to the flags
    if allchildren:
        if csv:
            # fetch only those columns which are needed, this is faster than fetch all columns and need less space
            nodequery = node.all_children_by_query(
                q(Node.attrs.label("attributes"), Node.id, Node.name, Node.schema, Node.type))
        else:
            nodequery = node.all_children
    elif parents:
        nodequery = node.parents
    else:
        nodequery = node.children
    if searchquery:
        search_languages = get_service_search_languages()
        try:
            searchtree = search.parse_searchquery_old_style(searchquery)
        except search.SearchQueryException as e:
            return _client_error_response(400, str(e))
        nodequery = apply_searchtree_to_query(nodequery, searchtree, search_languages)
    if typefilter:
        # regex match against "type/schema"
        nodequery = nodequery.filter((Node.type + "/" + Node.schema).op("~")(typefilter))
    if attrreg:
        # attrreg has the form "attrname=regex"
        spl = attrreg.split('=')
        if len(spl) != 2:
            return _client_error_response(400, "wrong attrreg value: " + attrreg)
        akey, aval = spl
        nodequery = nodequery.filter(Node.attrs[akey].astext.op("~")(aval))
    sortdirection = u""
    if sortfield:
        sfields = [x.strip() for x in sortfield.split(',')]
        sfields_without_sign = []
        # one format char per sort field; missing chars default to "s" (string)
        sortformat = sortformat[:len(sfields)]
        for sfield, sformat in izip_longest(sfields, sortformat, fillvalue="s"):
            if sformat == "i":
                astype = Integer
            elif sformat == "f":
                astype = Float
            else:
                astype = Unicode
            # leading "-" means descending order
            if sfield[0] == "-":
                sfield = sfield[1:]
                desc = True
                sortdirection += u"d"
            else:
                desc = False
                sortdirection += u"u"
            sfields_without_sign.append(sfield)
            # built-in columns sort directly; anything else sorts by attribute
            if sfield == 'node.id':
                order_expr = Node.id
            elif sfield == 'node.name':
                order_expr = Node.name
            elif sfield == 'node.type':
                order_expr = Node.type
            elif sfield == 'node.orderpos':
                order_expr = Node.orderpos
            else:
                order_expr = Node.attrs[sfield].cast(astype)
            if desc:
                order_expr = sql.desc(order_expr)
            nodequery = nodequery.order_by(order_expr.nullslast())
        sfields = sfields_without_sign
    else:
        sfields = []
    ### TODO: do we need this?
    if parent_type:
        raise NotImplementedError("parent_type not supported at the moment")
        # XXX: do we need this?
        pass
    ### actually get the nodes
    if csv_allchildren:
        nodequery = nodequery.order_by('attributes').distinct()
    else:
        nodequery = nodequery.distinct().options(undefer(Node.attrs))
    if fetch_files:
        nodequery = nodequery.options(joinedload(Node.file_objects))
    if singlenode:
        # we already checked that node can be accessed by the user, just return the node
        nodelist = [node]
        node_count = 1
        limit = 1
    else:
        if mdt_name:
            nodequery = nodequery.filter(Node.schema == mdt_name)
        nodequery = nodequery.filter_read_access(user=user)
        if offset:
            nodequery = nodequery.offset(offset)
        if limit:
            nodequery = nodequery.limit(limit)
        atime = time.time()
        try:
            nodelist = nodequery.all()
        except Exception as e:
            return _client_error_response(400, "the database failed with the message: {}".format(str(e)))
        node_count = len(nodelist)
        timetable.append(['fetching nodes from db returned {} results'.format(node_count),
                          time.time() - atime])
        # NOTE(review): atime is only (re)set on this branch; the shortlist
        # timing below would raise NameError for singlenode requests -- confirm
        atime = time.time()
    # shortlist window bounds
    i0 = int(params.get('i0', '0'))
    i1 = int(params.get('i1', node_count))

    def attr_list(node, sfields):
        # [[sortfield, value], ...] pairs for the shortlist entries
        r = []
        for sfield in sfields:
            r.append([sfield, node.get(sfield)])
        return r

    if 'add_shortlist' in params:
        if sortfield:
            result_shortlist = [[i, x.id, x.name, x.type, attr_list(x, sfields)]
                                for i, x in enumerate(nodelist)][i0:i1]
            timetable.append(['build result_shortlist for %d nodes and %d sortfields'
                              % (len(result_shortlist), len(sfields)), time.time() - atime])
            atime = time.time()
        else:
            result_shortlist = [[i, x.id, x.name, x.type] for i, x in enumerate(nodelist)][i0:i1]
            timetable.append(['build result_shortlist for %d nodes (no sortfield)'
                              % len(result_shortlist), time.time() - atime])
            atime = time.time()
    ### XXX: filtering in python, should be moved to the database
    if exif_location_rect:
        raise NotImplementedError("not supported at the moment")
        # unreachable legacy implementation kept below
        components = exif_location_rect.split(',')
        if len(components) != 4:
            return _client_error_response(400, u"exif_location_rect is invalid: {}".format(exif_location_rect))
        nodelist = _exif_location_filter(nodelist, components)
    ### build result
    res['nodelist'] = nodelist
    res['sfields'] = sfields
    res['sortfield'] = sortfield
    res['sortdirection'] = sortdirection
    res['result_shortlist'] = result_shortlist
    res['timetable'] = timetable
    res['nodelist_start'] = offset
    res['nodelist_limit'] = limit
    res['nodelist_count'] = node_count
    res['path'] = req.path
    res['status'] = 'ok'
    res['html_response_code'] = '200'  # ok
    res['build_response_end'] = time.time()
    dataready = "%.3f" % (res['build_response_end'] - res["build_response_start"])
    res['dataready'] = dataready
    return res
def struct2rss(req, path, params, data, struct, debug=False, singlenode=False, send_children=False):
    """Render the node list in `struct` as an RSS channel (UTF-8 bytes).

    Each node becomes one <item>: via its 'rss' export mask when one exists,
    otherwise built from the 'nodesmall' mask (or a plain fallback list of
    id/name/type). Items are sorted newest-first by update time.
    """
    nodelist = struct['nodelist']
    language = params.get('lang', 'en')
    items_list = []
    host = u"http://" + unicode(_get_header(req, "HOST") or configured_host)
    collections = get_collections_node()
    user = get_guest_user()
    for n in nodelist:
        nodename = n.name
        nodeid = str(n.id)
        updatetime = utime = try_node_date(n)
        # categories to be included in all items - mask generated or not
        default_categories = u'<category>node type: ' + n.type + '/' + n.schema + u'</category>\r\n'
        # check for export mask for this node
        try:
            try:
                mdt = n.metadatatype
            except:
                mdt = None
            mask = mdt.getMask('rss')
            # only masks of type 'export' are usable here
            if mask.get('masktype') != 'export':
                mask = None
        except:
            # no metadatatype / no rss mask: fall through to the default item
            mask = None
        if mask:
            item_xml = u'<item>\r\n' + mask.getViewHTML([n], flags=8) + default_categories + u'\r\n</item>\r\n'
            items_list = items_list + [(updatetime, nodename, nodeid, item_xml)]
            continue
        # no rss export mask: build default item from nodesmall mask
        item_d = {}
        browsingPathList = getBrowsingPathList(n)
        # keep only paths whose leaf is publicly readable and below collections
        browsingPathList = [x for x in browsingPathList
                            if x[-1].has_read_access(user=user) and x[-1].is_descendant_of(collections)]
        browsingPathList_names = [map(lambda x: x.name, browsingPath) for browsingPath in browsingPathList]
        # assumption: longest path is most detailled and illustrative for being used in the title
        x = sorted([[len(p), i, p] for i, p in enumerate(browsingPathList_names)])
        x.reverse()
        try:
            most_detailed_path = x[0][2]
        except:
            # browsing path list may be empty (for directories, collections, ...)
            most_detailed_path = ''
        item_d['title'] = esc(u"{} ({}, {}/{}) {}".format(nodename or u'-unnamed-node-', nodeid,
                                                          n.type, n.schema, u"/".join(most_detailed_path)))
        item_d['item_pubDate'] = utime
        item_d['guid'] = host + u'/node?id=%s' % nodeid
        item_d['link'] = host + u'/node?id=%s' % nodeid
        # prefer a language-specific nodesmall mask when available
        if mdt:
            lang_mask = mdt.masks.filter(Node.name.startswith(u"nodesmall")).filter(Node.a.language == language).first()
            if lang_mask is not None:
                mask = lang_mask
            else:
                mask = mdt.get_mask('nodesmall')
        else:
            mask = None
        if mask is not None:
            attr_list = mask.getViewHTML([n], VIEW_DATA_ONLY, language)  # [[attr_name, value, label, type], ...]
        else:
            # fallback: expose only id, name and type
            attr_list = [
                ['', n.id, 'node id', ''],
                ['', n.name, 'node name', ''],
                ['', n.type + "/" + n.schema, 'node type', ''],
            ]
        description = u''
        for x in attr_list:
            description = description + (u'''<b>%s: </b>%s<br/>\r\n''' % (x[2], x[1]))
        item_d['description'] = description
        # categories: browsing paths plus semicolon-separated ddc and subject attributes
        categories = default_categories
        for x in browsingPathList_names:
            categories = categories + u'<category>' + esc(u'/'.join(x)) + u'</category>\r\n'
        ddcs = n.get('ddc').strip()
        if ddcs.strip():
            ddcs = ddcs.split(';')
            for ddc in ddcs:
                categories = categories + u'<category>' + esc(ddc) + u'</category>\r\n'
        subjects = n.get('subject').strip()
        if subjects:
            subjects = subjects.split(';')
            for subject in subjects:
                categories = categories + u'<category>' + esc(subject) + u'</category>\r\n'
        item_d['categories'] = categories
        # NOTE(review): this loop copies each value onto itself -- a no-op
        for k, v in item_d.items():
            item_d[k] = v
        items_list = items_list + [(updatetime, nodename, nodeid, (template_rss_item % item_d))]
    if items_list:
        # newest first (tuples start with updatetime)
        items_list.sort()
        items_list.reverse()
    items = ''
    for x in items_list:
        items += (x[3] + u'\r\n')
    pubDate = lastBuildDate = format_date(format='rfc822')
    struct['dataready'] = (u"%.3f" % (time.time() - struct['build_response_start']))
    # fill in channel-level fields
    fcd = feed_channel_dict.copy()
    fcd['lang'] = u'de'
    fcd['pubdate'] = pubDate
    fcd['lastbuild'] = lastBuildDate
    fcd['link'] = host
    fcd['atom_link'] = host + req.fullpath
    fcd['image_title'] = 'testlogo'
    fcd['image_link'] = host + u'/img/testlogo.png'
    fcd['image_url'] = host + u'/img/testlogo.png'
    if 'feed_info' in params:
        for k, v in params['feed_info'].items():
            fcd[k] = v
    else:
        fcd['title'] = host + req.fullpath + req.query
    fcd['items'] = items
    s = template_rss_channel % fcd  # params['feed_info']
    return s.encode("utf8")
def search_nodes(query, mapping_prefix='Z3950_search_'):
    """ Search nodes that match the query.
    'query' is a tree of QueryBoolNode and QueryMatchNode objects.
    Query root nodes are configured by a naming convention. The names of mappings that
    starting with the given 'mapping_prefix' must end with a node ID, which is then used
    as root node for the search based on that field mapping.

    :returns: list of matching node IDs (one search is run per configured root)
    """
    def get_root_for_mapping(mapping_node):
        # by convention, the mapping name ends with the root node id
        name = mapping_node.name
        node_id = name[len(mapping_prefix):]
        node = q(Node).get(node_id)
        return node

    mapping_nodes = q(Mapping).filter(Mapping.name.startswith(mapping_prefix))
    roots_and_mappings = [(get_root_for_mapping(m), m) for m in mapping_nodes]
    if not roots_and_mappings:
        logg.info('no mappings configured, skipping search')
        return []
    # FIX: a mapping without a matching root node yields None here; guard the
    # debug output so it cannot crash with an AttributeError on .id
    logg.debug('using mapping roots: %s',
               [(root.id if root is not None else None, m.id) for (root, m) in roots_and_mappings])
    # run one search per root node
    node_ids = []
    guest = get_guest_user()
    search_languages = get_service_search_languages()
    for root_node, mapping_node in roots_and_mappings:
        if root_node is None:
            logg.error(
                "Configuration problem detected: Z39.50 search mapping '%s' found, but no matching root node",
                mapping_node.name)
            continue
        # map query fields to node attributes
        field_mapping = {}
        for field in mapping_node.children:
            field_mapping[field.name] = field.getDescription().split(';')
        # XXX: this is redundant - why build an infix query string
        # XXX: just to parse it afterwards?
        # XXX: better: create search tree and apply it to a query instead of using node.search()
        query_string = query.build_query_string(field_mapping)
        # FIX: test for an unmappable query *before* parsing; the old code
        # passed None to parse_searchquery_old_style first
        if query_string is None:
            logg.info('unable to map query: [%r] using mapping %s', query, field_mapping)
            continue
        searchtree = search.parse_searchquery_old_style(query_string)
        logg.info('executing query for node %s: %s', root_node.id, query_string)
        for n in root_node.search(searchtree, search_languages).filter_read_access(user=guest):
            node_ids.append(n.id)
    # use a round-robin algorithm to merge the separate query results
    # in order to produce maximally diverse results in the first hits
    # return merge_ids_as_round_robin(node_ids)
    return node_ids
def create():
    """ Creates the sitemap files and the sitemap index files which are located at /web/root/

    Publicly readable nodes are grouped by type; each group is written as one
    sitemap (split when the 50,000-URL protocol limit is exceeded) and a
    sitemap index referencing all generated files is written last.
    """
    logging.getLogger('everything').info('Creating Sitemaps and Sitemap Index...')
    from core.users import get_guest_user
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
    hostname = config.get('host.name')
    root = q(Collections).one()
    guest_user = get_guest_user()
    all_nodes = root.all_children_by_query(q(Node)).filter_read_access(user=guest_user)
    sitemaps = []
    node_dict = {
        'collection': [],
        'directory': [],
        'document': [],
        'dissertation': [],
        'image': [],
        'video': [],
        'audio': [],
    }
    for node in all_nodes:
        # Arkitekt had a guest field that is actually not visible
        if node.has_read_access(user=guest_user):
            for node_type in node_dict.keys():
                # FIX: use the node already loaded by the query instead of
                # re-fetching it twice per type via q(Node).get(node.id)
                if node_type in node.type:
                    node_dict[node_type].append((unicode(node.id), node.updatetime))
    # Reassign node_dict to a dict where empty values were removed
    node_dict = dict((k, v) for k, v in node_dict.iteritems() if v)
    # Sitemap can have at most 50k entries
    for key in node_dict.keys():
        if key in ('dissertation', 'document', 'image'):
            priority_level = '1.0'
        # FIX: the dict key is 'video', not 'videos'; the old comparison never
        # matched, so video nodes silently got the default priority 0.5
        elif key == 'video':
            priority_level = '0.8'
        else:
            priority_level = '0.5'
        # Create multiple sitemaps for node lists > 50k
        if len(node_dict[key]) > 50000:
            partitions = int(ceil((len(node_dict[key]) / 50000.)))
            for partition_number in range(partitions):
                sitemap = Sitemap(
                    base_dir,
                    ''.join(['sitemap-', str(key), str(partition_number), '.xml']),
                    hostname)
                sitemaps.append(sitemap.name)
                sitemap.create_sitemap(
                    node_dict[key][partition_number * 50000:(partition_number + 1) * 50000],
                    priority_level)
        else:
            sitemap = Sitemap(base_dir, ''.join(['sitemap-', key, '.xml']), hostname)
            sitemaps.append(sitemap.name)
            sitemap.create_sitemap(node_dict[key], priority_level)
    siteindex = SitemapIndex(base_dir, 'sitemap-index.xml', hostname)
    # lastmod timestamp with a fixed +02:00 offset -- assumes CEST local time
    now = '+'.join([datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), '02:00'])
    siteindex.create_sitemap_index(sitemaps, now)
    logging.getLogger('everything').info('Generation of Sitemaps and SitemapIndex Complete')
def retrieveNodes(req, setspec, date_from=None, date_to=None, metadataformat=None):
    """Build the node result set for OAI list requests.

    Selects nodes by OAI set (`setspec`), metadata format schemata and the
    optional date range, restricted to what the guest user may read.
    Returns a query (the usual case) or a plain node list when no query
    could be built.
    """
    schemata = []
    nodequery = None
    res = []
    # determine the schemata relevant for the requested metadata format
    if metadataformat == 'mediatum':
        metadatatypes = q(Metadatatypes).one().children
        schemata = [m.name for m in metadatatypes
                    if m.type == 'metadatatype' and m.name not in ['directory', 'collection']]
    elif metadataformat:
        schemata = get_schemata_for_metadataformat(metadataformat)
    if DEBUG:
        timetable_update(req, "in retrieveNodes: find schemata with export mask for metadata type %s (%d found: '%s')" %
                         (metadataformat.lower(), len(schemata), ustr([x for x in schemata])))
    if setspec:
        nodequery = oaisets.getNodesQueryForSetSpec(setspec, schemata)
        # if for this oai group set no function is defined that retrieve the nodes query, use the filters
        if not nodequery:
            collections_root = q(Collections).one()
            nodequery = collections_root.all_children
            setspecFilter = oaisets.getNodesFilterForSetSpec(setspec, schemata)
            if schemata:
                nodequery = nodequery.filter(Node.schema.in_(schemata))
            if type(setspecFilter) == list:
                for sFilter in setspecFilter:
                    nodequery = nodequery.filter(sFilter)
            else:
                nodequery = nodequery.filter(setspecFilter)
    else:
        # no set requested: all collection children matching the schemata
        collections_root = q(Collections).one()
        nodequery = collections_root.all_children
        nodequery = nodequery.filter(Node.schema.in_(schemata))
    if DEBUG:
        timetable_update(req, "in retrieveNodes: after building NodeList for %d nodes" % (len(res)))
    # optional date range filter on the configured date attribute
    if date_from:
        nodequery = nodequery.filter(Node.attrs[DATEFIELD].astext >= str(date_from))
        if DEBUG:
            timetable_update(req, "in retrieveNodes: after filtering date_from --> %d nodes" % (len(res)))
    if date_to:
        nodequery = nodequery.filter(Node.attrs[DATEFIELD].astext <= str(date_to))
        if DEBUG:
            timetable_update(req, "in retrieveNodes: after filtering date_to --> %d nodes" % (len(res)))
    # OAI output is public: restrict to guest-readable nodes
    if nodequery:
        guest_user = get_guest_user()
        nodequery = nodequery.filter_read_access(user=guest_user)
    else:
        # NOTE(review): res is still [] here, so these list fallbacks are
        # effectively no-ops unless nodequery could not be built -- confirm
        res = [n for n in res if n.has_read_access(user=get_guest_user())]
    if DEBUG:
        timetable_update(req, "in retrieveNodes: after read access filter --> %d nodes" % (len(res)))
    if not nodequery:
        collections = q(Collections).one()
        res = [n for n in res if isDescendantOf(n, collections)]
        if DEBUG:
            timetable_update(req, "in retrieveNodes: after checking descendance from basenode --> %d nodes" % (len(res)))
    # superflous ?!
    #if schemata:
    #    res = [n for n in res if n.getSchema() in schemata]
    #    if DEBUG:
    #        timetable_update(req, "in retrieveNodes: after schemata (%s) filter --> %d nodes" % (ustr(schemata), len(res)))
    # apply the per-format filter query when one is registered
    if metadataformat and metadataformat.lower() in FORMAT_FILTERS.keys():
        format_string = metadataformat.lower()
        format_filter = FORMAT_FILTERS[format_string]['filterQuery']
        nodequery = nodequery.filter(format_filter)
        #res = [n for n in res if filterFormat(n, format_string)]
        if DEBUG:
            timetable_update(req, "in retrieveNodes: after format (%s) filter --> %d nodes" % (format_string, len(res)))
    if nodequery:
        res = nodequery
    return res