def get_month(date, ln=CFG_SITE_LANG, default=""):
    """
    Returns the month from a textual date retrieved from a record

    The returned value is the short month name in language 'ln', as
    produced by get_i18n_month_name(month_nb, "short", ln).
    If the month cannot be found, returns 'default'

    The date is scanned in three passes, stopping at the first hit:
      1. a month name as returned by get_i18n_month_name() called with
         its default display and language (presumably the short names
         in the site language - confirm against invenio.dateutils).
         Names are searched as substrings, so "Jan", "sep", "November"
         or "novem" are all found,
      2. a numeric month, taken as the second group of 1 or 2 digits in
         dates such as "2004/03/08" or "17 02 2004",
      3. a short month name in any language of language_list_long().

    @param date the textual date to retrieve the month from
    @param ln the language in which the month name is returned
    @param default a default value to return if month not found
    @return the short month name in language 'ln', or 'default'
    """
    import re
    from invenio.dateutils import get_i18n_month_name
    from invenio.messages import language_list_long

    # Nothing to parse (None or empty string)
    if not date:
        return default

    def month_from_names(names):
        """Return, in language 'ln', the month whose lowercased name
        from 'names' (January first) occurs in 'date', or None."""
        # Escape the names so that characters such as "." in an
        # abbreviation are matched literally: (jan|feb|mar|...)
        pattern = re.compile("|".join([re.escape(name) for name in names]),
                             re.IGNORECASE)
        found = pattern.search(date)
        if found is None:
            return None
        try:
            month_nb = names.index(found.group().lower()) + 1
            return get_i18n_month_name(month_nb, "short", ln)
        except Exception:
            # Best effort: let the caller try the next strategy
            return None

    # Look for textual month like "Jan" or "sep" or "November" or "novem"
    # Limit to the default language first (most probable date)
    default_names = [get_i18n_month_name(month).lower()
                     for month in range(1, 13)]  # ["jan","feb","mar",...]
    month_name = month_from_names(default_names)
    if month_name is not None:
        return month_name

    # Look for month specified as number in the form 2004/03/08 or 17 02 2004
    # (always take second group of 2 or 1 digits separated by spaces or - etc.)
    month_pattern = re.compile(r"\d[\s\-/.,]+(?P<month>\d{1,2})[\s\-/.,]")
    found = month_pattern.search(date)
    if found is not None:
        month_nb = int(found.group("month"))
        # Values such as 0 or 13 are not months: ignore them
        if 1 <= month_nb <= 12:
            try:
                return get_i18n_month_name(month_nb, "short", ln)
            except Exception:
                pass

    # Look for textual month like "Jan" or "sep" or "November" or "novem"
    # Look for the month in each language ('en', 'fr', 'de', ...)
    for lang in [x[0] for x in language_list_long()]:
        lang_names = [get_i18n_month_name(month, "short", lang).lower()
                      for month in range(1, 13)]  # ["jan","feb","mar",...]
        month_name = month_from_names(lang_names)
        if month_name is not None:
            return month_name

    return default
def test_lang_list_long_ordering(self):
    """messages - preserving language order"""
    languages = messages.language_list_long()

    # Both lists must have the same length before comparing them pairwise
    self.assertEqual(len(languages), len(CFG_SITE_LANGS))

    # The short code of every entry must sit at the same position as in
    # CFG_SITE_LANGS
    for position, expected_code in enumerate(CFG_SITE_LANGS):
        self.assertEqual(languages[position][0], expected_code)
def get_locale_value():
    """Returns all the available languages

    The result is the HTML source of a <select name="sm_locale"> element:
    English comes first as the default choice, followed by one <option>
    per other language returned by language_list_long(True).
    """
    pieces = ['''<select name="sm_locale">''',
              '''<option value='en'>English (default)</option>''']
    for entry in language_list_long(True):
        if entry[0] == 'en':
            # we already added English as default
            continue
        pieces.append(
            '''<option value='%(lang_short)s'>%(lang_long)s</option>'''
            % {'lang_short': entry[0], 'lang_long': entry[1]})
    pieces.append('''</select>''')
    return ''.join(pieces)
def get_locale_value():
    """Returns all the available languages

    Builds the HTML of a <select name="sm_locale"> element whose first
    option is English (the default); every other language returned by
    language_list_long(True) follows in the order it is returned.
    """
    # English is emitted separately as the default, so skip it here
    other_options = "".join(
        [
            """<option value='%(lang_short)s'>%(lang_long)s</option>"""
            % {"lang_short": lang[0], "lang_long": lang[1]}
            for lang in language_list_long(True)
            if lang[0] != "en"
        ]
    )
    return (
        """<select name="sm_locale">"""
        + """<option value='en'>English (default)</option>"""
        + other_options
        + """</select>"""
    )
def update_webpage_cache(self):
    """Create collection page header, navtrail, body (including left
    and right stripes) and footer, and call write_cache_file()
    afterwards to update the collection webpage cache.

    One set of cache files is written for every language returned by
    language_list_long() that is listed in the task option "language"
    (the option falls back to the current language, so nothing is
    skipped when it is not set).
    """
    ## precalculate latest additions for non-aggregate
    ## collections (the info is ln and as independent)
    if self.dbquery and not CFG_WEBSEARCH_I18N_LATEST_ADDITIONS:
        self.create_latest_additions_info()

    # cache file name pattern and portalbox position, in writing order
    portalbox_files = (
        ("portalbox-tp-ln=%s", "tp"),
        ("portalbox-te-ln=%s", "te"),
        ("portalbox-lt-ln=%s", "lt"),
        ("portalbox-rt-ln=%s", "rt"),
    )

    ## do this for each language:
    for lang, dummy_fullname in language_list_long():
        # skip languages other than the concrete one(s) chosen, if any:
        if lang not in task_get_option("language", [lang]):
            continue

        if self.dbquery and CFG_WEBSEARCH_I18N_LATEST_ADDITIONS:
            self.create_latest_additions_info(ln=lang)

        # load the right message language
        _ = gettext_set_language(lang)

        ## first, update navtrail:
        for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES:
            navtrail = self.create_navtrail_links(aas, lang)
            self.write_cache_file("navtrail-as=%s-ln=%s" % (aas, lang),
                                  navtrail)

        ## second, update page body:
        for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES:
            # do light, simple and advanced search pages; the parts are
            # created in the same order as the template arguments
            te_box = self.create_portalbox(lang, 'te')
            search_box = self.create_searchfor(aas, lang)
            np_box = self.create_portalbox(lang, 'np')
            narrow_box = self.create_narrowsearch(aas, lang, 'r')
            focus_box = self.create_narrowsearch(aas, lang, "v") + \
                        self.create_external_collections_box(lang)
            browse_box = self.create_instant_browse(aas=aas, ln=lang)
            ne_box = self.create_portalbox(lang, 'ne')
            body = websearch_templates.tmpl_webcoll_body(
                ln=lang,
                collection=self.name,
                te_portalbox=te_box,
                searchfor=search_box,
                np_portalbox=np_box,
                narrowsearch=narrow_box,
                focuson=focus_box,
                instantbrowse=browse_box,
                ne_portalbox=ne_box)
            self.write_cache_file("body-as=%s-ln=%s" % (aas, lang), body)

        ## third, write portalboxes:
        for name_pattern, position in portalbox_files:
            self.write_cache_file(name_pattern % lang,
                                  self.create_portalbox(lang, position))

        ## fourth, write 'last updated' information:
        last_updated = convert_datestruct_to_dategui(time.localtime(),
                                                     ln=lang)
        self.write_cache_file("last-updated-ln=%s" % lang, last_updated)
def tmpl_language_selection_box(self, req, language=CFG_SITE_LANG):
    """Take URLARGS and LANGUAGE and return textual language
       selection box for the given page.

       Parameters:

      - 'req' - The mod_python request object

      - 'language' *string* - The selected language

       Returns the HTML of the box, or an empty string when there are
       not at least two languages to choose from.
    """
    # load the right message language
    _ = gettext_set_language(language)

    # Collect the query arguments in a fresh dictionary in order not
    # to bork the arguments of the caller
    query_args = {}
    if req and req.args:
        query_args.update(cgi.parse_qs(req.args))

    entries = []
    for (code, long_name) in language_list_long():
        if code == language:
            # The current language is shown as plain text, not as a link
            entries.append('<span class="langinfo">%s</span>' % long_name)
            continue

        # Update the 'ln' argument in the initial request
        query_args['ln'] = code
        if req and req.uri:
            target = urllib.quote(req.uri, '/:?') + \
                     make_canonical_urlargd(query_args, {})
        else:
            target = ""
        entries.append(create_html_link(target, {}, long_name,
                                        {'class': "langinfo"}))

    if len(entries) <= 1:
        ## There is only one (or zero?) language configured,
        ## so there is no need to display language alternatives.
        return ""

    return _("This site is also available in the following languages:") + \
           "<br />" + ' '.join(entries)
def tmpl_language_selection_box(self, req, language=CFG_SITE_LANG):
    """Take URLARGS and LANGUAGE and return textual language
       selection box for the given page.

       Parameters:

      - 'req' - The mod_python request object

      - 'language' *string* - The selected language

       The box is only produced when more than one language is
       available; otherwise an empty string is returned.
    """
    # load the right message language
    _ = gettext_set_language(language)

    # Work on a copy in order not to bork the arguments of the caller
    argd = {}
    if req and req.args:
        argd.update(cgi.parse_qs(req.args))

    def link_to(lang, label):
        """Return a link to the current page with 'ln' set to 'lang'."""
        # Update the 'ln' argument in the initial request
        argd['ln'] = lang
        url = ""
        if req and req.uri:
            url = urllib.quote(req.uri, '/:?') + \
                  make_canonical_urlargd(argd, {})
        return create_html_link(url, {}, label, {'class': "langinfo"})

    choices = []
    for (lang, lang_namelong) in language_list_long():
        if lang != language:
            choices.append(link_to(lang, lang_namelong))
        else:
            # The selected language is not a link
            choices.append('<span class="langinfo">%s</span>' % lang_namelong)

    if len(choices) > 1:
        heading = _("This site is also available in the following languages:")
        return heading + "<br />" + ' '.join(choices)

    ## There is only one (or zero?) language configured,
    ## so there is no need to display language alternatives.
    return ""
def create_invenio_flask_app(**kwargs_config):
    """
    Prepare WSGI Invenio application based on Flask.

    Invenio consists of a new Flask application with legacy support for
    the old WSGI legacy application and the old Python legacy
    scripts (URLs to *.py files).

    An incoming request is processed in the following manner:

     * The Flask application first routes request via its URL routing
       system (see LegacyAppMiddleware.__call__()).

     * One route in the Flask system, will match Python legacy
       scripts (see static_handler_with_legacy_publisher()).

     * If the Flask application aborts the request with a 404 error, the
       request is passed on to the WSGI legacy application (see
       page_not_found()). E.g. either the Flask application did not find
       a route, or a view aborted the request with a 404 error.

    @param kwargs_config: configuration values copied into the Flask
        application config right after the application is created
    @return: the configured Flask application
    @raise Exception: when no secret key is configured (neither
        'SECRET_KEY' in the application config nor CFG_SITE_SECRET_KEY)
    """
    def decorate_build(f):
        """Wrap the URL adapter 'build' method 'f' so that external URLs
        are prefixed with CFG_SITE_URL or CFG_SITE_SECURE_URL, depending
        on the adapter's URL scheme."""
        @wraps(f)
        def decorator(*args, **kwargs):
            scheme_url = {
                'http': current_app.config['CFG_SITE_URL'],
                'https': current_app.config['CFG_SITE_SECURE_URL']
            }
            force_external = kwargs.get('force_external', True)
            url_scheme = getattr(f.im_self, 'url_scheme', 'http')
            # Always build a relative URL; the site URL is prepended below
            kwargs['force_external'] = False
            url = f(*args, **kwargs)
            if force_external:
                url = scheme_url.get(url_scheme) + url
            return url
        return decorator

    class InvenioFlask(Flask):
        def create_url_adapter(self, request):
            url_adapter = super(InvenioFlask, self).create_url_adapter(request)
            if url_adapter is not None and hasattr(url_adapter, 'build'):
                url_adapter.build = decorate_build(url_adapter.build)
            return url_adapter

    ## The Flask application instance
    _app = InvenioFlask(
        __name__,
        ## Static files are usually handled directly by the webserver (e.g. Apache)
        ## However in case WSGI is required to handle static files too (such
        ## as when running simple server), then this flag can be
        ## turned on (it is done automatically by wsgi_handler_test).
        ## We assume anything under '/' which is static to be server directly
        ## by the webserver from CFG_WEBDIR. In order to generate independent
        ## url for static files use func:`url_for('static', filename='test')`.
        static_url_path='',
        static_folder=CFG_WEBDIR)

    ## Update application config from parameters.
    _app.config.update(kwargs_config)

    if 'SQLALCHEMY_DATABASE_URI' not in _app.config:
        from sqlalchemy.engine.url import URL
        # Global variables
        from invenio.dbquery import CFG_DATABASE_HOST, CFG_DATABASE_PORT,\
            CFG_DATABASE_NAME, CFG_DATABASE_USER, CFG_DATABASE_PASS, \
            CFG_DATABASE_TYPE

        _app.config['SQLALCHEMY_DATABASE_URI'] = URL(
            CFG_DATABASE_TYPE,
            username=CFG_DATABASE_USER,
            password=CFG_DATABASE_PASS,
            host=CFG_DATABASE_HOST,
            database=CFG_DATABASE_NAME,
            port=CFG_DATABASE_PORT,
        )

    ## Let's initialize database.
    from invenio.sqlalchemyutils import db
    db.init_app(_app)

    ## First check that you have all rights to logs
    from invenio.bibtask import check_running_process_user
    check_running_process_user()

    from invenio.pluginutils import PluginContainer
    from invenio.session_flask import InvenioSessionInterface
    from invenio.webuser_flask import InvenioLoginManager, current_user, UserInfo
    from invenio.messages import wash_language, gettext_set_language, \
        language_list_long, is_language_rtl
    from invenio.urlutils import create_url, get_canonical_and_alternates_urls
    from invenio.cache import cache
    from invenio.jinja2utils import CollectionExtension, \
        LangExtension, hack_jinja2_utf8decoding, \
        extend_application_template_filters
    from flask.ext.assets import Environment, Bundle
    from invenio.webinterface_handler_flask_utils import unicodifier, InvenioRequest
    from flaskext.gravatar import Gravatar
    from werkzeug.wrappers import BaseResponse
    from werkzeug.exceptions import HTTPException
    from invenio.flask_sslify import SSLify
    from invenio.webinterface_handler_wsgi import application as legacy_application
    from invenio.webinterface_handler_wsgi import is_mp_legacy_publisher_path, \
        mp_legacy_publisher

    # See note on Jinja2 string decoding using ASCII codec instead of UTF8 in
    # function documentation
    hack_jinja2_utf8decoding()

    # Handle both url with and without trailing slashe by Flask.
    # @blueprint.route('/test')
    # @blueprint.route('/test/') -> not necessary when strict_slashes == False
    _app.url_map.strict_slashes = False

    # SECRET_KEY is needed by Flask Debug Toolbar
    SECRET_KEY = _app.config.get('SECRET_KEY') or CFG_SITE_SECRET_KEY
    if not SECRET_KEY:
        fill_secret_key = """
    Set variable CFG_SITE_SECRET_KEY with random string in invenio-local.conf.

    You can use following commands:
    $ %s
    $ %s
        """ % (CFG_BINDIR + os.sep + 'inveniocfg --create-secret-key',
               CFG_BINDIR + os.sep + 'inveniocfg --update-config-py')
        # Raise once inside a try block so that register_exception() is
        # called while an exception is being handled, then raise again
        # to abort the application creation.
        try:
            raise Exception(fill_secret_key)
        except Exception:
            register_exception(alert_admin=True,
                               subject="Missing CFG_SITE_SECRET_KEY")
            raise Exception(fill_secret_key)

    _app.config["SECRET_KEY"] = SECRET_KEY

    # Enable Flask Debug Toolbar early to also catch HTTPS redirects
    if 'debug-toolbar' in getattr(config, 'CFG_DEVEL_TOOLS', []):
        _app.config["DEBUG_TB_ENABLED"] = True
        _app.config["DEBUG_TB_INTERCEPT_REDIRECTS"] = \
            'intercept-redirects' in getattr(config, 'CFG_DEVEL_TOOLS', [])
        from flask_debugtoolbar import DebugToolbarExtension

        class InvenioDebugToolbarExtension(DebugToolbarExtension):
            def _show_toolbar(self):
                user_info = UserInfo(session.get('user_id'))
                # Enable debug toolbar only for super admin.
                if not user_info.is_super_admin:
                    return False
                return super(InvenioDebugToolbarExtension,
                             self)._show_toolbar()

        InvenioDebugToolbarExtension(_app)

    # Set email backend for Flask-Email plugin
    from invenio.mailutils import initialize_email_backend
    initialize_email_backend(_app)

    if CFG_HAS_HTTPS_SUPPORT:
        # Makes request always run over HTTPS.
        _sslify = SSLify(_app)

        if not CFG_FULL_HTTPS:
            @_sslify.criteria_handler
            def criteria():
                """Extends criteria when to stay on HTTP site."""
                _force_https = False
                if request.blueprint in current_app.blueprints:
                    _force_https = current_app.blueprints[request.blueprint].\
                        _force_https

                # A setting on the view function overrides the blueprint's
                view_func = current_app.view_functions.get(request.endpoint)
                if view_func is not None and hasattr(view_func, '_force_https'):
                    _force_https = view_func._force_https

                return not (_force_https or session.need_https())

    class LegacyAppMiddleware(object):
        def __init__(self, app):
            self.app = app

        def __call__(self, environ, start_response):
            if remote_debugger:
                remote_debugger.start()

            with self.app.request_context(environ):
                # Kept on 'g' so that the 404 handler and the legacy
                # publisher route can hand it to the legacy application
                g.start_response = start_response
                try:
                    response = self.app.full_dispatch_request()
                except Exception as e:
                    register_exception(req=request, alert_admin=True)
                    response = self.app.handle_exception(e)

                return response(environ, start_response)

    _app.wsgi_app = LegacyAppMiddleware(_app)

    @_app.errorhandler(404)
    def page_not_found(error):
        try:
            response = legacy_application(request.environ, g.start_response)
            if not isinstance(response, BaseResponse):
                response = current_app.make_response(str(response))
            return response
        except HTTPException:
            return render_template("404.html"), 404

    @_app.errorhandler(401)
    def do_login_first(error=401):
        """Displays login page when user is not authorised."""
        # NOTE: '_' is bound further down in this function (import from
        # invenio.webinterface_handler_flask_utils); it is available by
        # the time a request reaches this handler.
        if request.is_xhr:
            return _("Authorization failure"), 401
        flash(_("Authorization failure"), 'error')
        from invenio.webaccount_blueprint import login
        return login(referer=request.url), 401

    @_app.endpoint('static')
    @_app.route(_app.static_url_path + '/<path:filename>', methods=['POST', 'PUT'])
    def static_handler_with_legacy_publisher(*args, **kwargs):
        """
        Adds support for legacy publisher.

        NOTE: It changes order of url page lookup. First, the invenio_handler
        will be called and on 404 error the mp_legacy_publisher is called.
        """
        possible_module, possible_handler = is_mp_legacy_publisher_path(
            request.environ['PATH_INFO'])
        if possible_module is not None:
            legacy_publisher = lambda req: \
                mp_legacy_publisher(req, possible_module, possible_handler)
            return legacy_application(request.environ, g.start_response,
                                      handler=legacy_publisher)

        # Static file serving for devserver
        # ---------------------------------
        # Apache normally serve all static files, but if we are using the
        # devserver we need to serve static files here. Werkzeugs default
        # behaviour is to return a '405 Method not allowed' for POST requests
        # to static files. However, if we abort all POST requests with 405, the
        # legacy_application (see page_not_found()) will not be given a chance
        # to serve static files as it only get's invokved when we abort with a
        # 404. Hence, on POST requests, we first check if the static file exists,
        # and if it does we return we abort the request with a 405.
        if not CFG_FLASK_SERVE_STATIC_FILES:
            abort(404)
        else:
            static_file_response = _app.send_static_file(*args, **kwargs)
            if request.method in ['POST', 'PUT']:
                abort(405)
            else:
                return static_file_response

    if CFG_FLASK_CACHE_TYPE not in [None, 'null']:
        _app.jinja_options = dict(
            _app.jinja_options,
            auto_reload=False,
            cache_size=-1,
            bytecode_cache=MemcachedBytecodeCache(
                cache, prefix="jinja::", timeout=3600))

    ## Let's customize the template loader to first look into
    ## /opt/invenio/etc-local/templates and then into
    ## /opt/invenio/etc/templates
    _app.jinja_loader = FileSystemLoader([
        join(CFG_ETCDIR + '-local', 'templates'),
        join(CFG_ETCDIR, 'templates')
    ])

    ## Let's attach our session handling (which is bridging with the native
    ## Invenio session handling
    _app.session_interface = InvenioSessionInterface()

    ## Set custom request class
    _app.request_class = InvenioRequest

    ## Let's load the whole invenio.config into Flask :-) ...
    _app.config.from_object(config)

    ## ... and map certain common parameters
    _app.config['SESSION_COOKIE_NAME'] = CFG_WEBSESSION_COOKIE_NAME
    _app.config['PERMANENT_SESSION_LIFETIME'] = \
        CFG_WEBSESSION_EXPIRY_LIMIT_REMEMBER * CFG_WEBSESSION_ONE_DAY
    _app.config['USE_X_SENDFILE'] = CFG_BIBDOCFILE_USE_XSENDFILE
    _app.config['DEBUG'] = CFG_DEVEL_SITE > 0
    _app.debug = CFG_DEVEL_SITE > 0

    # Use a distinct name for the result: rebinding 'language_list_long'
    # would hide the function imported above.
    all_languages = language_list_long()
    labordoc_languages = [
        entry for entry in all_languages if entry[0] in ['en', 'es', 'fr']
    ]
    _app.config['CFG_LANGUAGE_LIST_LONG'] = [
        (lang, longname.decode('utf-8'))
        for (lang, longname) in all_languages
    ]
    _app.config['CFG_LANGUAGE_LABORDOC_LIST'] = [
        (lang, longname.decode('utf-8'))
        for (lang, longname) in labordoc_languages
    ]

    ## Invenio is all using str objects. Let's change them to unicode
    _app.config.update(unicodifier(dict(_app.config)))

    ## Cache
    _app.config['CACHE_TYPE'] = CFG_FLASK_CACHE_TYPE
    # FIXME problem in Flask-Cache==0.11.1
    cache.app = _app
    cache.init_app(_app)
    if CFG_FLASK_CACHE_TYPE == 'redis':
        def with_try_except_block(f):
            """Wrap 'f' so that any exception is registered and then
            dropped; the wrapper returns None in that case."""
            def decorator(*args, **kwargs):
                try:
                    return f(*args, **kwargs)
                except Exception:
                    register_exception(alert_admin=True)
            return decorator

        ## When the redis is down, we would like to keep the site running.
        cache.cache._client.execute_command = with_try_except_block(
            cache.cache._client.execute_command)

    # FIXME problem in Flask-Cache==0.11.1
    cache.app = current_app

    _flask_log_handler = RotatingFileHandler(
        os.path.join(CFG_LOGDIR, 'flask.log'))
    _flask_log_handler.setFormatter(
        Formatter('%(asctime)s %(levelname)s: %(message)s '
                  '[in %(pathname)s:%(lineno)d]'))
    _app.logger.addHandler(_flask_log_handler)

    # Let's create login manager.
    _login_manager = InvenioLoginManager(_app)
    _login_manager.login_view = 'webaccount.login'
    _login_manager.anonymous_user = UserInfo
    _login_manager.unauthorized_handler(do_login_first)

    # Let's create main menu.
    class Menu(object):
        def __init__(self, id='', title='', url='', order=None,
                     children=None, display=lambda: True):
            self.id = id
            self.title = title
            self.url = url
            # Any falsy 'children' (None, [] ...) becomes a fresh dict
            self.children = children or {}
            self.order = order or 100
            self.display = display

    # Let's create assets environment.
    _assets = Environment(_app)
    _assets.debug = 'assets-debug' in getattr(config, 'CFG_DEVEL_TOOLS', [])
    _assets.directory = config.CFG_WEBDIR

    def _jinja2_new_bundle(tag, collection, name=None):
        if not _assets.debug:
            files = [
                f for f in collection
                if os.path.isfile(os.path.join(_assets.directory, f))
            ]
            if len(files) != len(collection):
                ## Turn on debuging to generate 404 request on missing files.
                _assets.debug = True
                current_app.logger.error(
                    'Missing files: ' + ','.join(set(collection) - set(files)))

        if len(collection):
            return Bundle(output="%s/%s-%s.%s" %
                          (tag, 'invenio' if name is None else name,
                           hash('|'.join(collection)), tag),
                          *collection)

    _app.jinja_env.extend(new_bundle=_jinja2_new_bundle,
                          default_bundle_name='90-invenio')

    _app.jinja_env.add_extension(CollectionExtension)
    _app.jinja_env.add_extension(LangExtension)
    _app.jinja_env.add_extension('jinja2.ext.do')

    # Let's extend application with custom template filters.
    extend_application_template_filters(_app)

    # Let's create Gravatar bridge.
    _gravatar = Gravatar(_app,
                         size=100,
                         rating='g',
                         default='retro',
                         force_default=False,
                         force_lower=False)
    del _gravatar

    # Let's set the user language
    from invenio.webinterface_handler_flask_utils import guess_language
    _app.before_request(guess_language)

    # Let's extend application with more custom templete filters
    from invenio.jinja2utils import inject_utils
    _app.context_processor(inject_utils)

    @_login_manager.user_loader
    def _load_user(uid):
        """
        Function should not raise an exception if uid is not valid
        or User was not found in database.
        """
        return UserInfo(int(uid))

    @_app.before_request
    def reset_template_context_processor():
        g._template_context_processor = []

    @_app.context_processor
    def _inject_template_context():
        context = {}
        if not hasattr(g, '_template_context_processor'):
            reset_template_context_processor()
        for func in g._template_context_processor:
            context.update(func())
        return context

    def _invenio_blueprint_plugin_builder(plugin):
        """
        Handy function to bridge pluginutils with (Invenio) blueprints.

        Returns the 'blueprint' attribute of the module 'plugin' when it
        is an InvenioBlueprint; returns None when the module is disabled
        by CFG_FLASK_DISABLED_BLUEPRINTS or holds no valid blueprint.
        """
        plugin_name = plugin.__name__
        if plugin_name in CFG_FLASK_DISABLED_BLUEPRINTS or \
           plugin_name.split('.')[-1] in CFG_FLASK_DISABLED_BLUEPRINTS:
            _app.logger.info(
                '%s is excluded by CFG_FLASK_DISABLED_BLUEPRINTS' % plugin_name)
            return
        from invenio.webinterface_handler_flask_utils import InvenioBlueprint
        if 'blueprint' in dir(plugin):
            candidate = getattr(plugin, 'blueprint')
            if isinstance(candidate, InvenioBlueprint):
                return candidate
        _app.logger.error('%s is not a valid blueprint plugin' % plugin_name)

    ## Let's load all the blueprints that are composing this Invenio instance
    _BLUEPRINTS = [
        m for m in map(
            _invenio_blueprint_plugin_builder,
            autodiscover_modules(['invenio'],
                                 related_name_re='.+_blueprint.py',
                                 ignore_exceptions=True))
        if m is not None
    ]

    _app.config['breadcrumbs_map'] = {}
    _app.config['menubuilder_map'] = {}

    ## Let's attach all the blueprints
    from invenio.webinterface_handler_flask_utils import _
    for plugin in _BLUEPRINTS:
        _app.register_blueprint(plugin)
        if plugin.config:
            ## Let's include the configuration parameters of the config file.
            ## E.g. if the blueprint specify the config string
            ## 'invenio.webmessage_config' any uppercase variable defined in
            ## the module invenio.webmessage_config is loaded into the system.
            _app.config.from_object(plugin.config)
        if plugin.breadcrumbs:
            _app.config['breadcrumbs_map'][plugin.name] = plugin.breadcrumbs
        _app.config['breadcrumbs_map'].update(plugin.breadcrumbs_map)

        ## Let's build global menu. Each blueprint can plug its own menu items.
        if plugin.menubuilder:
            _app.config['menubuilder_map'].update(
                (m[0], Menu(*m)) for m in plugin.menubuilder)
        _app.config['menubuilder_map'].update(plugin.menubuilder_map)

    _app.config['menubuilder_map'].update({
        'main.admin': Menu('main.admin', _('Administration'),
                           'help.admin', 9998, [],
                           lambda: current_user.is_admin if current_user else False),
        'main.help': Menu('main.help', _('Help'), 'help', 9999)
    })

    # Turn the flat map of dotted keys ('main.admin', ...) into a tree:
    # every key component but the last names a parent menu, the last one
    # the place where the item is stored among the parent's children.
    menu = {
        'main': Menu('main', '', ''),
        'personalize': Menu('personalize', '', '')
    }
    for key, item in _app.config['menubuilder_map'].iteritems():
        start = menu
        if '.' not in key:
            if key in menu:
                # Keep the children collected so far; dict.update()
                # returns None, so its result must not be stored.
                item.children.update(menu[key].children)
            menu[key] = item
            continue

        keys = key.split('.')
        for k in keys[:-1]:
            try:
                start = start[k].children
            except KeyError:
                # Parent not seen yet: create a placeholder menu
                start[k] = Menu()
                start = start[k].children

        if keys[-1] in start:
            item.children.update(start[keys[-1]].children)
        start[keys[-1]] = item

    _app.config['menubuilder_map'] = menu

    # Flask-Admin
    from invenio.adminutils import register_admin
    register_admin(_app)

    try:
        ## When deploying Invenio, one can prepare a module called
        ## webinterface_handler_local.py to be deployed under
        ## CFG_PYLIBDIR/invenio directory, and containing a function called
        ## customize_app which should accept a Flask application.
        ## This function has a chance to modify the application as needed
        ## including changing the URL routing map.
        # pylint: disable=E0611
        from invenio.webinterface_handler_local import customize_app
        # pylint: enable=E0611
        customize_app(_app)
    except ImportError:
        ## No customization needed.
        pass

    return _app
def generate_sitemaps(collection_names, fulltext_filter=''):
    """
    Generate sitemaps themselves. Return list of generated sitemaps files

    URLs are written in this order: the home page in every language of
    language_list_long(), the public records, the public collections in
    every language, then the fulltext, comments and reviews pages of the
    records. A new sitemap file is started when, at a multiple of 100
    URLs, the current file is larger than MAX_SIZE or holds MAX_RECORDS
    URLs.

    @param collection_names: passed to get_all_public_records() and
        get_all_public_collections()
    @param fulltext_filter: passed to filter_fulltexts()
    @return: the names (SitemapWriter.get_name()) of all sitemap files
        started, in creation order
    """
    sitemaps = []

    def open_writer(number):
        """Start sitemap file number 'number' and record its name."""
        new_writer = SitemapWriter(CFG_WEBDIR + '/sitemap-%s.xml' % number)
        sitemaps.append(new_writer.get_name())
        return new_writer

    def remaining_urls(recids):
        """Yield (url, lastmod, changefreq, priority) for every URL that
        follows the home pages, reporting progress between sections."""
        for (recid, lastmod) in recids:
            yield (CFG_SITE_URL + '/record/%s' % recid, lastmod,
                   DEFAULT_CHANGEFREQ_RECORDS, DEFAULT_PRIORITY_RECORDS)

        task_update_progress("Generating urls for collections")
        for (collection, lastmod) in get_all_public_collections(collection_names):
            for (lang, dummy_name) in language_list_long():
                yield ('%s/collection/%s?ln=%s' % (CFG_SITE_URL,
                                                   quote(collection), lang),
                       lastmod,
                       DEFAULT_CHANGEFREQ_COLLECTIONS,
                       DEFAULT_PRIORITY_COLLECTIONS)

        task_update_progress("Generating urls for fulltexts")
        for (recid, lastmod) in filter_fulltexts(recids, fulltext_filter):
            yield (CFG_SITE_URL + '/record/%s/files' % recid, lastmod,
                   DEFAULT_CHANGEFREQ_FULLTEXTS, DEFAULT_PRIORITY_FULLTEXTS)

        task_update_progress("Generating urls for comments")
        for (recid, lastmod) in filter_comments(recids):
            yield (CFG_SITE_URL + '/record/%s/comments' % recid, lastmod,
                   DEFAULT_CHANGEFREQ_COMMENTS, DEFAULT_PRIORITY_COMMENTS)

        task_update_progress("Generating urls for reviews")
        for (recid, lastmod) in filter_reviews(recids):
            yield (CFG_SITE_URL + '/record/%s/reviews' % recid, lastmod,
                   DEFAULT_CHANGEFREQ_REVIEWS, DEFAULT_PRIORITY_REVIEWS)

    sitemap_id = 1
    writer = open_writer(sitemap_id)
    nb_urls = 0

    # Home page, once per language
    for (lang, dummy_name) in language_list_long():
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME)
        nb_urls += 1

    recids = get_all_public_records(collection_names)
    task_update_progress("Generating urls for %s records" % len(recids))

    for (url, lastmod, changefreq, priority) in remaining_urls(recids):
        # The size of the current file is only looked at every 100 URLs
        if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
            if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
                writer.close()
                sitemap_id += 1
                writer = open_writer(sitemap_id)
        # add_url() returns the URL count that drives the rotation above
        nb_urls = writer.add_url(url,
                                 lastmod=lastmod,
                                 changefreq=changefreq,
                                 priority=priority)

    try:
        writer.close()
    except Exception:
        # Best effort: a failure to close the last file must not hide
        # the list of sitemaps already generated
        pass

    return sitemaps
def get_languages():
    """Return the (short name, long name) pairs of language_list_long()
    as a list of tuples sorted in ascending order."""
    return sorted([(code, long_name)
                   for (code, long_name) in language_list_long()])
def get_month(date, ln=CFG_SITE_LANG, default=""):
    """
    Returns the month from a textual date retrieved from a record

    The returned value is the short month name in language 'ln', as
    produced by get_i18n_month_name(month_nb, "short", ln).
    If the month cannot be found, returns 'default'

    The date is scanned in three passes, stopping at the first hit:
      1. a month name as returned by get_i18n_month_name() called with
         its default display and language (presumably the short names
         in the site language - confirm against invenio.dateutils).
         Names are searched as substrings, so "Jan", "sep", "November"
         or "novem" are all found,
      2. a numeric month, taken as the second group of 1 or 2 digits in
         dates such as "2004/03/08" or "17 02 2004",
      3. a short month name in any language of language_list_long().

    @param date the textual date to retrieve the month from
    @param ln the language in which the month name is returned
    @param default a default value to return if month not found
    @return the short month name in language 'ln', or 'default'
    """
    import re
    from invenio.dateutils import get_i18n_month_name
    from invenio.messages import language_list_long

    # Nothing to parse (None or empty string)
    if not date:
        return default

    def month_from_names(names):
        """Return, in language 'ln', the month whose lowercased name
        from 'names' (January first) occurs in 'date', or None."""
        # Escape the names so that characters such as "." in an
        # abbreviation are matched literally: (jan|feb|mar|...)
        pattern = re.compile('|'.join([re.escape(name) for name in names]),
                             re.IGNORECASE)
        found = pattern.search(date)
        if found is None:
            return None
        try:
            month_nb = names.index(found.group().lower()) + 1
            return get_i18n_month_name(month_nb, "short", ln)
        except Exception:
            # Best effort: let the caller try the next strategy
            return None

    #Look for textual month like "Jan" or "sep" or "November" or "novem"
    #Limit to the default language first (most probable date)
    default_names = [get_i18n_month_name(month).lower()
                     for month in range(1, 13)] # ["jan","feb","mar",...]
    month_name = month_from_names(default_names)
    if month_name is not None:
        return month_name

    #Look for month specified as number in the form 2004/03/08 or 17 02 2004
    #(always take second group of 2 or 1 digits separated by spaces or - etc.)
    month_pattern = re.compile(r'\d[\s\-/.,]+(?P<month>\d{1,2})[\s\-/.,]')
    found = month_pattern.search(date)
    if found is not None:
        month_nb = int(found.group("month"))
        #Values such as 0 or 13 are not months: ignore them
        if 1 <= month_nb <= 12:
            try:
                return get_i18n_month_name(month_nb, "short", ln)
            except Exception:
                pass

    #Look for textual month like "Jan" or "sep" or "November" or "novem"
    #Look for the month in each language ('en', 'fr', 'de', ...)
    for lang in [x[0] for x in language_list_long()]:
        lang_names = [get_i18n_month_name(month, "short", lang).lower()
                      for month in range(1, 13)] # ["jan","feb","mar",...]
        month_name = month_from_names(lang_names)
        if month_name is not None:
            return month_name

    return default
def _rotate_sitemap_writer(writer, sitemap_id, nb_urls, sitemaps):
    """
    Switch to a new sitemap file when the current one is considered full.

    The limits are only examined when 'nb_urls' is a multiple of 100 and
    not greater than MAX_RECORDS. In that case a new file is started if
    the current writer reports a size above MAX_SIZE or if 'nb_urls'
    equals MAX_RECORDS.

    @param writer: the SitemapWriter currently in use
    @param sitemap_id: the numeric suffix of the current sitemap file
    @param nb_urls: the URL counter used to decide whether to rotate
    @param sitemaps: list of sitemap names; the name of a newly created
        writer is appended to it in place
    @return: tuple (writer, sitemap_id) to use for the next URL
    """
    if nb_urls <= MAX_RECORDS and nb_urls % 100 == 0:
        if writer.get_size() > MAX_SIZE or nb_urls == MAX_RECORDS:
            writer.close()
            sitemap_id += 1
            writer = SitemapWriter(CFG_WEBDIR + '/sitemap-%s.xml' % sitemap_id)
            sitemaps.append(writer.get_name())
    return writer, sitemap_id


def generate_sitemaps(collection_names, fulltext_filter=''):
    """
    Generate sitemaps themselves. Return list of generated sitemaps files

    URLs are written in this order: the home page in every language, the
    public records, the public collections in every language, then the
    fulltext, comment and review pages of the records. A new
    'sitemap-<n>.xml' file is started under CFG_WEBDIR whenever
    _rotate_sitemap_writer decides the current one is full.

    @param collection_names: passed to get_all_public_records and
        get_all_public_collections to select what is exported
    @param fulltext_filter: passed to filter_fulltexts together with the
        selected records
    @return: list of the names (writer.get_name()) of the sitemaps
        written, in creation order
    """
    sitemap_id = 1
    writer = SitemapWriter(CFG_WEBDIR + '/sitemap-%s.xml' % sitemap_id)
    sitemaps = [writer.get_name()]
    nb_urls = 0

    # Home page, once per language. These URLs are counted by hand and
    # never trigger a rotation.
    for lang, _lang_name in language_list_long():
        writer.add_url(CFG_SITE_URL + '/?ln=%s' % lang,
                       lastmod=datetime.today(),
                       changefreq=DEFAULT_CHANGEFREQ_HOME,
                       priority=DEFAULT_PRIORITY_HOME)
        nb_urls += 1

    recids = get_all_public_records(collection_names)
    task_update_progress("Generating urls for %s records" % len(recids))
    for recid, lastmod in recids:
        writer, sitemap_id = _rotate_sitemap_writer(
            writer, sitemap_id, nb_urls, sitemaps)
        # From here on the counter is whatever add_url returns
        # NOTE(review): presumably the number of URLs in the current
        # file -- confirm against SitemapWriter.add_url
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)

    task_update_progress("Generating urls for collections")
    for collection, lastmod in get_all_public_collections(collection_names):
        for lang, _lang_name in language_list_long():
            writer, sitemap_id = _rotate_sitemap_writer(
                writer, sitemap_id, nb_urls, sitemaps)
            nb_urls = writer.add_url(
                '%s/collection/%s?ln=%s' % (CFG_SITE_URL,
                                            quote(collection),
                                            lang),
                lastmod=lastmod,
                changefreq=DEFAULT_CHANGEFREQ_COLLECTIONS,
                priority=DEFAULT_PRIORITY_COLLECTIONS)

    task_update_progress("Generating urls for fulltexts")
    for recid, lastmod in filter_fulltexts(recids, fulltext_filter):
        writer, sitemap_id = _rotate_sitemap_writer(
            writer, sitemap_id, nb_urls, sitemaps)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/files' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_FULLTEXTS,
                                 priority=DEFAULT_PRIORITY_FULLTEXTS)

    task_update_progress("Generating urls for comments")
    for recid, lastmod in filter_comments(recids):
        writer, sitemap_id = _rotate_sitemap_writer(
            writer, sitemap_id, nb_urls, sitemaps)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/comments' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_COMMENTS,
                                 priority=DEFAULT_PRIORITY_COMMENTS)

    task_update_progress("Generating urls for reviews")
    for recid, lastmod in filter_reviews(recids):
        writer, sitemap_id = _rotate_sitemap_writer(
            writer, sitemap_id, nb_urls, sitemaps)
        nb_urls = writer.add_url(CFG_SITE_URL + '/record/%s/reviews' % recid,
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_REVIEWS,
                                 priority=DEFAULT_PRIORITY_REVIEWS)

    try:
        writer.close()
    except Exception:
        # Best effort: a failure to close the last writer must not
        # prevent returning the list of sitemaps already produced.
        pass
    return sitemaps
\s* #any number of white spaces > #closing <lang> start tag (?P<langs>.*?) #anything but the next group (greedy) (</lang\s*>) #end tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding <en>...</en> tag (particular case of # pattern_lang) pattern_CFG_SITE_LANG = re.compile(r"<("+CFG_SITE_LANG+ \ r")\s*>(.*?)(</"+CFG_SITE_LANG+r"\s*>)", re.IGNORECASE | re.DOTALL) # Builds regular expression for finding each known language in <lang> tags ln_pattern_text = r"<(?P<lang>" ln_pattern_text += r"|".join([lang[0] for lang in \ language_list_long(enabled_langs_only=False)]) ln_pattern_text += r')\s*(revision="[^"]"\s*)?>(?P<translation>.*?)</\1>' ln_pattern = re.compile(ln_pattern_text, re.IGNORECASE | re.DOTALL) defined_tags = {'<CFG_SITE_NAME>': CFG_SITE_NAME, '<CFG_SITE_SUPPORT_EMAIL>': CFG_SITE_SUPPORT_EMAIL, '<CFG_SITE_ADMIN_EMAIL>': CFG_SITE_ADMIN_EMAIL, '<CFG_SITE_URL>': CFG_SITE_URL, '<CFG_SITE_SECURE_URL>': CFG_SITE_SECURE_URL, '<CFG_VERSION>': CFG_VERSION, '<CFG_SITE_NAME_INTL>': CFG_SITE_NAME_INTL} def get_webdoc_parts(webdoc, parts=['title', \ 'keywords', \ 'navtrail', \
\s* #any number of white spaces > #closing <lang> start tag (?P<langs>.*?) #anything but the next group (greedy) (</lang\s*>) #end tag ''', re.IGNORECASE | re.DOTALL | re.VERBOSE) # Regular expression for finding <en>...</en> tag (particular case of # pattern_lang) pattern_CFG_SITE_LANG = re.compile(r"<("+CFG_SITE_LANG+ \ r")\s*>(.*?)(</"+CFG_SITE_LANG+r"\s*>)", re.IGNORECASE | re.DOTALL) # Builds regular expression for finding each known language in <lang> tags ln_pattern_text = r"<(?P<lang>" ln_pattern_text += r"|".join([lang[0] for lang in \ language_list_long(enabled_langs_only=False)]) ln_pattern_text += r')\s*(revision="[^"]"\s*)?>(?P<translation>.*?)</\1>' ln_pattern = re.compile(ln_pattern_text, re.IGNORECASE | re.DOTALL) defined_tags = { '<CFG_SITE_NAME>': CFG_SITE_NAME, '<CFG_SITE_SUPPORT_EMAIL>': CFG_SITE_SUPPORT_EMAIL, '<CFG_SITE_ADMIN_EMAIL>': CFG_SITE_ADMIN_EMAIL, '<CFG_SITE_URL>': CFG_SITE_URL, '<CFG_SITE_SECURE_URL>': CFG_SITE_SECURE_URL, '<CFG_VERSION>': CFG_VERSION, '<CFG_SITE_NAME_INTL>': CFG_SITE_NAME_INTL } def get_webdoc_parts(webdoc, parts=['title', \