def flush(self):
    """Send the HTTP header (if not yet sent) and flush the output buffer.

    Write failures caused by the client disconnecting are logged without
    alerting the admin; any other IOError is re-raised.
    """
    self.send_http_header()
    if self.__buffer:
        self.__bytes_sent += len(self.__buffer)
        try:
            if not self.__write_error:
                if self.__replace_https:
                    # Rewrite links to https before sending.
                    self.__write(https_replace(self.__buffer))
                else:
                    if self.__buffer:
                        self.__write(self.__buffer)
                if self.track_writings:
                    # Keep a copy of everything sent for later inspection.
                    if self.__replace_https:
                        self.__what_was_written += https_replace(self.__buffer)
                    else:
                        self.__what_was_written += self.__buffer
        except IOError as err:
            if "failed to write data" in str(err) or "client connection closed" in str(err):
                ## Let's just log this exception without alerting the admin:
                register_exception(req=self)
                self.__write_error = True  ## This flag is there just
                ## to not report later other errors to the admin.
            else:
                raise
        self.__buffer = ''
def _mint_pid(obj, dummy_eng):
    """Reserve a persistent identifier for the deposition's latest SIP.

    If no pid is present in the SIP metadata, a new one is created with
    the user-supplied creator function; otherwise the existing-pid checker
    decides whether it should be registered.  Registration itself happens
    asynchronously later.

    NOTE(review): ``pid_field``, ``pid_creator``, ``existing_pid_checker``
    and ``pid_store_type`` are free names here — presumably closed over
    from an enclosing factory function; confirm against the caller.
    """
    d = Deposition(obj)
    recjson = d.get_latest_sip(sealed=False).metadata

    if 'recid' not in recjson:
        raise Exception("'recid' not found in sip metadata.")

    pid_text = None
    pid = recjson.get(pid_field, None)
    if not pid:
        # No pid found in recjson, so create new pid with user supplied
        # function.
        pid_text = recjson[pid_field] = pid_creator(recjson)
    else:
        # Pid found - check if it should be minted
        if existing_pid_checker and existing_pid_checker(pid, recjson):
            pid_text = pid

    # Create an assign pid internally - actually registration will happen
    # asynchronously later.
    if pid_text:
        current_app.logger.info("Registering pid %s" % pid_text)
        pid_obj = PersistentIdentifier.create(pid_store_type, pid_text)
        if pid_obj is None:
            pid_obj = PersistentIdentifier.get(pid_store_type, pid_text)

        try:
            pid_obj.assign("rec", recjson['recid'])
        except Exception:
            register_exception(alert_admin=True)

    d.update()
def __call__(self, req, form):
    """Return deprecation warning."""
    try:
        from invenio.legacy.webpage import page
    except ImportError:
        register_exception()
        # Fallback renderer: just return the body text unwrapped.
        def page(*args):
            return args[1]
    req.status = apache.HTTP_SERVICE_UNAVAILABLE
    msg = "<p>This functionality will be soon deprecated.</p>"
    try:
        from invenio.config import CFG_SITE_ADMIN_EMAIL
        msg += ("""<p>If you would still like to use it, please ask your Invenio administrator <code>%s</code> to consider enabling it. </p>""" % CFG_SITE_ADMIN_EMAIL)
    except ImportError:
        pass
    try:
        return page("Service disabled", msg, req=req)
    except Exception:
        # Rendering failed for any reason: degrade to the raw message.
        return msg
def process_alerts(alerts): """Process the given alerts and store the records found to the user defined baskets and/or notify them by e-mail""" # TBD: do not generate the email each time, forge it once and then # send it to all appropriate people for a in alerts['alerts']: if alert_use_basket_p(a): add_records_to_basket(alerts['records'], a[2]) if alert_use_notification_p(a): argstr = update_arguments(alerts['argstr'], alerts['date_from'], alerts['date_until']) try: email_notify(a, alerts['records'], argstr) except Exception: # There were troubles sending this alert, so register # this exception and continue with other alerts: register_exception(alert_admin=True, prefix="Error when sending alert %s, %s\n." % \ (repr(a), repr(argstr))) # Inform the admin when external collections time out if len(alerts['records'][1][1]) > 0: register_exception(alert_admin=True, prefix="External collections %s timed out when sending alert %s, %s\n." % \ (", ".join(alerts['records'][1][1]), repr(a), repr(argstr))) update_date_lastrun(a)
def cache_whatsNew(html, journal_name, issue, ln):
    """Cache the "what's new" box for 30 minutes.

    @param html: rendered HTML fragment to cache
    @param journal_name: journal name, used as a path component
    @param issue: issue in "number/year" form
    @param ln: language code
    """
    if not CFG_ACCESS_CONTROL_LEVEL_SITE == 2:
        issue = issue.replace("/", "_")
        issue_number, year = issue.split("_", 1)
        cache_path = os.path.abspath('%s/webjournal/%s/%s/%s/whatsNew_%s.html' %
                                     (CFG_CACHEDIR, journal_name,
                                      year, issue_number, ln))
        if cache_path.startswith(CFG_CACHEDIR + '/webjournal'):
            # Do not try to cache if the journal name led us to some
            # other directory ('../../' inside journal name for
            # example)
            try:
                cache_dir = os.path.dirname(cache_path)
                if not os.path.isdir(cache_dir):
                    os.makedirs(cache_dir)
                # BUGFIX: 'file()' is Python-2-only and leaked the handle
                # if write() failed; use open() in a context manager.
                with open(cache_path, "w") as cache_file:
                    cache_file.write(html)
            except Exception:
                register_exception(req=None,
                                   prefix="Could not store 'Whats new' section",
                                   alert_admin=True)
def goto_handler(req, form):
    """Call the configured goto plugin and redirect to the URL it returns.

    NOTE(review): ``params_to_pass``, ``redirection_data``, ``component``,
    ``path`` and ``goto_plugin`` are free names — presumably closed over
    from an enclosing factory; confirm against the surrounding code.
    """
    ## Let's put what is in the GET query
    for key, value in dict(form).items():
        if key in params_to_pass:
            params_to_pass[key] = str(value)
    ## Let's override the params_to_pass to the call with the
    ## arguments in the configuration
    configuration_parameters = redirection_data['parameters'] or {}
    params_to_pass.update(configuration_parameters)
    ## Let's add default parameters if the plugin expects them
    if 'component' in params_to_pass:
        params_to_pass['component'] = component
    if 'path' in params_to_pass:
        params_to_pass['path'] = path
    if 'user_info' in params_to_pass:
        params_to_pass['user_info'] = collect_user_info(req)
    if 'req' in params_to_pass:
        params_to_pass['req'] = req
    try:
        new_url = goto_plugin(**params_to_pass)
    except Exception as err:
        # A broken plugin must not leak a traceback to the client.
        register_exception(req=req, alert_admin=True)
        raise SERVER_RETURN(HTTP_NOT_FOUND)
    if new_url:
        if new_url.startswith('/'):
            new_url = CFG_SITE_URL + new_url
        redirect_to_url(req, new_url)
    else:
        raise SERVER_RETURN(HTTP_NOT_FOUND)
def oai_list_records_or_identifiers(req, argd):
    """Generates response to oai_list_records verb.

    @param req: request object to write the OAI response to
    @param argd: washed OAI request arguments (may be replaced by the
        cached arguments when a resumptionToken is supplied)
    """
    verb = argd["verb"]
    resumption_token_was_specified = False

    # check if the resumption_token did not expire
    if argd.get("resumptionToken"):
        resumption_token_was_specified = True
        try:
            cache = oai_cache_load(argd["resumptionToken"])
            last_recid = cache["last_recid"]
            argd = cache["argd"]
            complete_list = cache["complete_list"]
            complete_list = filter_out_based_on_date_range(
                complete_list, argd.get("from", ""), argd.get("until", ""))
        # BUGFIX: "except Exception, e" is Python-2-only syntax and a
        # SyntaxError on Python 3.
        except Exception as e:
            # Ignore cache not found errors
            if not isinstance(e, IOError) or e.errno != 2:
                register_exception(alert_admin=True)
            req.write(
                oai_error(
                    argd,
                    [("badResumptionToken",
                      "ResumptionToken expired or invalid: %s" %
                      argd["resumptionToken"])]))
            return
def calculate_RFC2104_HMAC(data, _amazon_secret_access_key):
    """
    Computes a RFC 2104 compliant HMAC Signature and then Base64 encodes it.

    Module hashlib must be installed if Python < 2.5
    <http://pypi.python.org/pypi/hashlib/20081119>

    @param data: data to sign
    @param _amazon_secret_access_key: your Amazon secret key

    @type data: string
    @type _amazon_secret_access_key: string. Empty if hashlib module not installed
    """
    if not HASHLIB_IMPORTED:
        # Raise-then-catch so register_exception() gets a full traceback.
        try:
            raise Exception(
                "Module hashlib not installed. Please install it.")
        # BUGFIX: was a bare "except:", which would also trap
        # SystemExit/KeyboardInterrupt.
        except Exception:
            from invenio.ext.logging import register_exception
            register_exception(stream='warning', alert_admin=True,
                               subject='Cannot create AWS signature')
            return ""
    else:
        if sys.version_info < (2, 5):
            # compatibility mode for Python < 2.5 and hashlib
            my_digest_algo = _MySHA256(sha256())
        else:
            my_digest_algo = sha256
        # NOTE(review): base64.encodestring is removed in Python 3.9+
        # (use encodebytes there) — kept for Python 2 compatibility.
        return base64.encodestring(
            hmac.new(_amazon_secret_access_key, data,
                     my_digest_algo).digest()).strip()
def sendfile(self, path, offset=0, the_len=-1):
    """Send the content of a file to the client.

    @param path: path of the file to send
    @param offset: byte offset to start sending from
    @param the_len: number of bytes to send; negative means "to EOF"
    @return: total number of bytes sent on this connection so far

    Client-disconnect IOErrors are logged without alerting the admin;
    any other IOError is re-raised.
    """
    try:
        self.send_http_header()
        file_to_send = open(path)
        try:
            file_to_send.seek(offset)
            file_wrapper = FileWrapper(file_to_send)
            count = 0
            if the_len < 0:
                # Send everything until EOF.
                for chunk in file_wrapper:
                    count += len(chunk)
                    self.__bytes_sent += len(chunk)
                    self.__write(chunk)
            else:
                # Send at most the_len bytes, truncating the last chunk.
                for chunk in file_wrapper:
                    if the_len >= len(chunk):
                        the_len -= len(chunk)
                        count += len(chunk)
                        self.__bytes_sent += len(chunk)
                        self.__write(chunk)
                    else:
                        count += the_len
                        self.__bytes_sent += the_len
                        self.__write(chunk[:the_len])
                        break
        finally:
            # BUGFIX: the file handle was previously never closed.
            file_to_send.close()
    except IOError as err:
        if "failed to write data" in str(err) or "client connection closed" in str(err):
            ## Let's just log this exception without alerting the admin:
            register_exception(req=self)
        else:
            raise
    return self.__bytes_sent
def get_oai_set(id=''):
    """Return the OAI repository set definition(s) as parameter lists.

    @param id: optional oaiREPOSITORY row id; all sets when empty
    @return: list of 16-element lists (one per set) on success, or the
        error message string on database failure
    """
    sets = []
    sql = ("SELECT id, setSpec, setName, setCollection, setDescription, "
           "p1,f1,m1, p2,f2,m2, p3,f3,m3, setDefinition FROM oaiREPOSITORY")
    try:
        # BUGFIX: the id was previously interpolated with "%s" % id,
        # an SQL injection risk; bind it as a query parameter instead.
        if id:
            res = run_sql(sql + " WHERE id=%s ORDER BY setSpec asc", (id,))
        else:
            res = run_sql(sql + " ORDER BY setSpec asc")
        for row in res:
            # Renamed from 'set' to avoid shadowing the builtin.
            oai_set = [''] * 16
            oai_set[0] = row[0]
            oai_set[1] = row[1]
            oai_set[2] = row[2]
            params = parse_set_definition(row[14])
            oai_set[3] = params.get('c', '')
            oai_set[5] = params.get('p1', '')
            oai_set[6] = params.get('f1', '')
            oai_set[7] = params.get('m1', '')
            oai_set[8] = params.get('p2', '')
            oai_set[9] = params.get('f2', '')
            oai_set[10] = params.get('m2', '')
            oai_set[11] = params.get('p3', '')
            oai_set[12] = params.get('f3', '')
            oai_set[13] = params.get('m3', '')
            oai_set[14] = params.get('op1', 'a')
            oai_set[15] = params.get('op2', 'a')
            sets.append(oai_set)
        return sets
    # BUGFIX: StandardError does not exist on Python 3.
    except Exception as e:
        register_exception(alert_admin=True)
        return str(e)
def template_context_function(id_bibrec, pattern, qid, current_user):
    """Return fulltext snippets.

    :param id_bibrec: ID of record
    :param pattern: search pattern
    :param current_user: user object
    :param qid: query id
    :return: HTML containing snippet, '' when nothing matched, or None
        when the inputs do not allow a fulltext search
    """
    if not pattern:
        pattern = get_pattern_from_cache(qid)

    if id_bibrec and pattern and current_user:
        # Requires search in fulltext field
        if CFG_WEBSEARCH_FULLTEXT_SNIPPETS and 'fulltext:' in pattern:
            terms = get_fulltext_terms_from_search_pattern(pattern)
            if terms:
                snippets = ''
                try:
                    snippets = get_pdf_snippets(
                        id_bibrec, terms, current_user).decode('utf8')
                    if snippets:
                        return ' ... ' + snippets + ' ... '
                # BUGFIX: was a bare "except:", which also traps
                # SystemExit/KeyboardInterrupt.
                except Exception:
                    register_exception()
                return ''
            else:
                return ''
    else:
        return None
def fix_bibdoc_bibdoc(id_bibdoc1, id_bibdoc2, logfile):
    """Migrate an icon.

    Attaches every latest file of document id_bibdoc2 as an icon of
    document id_bibdoc1, deletes the icon document and removes the
    linking row from bibdoc_bibdoc.  Progress is echoed both to stdout
    and to ``logfile``.

    @return: True on success (or when the source doc cannot even be
        opened), False when the migration itself fails
    """
    try:
        the_bibdoc = BibDoc.create_instance(id_bibdoc1)
    except Exception as err:
        msg = "WARNING: when opening docid %s: %s" % (id_bibdoc1, err)
        print(msg, file=logfile)
        print(msg)
        # Treated as "nothing to do" rather than a failure.
        return True
    try:
        msg = "Fixing icon for the document %s" % (id_bibdoc1, )
        print(msg, end=' ')
        print(msg, end=' ', file=logfile)
        the_icon = BibDoc.create_instance(id_bibdoc2)
        for a_file in the_icon.list_latest_files():
            the_bibdoc.add_icon(a_file.get_full_path(),
                                format=a_file.get_format())
        the_icon.delete()
        run_sql("DELETE FROM bibdoc_bibdoc WHERE id_bibdoc1=%s AND id_bibdoc2=%s",
                (id_bibdoc1, id_bibdoc2))
        print("OK")
        print("OK", file=logfile)
        return True
    except Exception as err:
        print("ERROR: %s" % err)
        print("ERROR: %s" % err, file=logfile)
        register_exception()
        return False
def template_context_function(id_bibrec, pattern, qid, current_user):
    """Return fulltext snippets.

    :param id_bibrec: ID of record
    :param pattern: search pattern
    :param current_user: user object
    :param qid: query id
    :return: HTML containing snippet, '' when nothing matched, or None
        when the inputs do not allow a fulltext search
    """
    if not pattern:
        pattern = get_pattern_from_cache(qid)

    if id_bibrec and pattern and current_user:
        # Requires search in fulltext field
        if CFG_WEBSEARCH_FULLTEXT_SNIPPETS and 'fulltext:' in pattern:
            terms = get_fulltext_terms_from_search_pattern(pattern)
            if terms:
                snippets = ''
                try:
                    snippets = get_pdf_snippets(id_bibrec, terms,
                                                current_user).decode('utf8')
                    if snippets:
                        return ' ... ' + snippets + ' ... '
                # BUGFIX: was a bare "except:", which also traps
                # SystemExit/KeyboardInterrupt.
                except Exception:
                    register_exception()
                return ''
            else:
                return ''
    else:
        return None
def get_formatted_data(self, of=None):
    """Get the formatted representation for this object.

    @param of: output format code; defaults to the configured
        WORKFLOWS_HOLDING_PEN_DEFAULT_OUTPUT_FORMAT
    @return: the formatted string, or "" when the workflow or its
        formatter cannot be resolved (the error is registered)
    """
    from .registry import workflows
    if of is None:
        of = cfg.get("WORKFLOWS_HOLDING_PEN_DEFAULT_OUTPUT_FORMAT")
    try:
        name = self.get_workflow_name()
        if not name:
            return ""
        # TODO: this can be removed when workflow refactoring is done
        if not hasattr(self, "data"):
            self.data = self.get_data()
        if not hasattr(self, "extra_data"):
            self.extra_data = self.get_extra_data()
        workflow_definition = workflows[name]
        formatted_data = workflow_definition.formatter(self, of=of)
    except (KeyError, AttributeError):
        # Somehow the workflow or formatter does not exist
        from invenio.ext.logging import register_exception
        register_exception(alert_admin=True)
        formatted_data = ""
    return formatted_data
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Commit to Solr when the pending counter is full (or on final commit).

    @param next_commit_counter: documents indexed since the last commit
    @param final_commit: force a commit when anything is pending
    @param recid: record id, used only for log messages
    @return: the new counter value (0 after a commit attempt)
    """
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)
        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        # BUGFIX: was a bare "except:"; keep swallowing commit errors but
        # do not trap SystemExit/KeyboardInterrupt.
        except Exception:
            register_exception(alert_admin=True)
        next_commit_counter = 0
        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
def perform_request_leave_group(uid, grpID, confirmed=0, ln=CFG_SITE_LANG):
    """Leave group.

    @param uid: user ID
    @param grpID: ID of the group the user wants to leave (-1 if none selected)
    @param confirmed: when false, a confirmation page is displayed first
    @param ln: language
    @return: body with warnings
    """
    _ = gettext_set_language(ln)
    warnings = []
    infos = []

    if grpID == -1:
        # No group selected: warn and redisplay the selection form.
        try:
            raise InvenioWebSessionWarning(_('Please select one group.'))
        except InvenioWebSessionWarning as exc:
            register_exception(stream='warning')
            warnings.append(exc.message)
        return perform_request_input_leave_group(uid, warnings=warnings, ln=ln)

    if confirmed:
        db.leave_group(grpID, uid)
        infos.append(CFG_WEBSESSION_INFO_MESSAGES["LEAVE_GROUP"])
        return perform_request_groups_display(uid, infos=infos,
                                              warnings=warnings, ln=ln)

    # Ask for confirmation first.
    return websession_templates.tmpl_confirm_leave(uid, grpID, ln)
def oai_list_records_or_identifiers(req, argd):
    """Generates response to oai_list_records verb.

    @param req: request object to write the OAI response to
    @param argd: washed OAI request arguments (may be replaced by the
        cached arguments when a resumptionToken is supplied)
    """
    verb = argd['verb']
    resumption_token_was_specified = False

    # check if the resumption_token did not expire
    if argd.get('resumptionToken'):
        resumption_token_was_specified = True
        try:
            cache = oai_cache_load(argd['resumptionToken'])
            last_recid = cache['last_recid']
            argd = cache['argd']
            complete_list = cache['complete_list']
            complete_list = filter_out_based_on_date_range(
                complete_list, argd.get('from', ''), argd.get('until', ''))
        # BUGFIX: "except Exception, e" is Python-2-only syntax and a
        # SyntaxError on Python 3.
        except Exception as e:
            # Ignore cache not found errors
            if not isinstance(e, IOError) or e.errno != 2:
                register_exception(alert_admin=True)
            req.write(
                oai_error(argd,
                          [("badResumptionToken",
                            "ResumptionToken expired or invalid: %s" %
                            argd['resumptionToken'])]))
            return
def calculate_RFC2104_HMAC(data, _amazon_secret_access_key):
    """
    Computes a RFC 2104 compliant HMAC Signature and then Base64 encodes it.

    Module hashlib must be installed if Python < 2.5
    <http://pypi.python.org/pypi/hashlib/20081119>

    @param data: data to sign
    @param _amazon_secret_access_key: your Amazon secret key

    @type data: string
    @type _amazon_secret_access_key: string. Empty if hashlib module not installed
    """
    if not HASHLIB_IMPORTED:
        # Raise-then-catch so register_exception() gets a full traceback.
        try:
            raise Exception("Module hashlib not installed. Please install it.")
        # BUGFIX: was a bare "except:", which would also trap
        # SystemExit/KeyboardInterrupt.
        except Exception:
            from invenio.ext.logging import register_exception
            register_exception(stream="warning", alert_admin=True,
                               subject="Cannot create AWS signature")
            return ""
    else:
        if sys.version_info < (2, 5):
            # compatibility mode for Python < 2.5 and hashlib
            my_digest_algo = _MySHA256(sha256())
        else:
            my_digest_algo = sha256
        # NOTE(review): base64.encodestring is removed in Python 3.9+
        # (use encodebytes there) — kept for Python 2 compatibility.
        return base64.encodestring(hmac.new(_amazon_secret_access_key, data,
                                            my_digest_algo).digest()).strip()
def application(environ, start_response, handler=None):
    """
    Entry point for wsgi.

    @param environ: WSGI environment
    @param start_response: WSGI start_response callable
    @param handler: optional request handler; defaults to the legacy
        Invenio handler
    @return: the response iterable (or a werkzeug BaseResponse redirect)
    """
    ## Needed for mod_wsgi, see: <http://code.google.com/p/modwsgi/wiki/ApplicationIssues>
    req = SimulatedModPythonRequest(environ, start_response)
    #print 'Starting mod_python simulation'
    try:
        if handler is None:
            from invenio.ext.legacy.layout import invenio_handler
            invenio_handler(req)
        else:
            handler(req)
        req.flush()
    ## TODO for future reimplementation of stream_file
    #except StreamFileException as e:
    #    return e.value
    except SERVER_RETURN as status:
        redirection, = status.args
        from werkzeug.wrappers import BaseResponse
        if isinstance(redirection, BaseResponse):
            return redirection
        status = int(str(status))
        if status == 404:
            from werkzeug.exceptions import NotFound
            raise NotFound()
        if status not in (OK, DONE):
            req.status = status
            req.headers_out['content-type'] = 'text/html'
            admin_to_be_alerted = alert_admin_for_server_status_p(
                status, req.headers_in.get('referer'))
            if admin_to_be_alerted:
                register_exception(req=req, alert_admin=True)
            if not req.response_sent_p:
                start_response(req.get_wsgi_status(),
                               req.get_low_level_headers(),
                               sys.exc_info())
            # BUGFIX: was "map(req.write, ...)" which is lazy on Python 3
            # and therefore never actually wrote the error page.
            for chunk in generate_error_page(req, admin_to_be_alerted):
                req.write(chunk)
            req.flush()
    finally:
        ##for (callback, data) in req.get_cleanups():
        ##    callback(data)
        #if hasattr(req, '_session'):
        #    ## The session handler saves for caching a request_wrapper
        #    ## in req.
        #    ## This saves req as an attribute, creating a circular
        #    ## reference.
        #    ## Since we have have reached the end of the request handler
        #    ## we can safely drop the request_wrapper so to avoid
        #    ## memory leaks.
        #    delattr(req, '_session')
        #if hasattr(req, '_user_info'):
        #    ## For the same reason we can delete the user_info.
        #    delattr(req, '_user_info')
        ## as suggested in
        ## <http://www.python.org/doc/2.3.5/lib/module-gc.html>
        del gc.garbage[:]
    return req.response
def perform_request_delete_msg(uid, msgid, ln=CFG_SITE_LANG):
    """
    Delete a given message from user inbox
    @param uid: user id (int)
    @param msgid: message id (int)
    @param ln: language
    @return: body with warnings
    """
    _ = gettext_set_language(ln)
    warnings = []
    infos = []
    body = ""
    if (db.check_user_owns_message(uid, msgid) == 0):
        # The user doesn't own this message
        try:
            # BUGFIX: message read "this message in not in your mailbox";
            # fixed to the wording used elsewhere in this module.
            raise InvenioWebMessageError(_('Sorry, this message is not in your mailbox.'))
        except InvenioWebMessageError as exc:
            register_exception()
            body = webmessage_templates.tmpl_error(exc.message, ln)
            return body
    else:
        if (db.delete_message_from_user_inbox(uid, msgid) == 0):
            warnings.append(_("The message could not be deleted."))
        else:
            infos.append(_("The message was successfully deleted."))
        return perform_request_display(uid, warnings, infos, ln)
def fix_bibdoc_bibdoc(id_bibdoc1, id_bibdoc2, logfile):
    """Migrate an icon: attach the latest files of doc id_bibdoc2 as icons
    of doc id_bibdoc1, then delete the icon doc and its linking row.

    @return: True on success (or when the target doc cannot be opened),
        False when the migration itself fails
    """
    try:
        main_doc = BibDoc.create_instance(id_bibdoc1)
    except Exception as err:
        message = "WARNING: when opening docid %s: %s" % (id_bibdoc1, err)
        print(message, file=logfile)
        print(message)
        return True

    try:
        message = "Fixing icon for the document %s" % (id_bibdoc1, )
        print(message, end=' ')
        print(message, end=' ', file=logfile)
        icon_doc = BibDoc.create_instance(id_bibdoc2)
        for icon_file in icon_doc.list_latest_files():
            main_doc.add_icon(icon_file.get_full_path(),
                              format=icon_file.get_format())
        icon_doc.delete()
        run_sql("DELETE FROM bibdoc_bibdoc WHERE id_bibdoc1=%s AND id_bibdoc2=%s",
                (id_bibdoc1, id_bibdoc2))
        print("OK")
        print("OK", file=logfile)
        return True
    except Exception as err:
        print("ERROR: %s" % err)
        print("ERROR: %s" % err, file=logfile)
        register_exception()
        return False
def _create_icon(file_path, icon_size, format="gif", verbosity=9):
    """Create an icon for the given file and return the icon's full path.

    If creation fails, the exception is registered (no admin e-mail) and
    None is returned.

    Parameters:

     - file_path : *str* full path to icon

     - icon_size : *int* the scaling information to be used for the
                   creation of the new icon.

     - verbosity : *int* the verbosity level under which the program
                   is to run;
    """
    try:
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        icon_properties = {
            "input-file": file_path,
            "icon-name": "icon-%s" % base_name,
            "multipage-icon": False,
            "multipage-icon-delay": 0,
            "icon-scale": icon_size,
            "icon-file-format": format,
            "verbosity": verbosity,
        }
        (icon_dir, icon_name) = create_icon(icon_properties)
        return icon_dir + os.sep + icon_name
    except InvenioWebSubmitIconCreatorError as e:
        register_exception(prefix="Icon for file %s could not be created: %s" % (file_path, str(e)),
                           alert_admin=False)
        return None
def perform_request_delete_msg(uid, msgid, ln=CFG_SITE_LANG):
    """
    Delete a given message from user inbox
    @param uid: user id (int)
    @param msgid: message id (int)
    @param ln: language
    @return: body with warnings
    """
    _ = gettext_set_language(ln)
    warnings = []
    infos = []

    if db.check_user_owns_message(uid, msgid) == 0:
        # The user doesn't own this message
        try:
            raise InvenioWebMessageError(
                _('Sorry, this message is not in your mailbox.'))
        except InvenioWebMessageError as exc:
            register_exception()
            return webmessage_templates.tmpl_error(exc.message, ln)

    if db.delete_message_from_user_inbox(uid, msgid) == 0:
        warnings.append(_("The message could not be deleted."))
    else:
        infos.append(_("The message was successfully deleted."))
    return perform_request_display(uid, warnings, infos, ln)
def add_oai_set(oai_set_name, oai_set_spec, oai_set_collection,
                oai_set_description, oai_set_p1, oai_set_f1, oai_set_m1,
                oai_set_p2, oai_set_f2, oai_set_m2, oai_set_p3, oai_set_f3,
                oai_set_m3, oai_set_op1, oai_set_op2):
    """Add a definition into the OAI Repository.

    @return: (1, "") on success, (0, exception) on database failure
    """
    try:
        if not oai_set_spec:
            oai_set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        set_definition = 'c=' + oai_set_collection + ';' + \
                         'p1=' + oai_set_p1 + ';' + \
                         'f1=' + oai_set_f1 + ';' + \
                         'm1=' + oai_set_m1 + ';' + \
                         'op1=' + oai_set_op1 + ';' + \
                         'p2=' + oai_set_p2 + ';' + \
                         'f2=' + oai_set_f2 + ';' + \
                         'm2=' + oai_set_m2 + ';' + \
                         'op2=' + oai_set_op2 + ';' + \
                         'p3=' + oai_set_p3 + ';' + \
                         'f3=' + oai_set_f3 + ';' + \
                         'm3=' + oai_set_m3 + ';'
        run_sql("""INSERT INTO oaiREPOSITORY (id, setName, setSpec,
                    setCollection, setDescription, setDefinition,
                    setRecList, p1, f1, m1, p2, f2, m2, p3, f3, m3)
                   VALUES (0, %s, %s, %s, %s, %s, NULL,
                           %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (oai_set_name, oai_set_spec, oai_set_collection,
                 oai_set_description, set_definition,
                 oai_set_p1, oai_set_f1, oai_set_m1,
                 oai_set_p2, oai_set_f2, oai_set_m2,
                 oai_set_p3, oai_set_f3, oai_set_m3))
        return (1, "")
    # BUGFIX: StandardError does not exist on Python 3.
    except Exception as e:
        register_exception(alert_admin=True)
        return (0, e)
def flush(self):
    """Send the HTTP header (if not yet sent) and flush the output buffer.

    Write failures caused by the client disconnecting are logged without
    alerting the admin; any other IOError is re-raised.
    """
    self.send_http_header()
    if self.__buffer:
        self.__bytes_sent += len(self.__buffer)
        try:
            if not self.__write_error:
                if self.__replace_https:
                    # Rewrite links to https before sending.
                    self.__write(https_replace(self.__buffer))
                else:
                    if self.__buffer:
                        self.__write(self.__buffer)
                if self.track_writings:
                    # Keep a copy of everything sent for later inspection.
                    if self.__replace_https:
                        self.__what_was_written += https_replace(self.__buffer)
                    else:
                        self.__what_was_written += self.__buffer
        except IOError as err:
            if "failed to write data" in str(err) or "client connection closed" in str(err):
                ## Let's just log this exception without alerting the admin:
                register_exception(req=self)
                self.__write_error = True  ## This flag is there just
                ## to not report later other errors to the admin.
            else:
                raise
        self.__buffer = ""
def sub(self, req, form):
    """DEPRECATED: /submit/sub is deprecated now, so raise email to the admin
    (but allow submission to continue anyway)"""
    args = wash_urlargd(form, {'password': (str, '')})
    uid = getUid(req)
    if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1:
        return page_not_authorized(req, "../sub/", navmenuid='submit')
    try:
        # BUGFIX: "raise DeprecationWarning, '...'" is Python-2-only
        # syntax and a SyntaxError on Python 3.
        raise DeprecationWarning('submit/sub handler has been used. Please use submit/direct. e.g. "submit/sub?RN=123@SBIFOO" -> "submit/direct?RN=123&sub=SBIFOO"')
    except DeprecationWarning:
        register_exception(req=req, alert_admin=True)

    ln = args['ln']
    _ = gettext_set_language(ln)
    #DEMOBOO_RN=DEMO-BOOK-2008-001&ln=en&password=1223993532.26572%40APPDEMOBOO
    params = dict(form)
    password = args['password']
    if password:
        del params['password']
        if "@" in password:
            params['access'], params['sub'] = password.split('@', 1)
        else:
            params['sub'] = password
    else:
        args = str(req.args).split('@')
        if len(args) > 1:
            params = {'sub': args[-1]}
            args = '@'.join(args[:-1])
            params.update(cgi.parse_qs(args))
        else:
            return warning_page(_("Sorry, invalid URL..."), req, ln=ln)
    url = "%s/submit/direct?%s" % (CFG_SITE_SECURE_URL,
                                   urlencode(params, doseq=True))
    redirect_to_url(req, url)
def sub(self, req, form):
    """DEPRECATED: /submit/sub is deprecated now, so raise email to the admin
    (but allow submission to continue anyway)"""
    args = wash_urlargd(form, {'password': (str, '')})
    uid = getUid(req)
    if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1:
        return page_not_authorized(req, "../sub/", navmenuid='submit')
    try:
        # BUGFIX: "raise DeprecationWarning, '...'" is Python-2-only
        # syntax and a SyntaxError on Python 3.
        raise DeprecationWarning('submit/sub handler has been used. Please use submit/direct. e.g. "submit/sub?RN=123@SBIFOO" -> "submit/direct?RN=123&sub=SBIFOO"')
    except DeprecationWarning:
        register_exception(req=req, alert_admin=True)

    ln = args['ln']
    _ = gettext_set_language(ln)
    #DEMOBOO_RN=DEMO-BOOK-2008-001&ln=en&password=1223993532.26572%40APPDEMOBOO
    params = dict(form)
    password = args['password']
    if password:
        del params['password']
        if "@" in password:
            params['access'], params['sub'] = password.split('@', 1)
        else:
            params['sub'] = password
    else:
        args = str(req.args).split('@')
        if len(args) > 1:
            params = {'sub': args[-1]}
            args = '@'.join(args[:-1])
            params.update(cgi.parse_qs(args))
        else:
            return warning_page(_("Sorry, invalid URL..."), req, ln=ln)
    url = "%s/submit/direct?%s" % (CFG_SITE_SECURE_URL,
                                   urlencode(params, doseq=True))
    redirect_to_url(req, url)
def perform_request_contact(req, ln, journal_name, verbose=0):
    """Display contact information"""
    try:
        contact_template = get_journal_template('contact', journal_name, ln)
    except InvenioWebJournalTemplateNotFoundError as e:
        register_exception(req=req)
        return e.user_box(req)

    user_info = collect_user_info(req)
    # Minimal dummy MARC record: formatting only needs a BibFormatObject.
    temp_marc = '''<record> <controlfield tag="001">0</controlfield> </record>'''
    bfo = BibFormatObject(0, ln=ln, xml_record=temp_marc, user_info=user_info)
    bfo.req = req
    return format_with_format_template(contact_template, bfo)
def get_format_template(filename, with_attributes=False):
    """
    Return the structured content of the given format template.

    If 'with_attributes' is True, also return the name and description.
    Else 'attrs' is not returned as key in dictionary (it might, if it
    has already been loaded previously)::

        {'code':"<b>Some template code</b>"
         'attrs': {'name': "a name", 'description': "a description"}
        }

    :param filename: the filename of an format template
    :param with_attributes: if True, fetch the attributes
        (names and description) for format
    :return: structured content of format template
    :raises RuntimeError: when the file is not an .xsl template
    """
    if not filename.lower().endswith(".xsl"):
        raise RuntimeError('Unsupported file type {}.'.format(filename))

    format_template = {'code': ""}
    try:
        path = registry.format_templates_lookup[filename]
        # BUGFIX: the previous explicit close() leaked the handle if
        # read() raised; a context manager always closes it.
        with open(path) as format_file:
            format_template['code'] = format_file.read()
    except Exception:
        register_exception()
    return format_template
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    """Commit to Solr when the pending counter is full (or on final commit).

    @param next_commit_counter: documents indexed since the last commit
    @param final_commit: force a commit when anything is pending
    @param recid: record id, used only for log messages
    @return: the new counter value (0 after a commit attempt)
    """
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or (
            final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)
        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        # BUGFIX: was a bare "except:"; keep swallowing commit errors but
        # do not trap SystemExit/KeyboardInterrupt.
        except Exception:
            register_exception(alert_admin=True)
        next_commit_counter = 0
        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
def get_remote_ip(self):
    """Return the client's remote IP address.

    Behind a trusted reverse proxy (listed in
    CFG_WEBSTYLE_REVERSE_PROXY_IPS, with consistent X-FORWARDED-* headers
    matching CFG_SITE_URL's host) the first well-formed address from
    X-FORWARDED-FOR is used.  Sentinel addresses 10.0.0.10 / 10.0.0.11
    signal a malformed or untrusted proxy setup; otherwise falls back to
    the WSGI request's remote_addr.
    """
    if ("X-FORWARDED-FOR" in self.__headers_in and
            self.__headers_in.get("X-FORWARDED-SERVER", "") ==
            self.__headers_in.get("X-FORWARDED-HOST", "") ==
            urlparse(CFG_SITE_URL)[1]):
        # we are using proxy setup
        if self.__environ.get("REMOTE_ADDR") in CFG_WEBSTYLE_REVERSE_PROXY_IPS:
            # we trust this proxy
            ip_list = self.__headers_in["X-FORWARDED-FOR"].split(",")
            for ip in ip_list:
                if _RE_IPADDRESS_START.match(ip):
                    return ip
            # no IP has the correct format, return a default IP
            return "10.0.0.10"
        else:
            # we don't trust this proxy
            register_exception(
                prefix="You are running in a proxy configuration, but the " +
                "CFG_WEBSTYLE_REVERSE_PROXY_IPS variable does not contain " +
                "the IP of your proxy, thus the remote IP addresses of your " +
                "clients are not trusted. Please configure this variable.",
                alert_admin=True)
            return "10.0.0.11"
    return request.remote_addr
def determineDataType(data):
    """Guess the type of ``data``.

    For dicts, return the value of the 'type' key when present, else
    'dict'.  For anything else, try to detect the MIME type with the
    ``magic`` library; on any failure the problem is logged and "" is
    returned.

    @param data: object or buffer to classify
    @return: type name or MIME type string ("" when undetectable)
    """
    # If data is a dictionary and contains type key,
    # we can directly derive the data_type
    if isinstance(data, dict):
        if 'type' in data:
            data_type = data['type']
        else:
            data_type = 'dict'
    else:
        # If data is not a dictionary, we try to guess MIME type
        # by using magic library
        try:
            from magic import Magic
            mime_checker = Magic(mime=True)
            data_type = mime_checker.from_buffer(data)  # noqa
        # BUGFIX: was a bare "except:"; keep the best-effort behaviour
        # but do not trap SystemExit/KeyboardInterrupt.
        except Exception:
            register_exception(
                stream="warning",
                prefix="BibWorkflowObject.determineDataType:"
                " Impossible to resolve data type."
            )
            data_type = ""
    return data_type
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG): """Returns either biliographic phrases or words indexes.""" ## is p enclosed in quotes? (coming from exact search) if p.startswith('"') and p.endswith('"'): p = p[1:-1] ## okay, "real browse" follows: ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test if not f and p.find(":") > 0: # does 'p' contain ':'? f, p = p.split(":", 1) coll_hitset = intbitset() for coll_name in colls: coll_hitset |= get_collection_reclist(coll_name) index_id = get_index_id_from_field(f) if index_id != 0: browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection( p, index_id, rg / 2, rg / 2, coll_hitset) else: browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1) while not browsed_phrases: # try again and again with shorter and shorter pattern: try: p = p[:-1] browsed_phrases = get_nearest_terms_in_bibxxx( p, f, (rg + 1) / 2 + 1, (rg - 1) / 2 + 1) except: register_exception(req=req, alert_admin=True) # probably there are no hits at all: return [] ## try to check hits in these particular collection selection: browsed_phrases_in_colls = [] if 0: for phrase in browsed_phrases: phrase_hitset = intbitset() phrase_hitsets = search_pattern("", phrase, f, 'e') for coll in colls: phrase_hitset.union_update(phrase_hitsets[coll]) if len(phrase_hitset) > 0: # okay, this phrase has some hits in colls, so add it: browsed_phrases_in_colls.append( [phrase, len(phrase_hitset)]) ## were there hits in collections? if browsed_phrases_in_colls == []: if browsed_phrases != []: #write_warning(req, """<p>No match close to <em>%s</em> found in given collections. #Please try different term.<p>Displaying matches in any collection...""" % p_orig) ## try to get nbhits for these phrases in any collection: for phrase in browsed_phrases: nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset) if nbhits > 0: browsed_phrases_in_colls.append([phrase, nbhits]) return browsed_phrases_in_colls
def _create_icon(file_path, icon_size, format='gif', verbosity=9):
    """ Creates icon of given file.

    Returns path to the icon. If creation fails, return None, and
    register exception (send email to admin).

    Parameters:

     - file_path : *str* full path to icon

     - icon_size : *int* the scaling information to be used for the
                   creation of the new icon.

     - format : *str* image format of the created icon

     - verbosity : *int* the verbosity level under which the program
                   is to run;
    """
    icon_path = None
    try:
        filename = os.path.splitext(os.path.basename(file_path))[0]
        (icon_dir, icon_name) = create_icon({
            'input-file': file_path,
            'icon-name': "icon-%s" % filename,
            'multipage-icon': False,
            'multipage-icon-delay': 0,
            'icon-scale': icon_size,
            'icon-file-format': format,
            'verbosity': verbosity})
        icon_path = icon_dir + os.sep + icon_name
    except InvenioWebSubmitIconCreatorError as e:
        # Best-effort: log the failure without alerting the admin.
        register_exception(prefix='Icon for file %s could not be created: %s' %
                           (file_path, str(e)),
                           alert_admin=False)
    return icon_path
def summary(self, req, form):
    """Render a summary of the values entered so far for a submission.

    @param req: request object
    @param form: submitted form with doctype/act/access/indir
    @return: rendered summary page, or a warning page on bad parameters
    """
    args = wash_urlargd(form, {'doctype': (str, ''),
                               'act': (str, ''),
                               'access': (str, ''),
                               'indir': (str, '')})
    ln = args['ln']
    uid = getUid(req)
    if uid == -1 or CFG_ACCESS_CONTROL_LEVEL_SITE >= 1:
        return page_not_authorized(req, "../summary", navmenuid='submit')

    curdir = os.path.join(CFG_WEBSUBMIT_STORAGEDIR,
                          args['indir'],
                          args['doctype'],
                          args['access'])
    try:
        # Guard against path traversal via indir/doctype/access.
        assert (curdir == os.path.abspath(curdir))
    except AssertionError:
        register_exception(req=req, alert_admin=True,
                           prefix='Possible cracking tentative: indir="%s", doctype="%s", access="%s"' % (args['indir'], args['doctype'], args['access']))
        return warning_page("Invalid parameters", req, ln)

    subname = "%s%s" % (args['act'], args['doctype'])

    res = run_sql("select sdesc,fidesc,pagenb,level from sbmFIELD where subname=%s "
                  "order by pagenb,fieldnb", (subname, ))
    values = []
    for arr in res:
        if arr[0] != "":
            val = {'mandatory': (arr[3] == 'M'),
                   'value': '',
                   'page': arr[2],
                   'name': arr[0]}
            field_path = os.path.join(curdir, arr[1])
            # BUGFIX: was os.path.join(curdir, curdir, arr[1]) — the
            # doubled component only worked because join() discards the
            # first part when the second is absolute.
            if os.path.exists(field_path):
                # Context manager replaces the unguarded open/read/close.
                with open(field_path, "r") as fd:
                    value = fd.read()
                value = value.replace("\n", " ")
                value = value.replace("Select:", "")
            else:
                value = ""
            val['value'] = value
            values.append(val)
    return websubmit_templates.tmpl_submit_summary(ln=args['ln'],
                                                   values=values)
def display(self, req, form):
    """Display search history page. A misnomer."""
    argd = wash_urlargd(form, {'p': (str, "n")
                               })
    uid = getUid(req)

    # load the right language
    _ = gettext_set_language(argd['ln'])

    if CFG_ACCESS_CONTROL_LEVEL_SITE >= 1:
        return page_not_authorized(req, "%s/youralerts/display" % \
                                         (CFG_SITE_SECURE_URL,),
                                   navmenuid="youralerts")
    elif uid == -1 or isGuestUser(uid):
        return redirect_to_url(req, "%s/youraccount/login%s" % (
            CFG_SITE_SECURE_URL,
            make_canonical_urlargd({
                'referer': "%s/youralerts/display%s" % (
                    CFG_SITE_SECURE_URL,
                    make_canonical_urlargd(argd, {})),
                "ln": argd['ln']}, {})))

    user_info = collect_user_info(req)
    if not user_info['precached_usealerts']:
        return page_not_authorized(req, "../", \
                                   text=_("You are not authorized to use alerts."))

    if argd['p'] == 'y':
        _title = _("Popular Searches")
    else:
        _title = _("Your Searches")

    # register event in webstat
    if user_info['email']:
        user_str = "%s (%d)" % (user_info['email'], user_info['uid'])
    else:
        user_str = ""
    try:
        register_customevent("alerts", ["display", "", user_str])
    except Exception:
        # Was a bare ``except:`` which would also swallow
        # SystemExit/KeyboardInterrupt; event registration stays
        # best-effort, only real errors are reported.
        register_exception(suffix="Do the webstat tables exists? Try with 'webstatadmin --load-config'")

    return page(title=_title,
                body=webalert.perform_display(argd['p'], uid, ln=argd['ln']),
                navtrail="""<a class="navtrail" href="%(sitesecureurl)s/youraccount/display?ln=%(ln)s">%(account)s</a>""" % {
                    'sitesecureurl': CFG_SITE_SECURE_URL,
                    'ln': argd['ln'],
                    'account': _("Your Account"),
                },
                description=_("%(sitename)s Personalize, Display searches",
                              sitename=CFG_SITE_NAME_INTL.get(argd['ln'],
                                                              CFG_SITE_NAME)),
                keywords=_("%(sitename)s, personalize",
                           sitename=CFG_SITE_NAME_INTL.get(argd['ln'],
                                                           CFG_SITE_NAME)),
                uid=uid,
                language=argd['ln'],
                req=req,
                lastupdated=__lastupdated__,
                navmenuid='youralerts',
                secure_page_p=1)
def worker(self):
    """Load and return the configured workflow worker class.

    Imports ``invenio.modules.workflows.workers.<name>:<name>`` where
    ``<name>`` comes from the ``CFG_BIBWORKFLOW_WORKER`` configuration.
    On import failure the exception is registered (alerting the admin)
    and None is returned implicitly.
    """
    try:
        return import_string('invenio.modules.workflows.workers.%s:%s' % (
            cfg['CFG_BIBWORKFLOW_WORKER'],
            cfg['CFG_BIBWORKFLOW_WORKER']))
    except Exception:
        # Was a bare ``except:``, which would also trap
        # SystemExit/KeyboardInterrupt.
        from invenio.ext.logging import register_exception
        ## Let's report about broken plugins
        register_exception(alert_admin=True)
def modify_oai_set(oai_set_id, oai_set_name, oai_set_spec,
                   oai_set_collection, oai_set_description,
                   oai_set_p1, oai_set_f1, oai_set_m1,
                   oai_set_p2, oai_set_f2, oai_set_m2,
                   oai_set_p3, oai_set_f3, oai_set_m3,
                   oai_set_op1, oai_set_op2):
    """Modifies a row's parameters.

    Falls back to CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC when no setSpec
    is given.  Returns (1, "") on success, (0, error message) on
    database failure (which is also registered for the admin).
    """
    try:
        if not oai_set_spec:
            oai_set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        # Serialize the set definition as "key=value;" pairs, in the
        # exact order the rest of the repository code expects.
        definition_fields = (
            ('c', oai_set_collection),
            ('p1', oai_set_p1), ('f1', oai_set_f1), ('m1', oai_set_m1),
            ('op1', oai_set_op1),
            ('p2', oai_set_p2), ('f2', oai_set_f2), ('m2', oai_set_m2),
            ('op2', oai_set_op2),
            ('p3', oai_set_p3), ('f3', oai_set_f3), ('m3', oai_set_m3),
        )
        set_definition = ''.join('%s=%s;' % pair for pair in definition_fields)
        run_sql("""UPDATE oaiREPOSITORY SET
                            setName=%s,
                            setSpec=%s,
                            setCollection=%s,
                            setDescription=%s,
                            setDefinition=%s,
                            p1=%s,
                            f1=%s,
                            m1=%s,
                            p2=%s,
                            f2=%s,
                            m2=%s,
                            p3=%s,
                            f3=%s,
                            m3=%s
                         WHERE id=%s""",
                (oai_set_name, oai_set_spec, oai_set_collection,
                 oai_set_description, set_definition,
                 oai_set_p1, oai_set_f1, oai_set_m1,
                 oai_set_p2, oai_set_f2, oai_set_m2,
                 oai_set_p3, oai_set_f3, oai_set_m3,
                 oai_set_id))
        return (1, "")
    except StandardError as e:
        register_exception(alert_admin=True)
        return (0, str(e))
def browse_pattern_phrases(req, colls, p, f, rg, ln=CFG_SITE_LANG):
    """Returns either bibliographic phrases or words indexes.

    :param req: request object (used only for exception reporting)
    :param colls: collection names restricting the hit counts
    :param p: browse pattern (surrounding quotes from exact search are stripped)
    :param f: field to browse; if empty and ``p`` contains ':', it is split out
    :param rg: number of terms to return around the pattern
    :param ln: language code
    :return: list of [phrase, nbhits] pairs (possibly empty)
    """
    ## is p enclosed in quotes? (coming from exact search)
    if p.startswith('"') and p.endswith('"'):
        p = p[1:-1]

    ## okay, "real browse" follows:
    ## FIXME: the maths in the get_nearest_terms_in_bibxxx is just a test
    if not f and p.find(":") > 0:  # does 'p' contain ':'?
        f, p = p.split(":", 1)

    coll_hitset = intbitset()
    for coll_name in colls:
        coll_hitset |= get_collection_reclist(coll_name)

    # BUGFIX: browsed_phrases was previously unbound on the idxphrase
    # branch, raising NameError below whenever that branch produced no
    # phrases within the selected collections.
    browsed_phrases = []
    index_id = get_index_id_from_field(f)
    if index_id != 0:
        browsed_phrases_in_colls = get_nearest_terms_in_idxphrase_with_collection(
            p, index_id, rg // 2, rg // 2, coll_hitset)
    else:
        # ``//`` keeps the integer arithmetic explicit (identical to the
        # old ``/`` under Python 2 for int arguments).
        browsed_phrases = get_nearest_terms_in_bibxxx(p, f, (rg + 1) // 2 + 1,
                                                      (rg - 1) // 2 + 1)
        while not browsed_phrases:
            # try again and again with shorter and shorter pattern:
            try:
                p = p[:-1]
                browsed_phrases = get_nearest_terms_in_bibxxx(p, f,
                                                              (rg + 1) // 2 + 1,
                                                              (rg - 1) // 2 + 1)
            except Exception:
                # Was a bare ``except:``; keep the best-effort behaviour
                # but do not swallow SystemExit/KeyboardInterrupt.
                register_exception(req=req, alert_admin=True)
                # probably there are no hits at all:
                return []
        ## try to check hits in these particular collection selection:
        browsed_phrases_in_colls = []

    ## were there hits in collections?
    if browsed_phrases_in_colls == []:
        if browsed_phrases != []:
            ## try to get nbhits for these phrases in any collection:
            for phrase in browsed_phrases:
                nbhits = get_nbhits_in_bibxxx(phrase, f, coll_hitset)
                if nbhits > 0:
                    browsed_phrases_in_colls.append([phrase, nbhits])
    return browsed_phrases_in_colls
def delete_oai_set(oai_set_id):
    """Delete the OAI set with the given id.

    :param oai_set_id: id of the oaiREPOSITORY row to delete
    :return: (1, "") on success, (0, error message) on database failure
        (the failure is also registered for the admin)
    """
    try:
        run_sql("DELETE FROM oaiREPOSITORY WHERE id=%s", (oai_set_id,))
        return (1, "")
    except StandardError as e:
        register_exception(alert_admin=True)
        # Return the message, not the exception object, for consistency
        # with modify_oai_set()'s (0, str(e)) contract.
        return (0, str(e))
def ticket_submit(self, uid=None, subject="", recordid=-1, text="",
                  queue="", priority="", owner="", requestor=""):
    """Create a ticket by email. Return ticket_id on success, otherwise None."""
    if not EMAIL_SUBMIT_CONFIGURED:
        register_exception(
            stream='warning',
            subject='bibcatalog email not configured',
            prefix=
            "please configure bibcatalog email sending in CFG_BIBCATALOG_SYSTEM and CFG_BIBCATALOG_SYSTEM_EMAIL_ADDRESS"
        )
    ticket_id = self._get_ticket_id()

    # Build the optional header lines (empty string when not supplied).
    subjectset = 'ticket #%s - %s' % (ticket_id, subject) if subject else ""
    priorityset = " priority: %s\n" % priority if priority else ""
    queueset = " queue: %s\n" % queue if queue else ""
    requestorset = " requestor: %s\n" % requestor if requestor else ""
    recidset = " cf-recordID: %s\n" % recordid

    ownerset = ""
    if owner:
        # Translate the Invenio nickname into the bibcatalog username,
        # when the user has one configured in their preferences.
        from invenio.modules.accounts.models import User
        user = User.query.filter_by(nickname=owner).first()
        if user:
            ownerprefs = invenio.legacy.webuser.get_user_preferences(
                user.id)
            if "bibcatalog_username" in ownerprefs:
                owner = ownerprefs["bibcatalog_username"]
        ownerset = " owner: %s\n" % owner

    textset = ''.join([ownerset, requestorset, recidset, queueset,
                       priorityset, '\n', text, '\n'])

    sent_ok = send_email(fromaddr=FROM_ADDRESS,
                         toaddr=TO_ADDRESS,
                         subject=subjectset,
                         content=textset)
    if sent_ok:
        return ticket_id
    return None
def __init__(self, recID, ln=CFG_SITE_LANG, search_pattern=None,
             xml_record=None, user_info=None, output_format=''):
    """
    Creates a new bibformat object, with given record.

    You can either specify an record ID to format, or give its xml
    representation. if 'xml_record' is not None, use 'xml_record'
    instead of recID for the record.

    'user_info' allows to grant access to some functionalities on a
    page depending on the user's priviledges. It is a dictionary in
    the following form::

        user_info = {
            'remote_ip' : '',
            'remote_host' : '',
            'referer' : '',
            'uri' : '',
            'agent' : '',
            'uid' : -1,
            'nickname' : '',
            'email' : '',
            'group' : [],
            'guest' : '1'
            }

    :param recID: the id of a record
    :param ln: the language in which the record has to be formatted
    :param search_pattern: list of string representing the request used by
        the user in web interface
    :param xml_record: a xml string of the record to format
    :param user_info: the information of the user who will view the
        formatted page
    :param output_format: the output_format used for formatting this record
    """
    self.xml_record = None  # *Must* remain empty if recid is given
    if xml_record is not None:
        # If record is given as parameter: parse it and take the record
        # id from its 001 controlfield (None when the field is absent).
        self.xml_record = xml_record
        self.record = create_record(xml_record)[0]
        recID = record_get_field_value(self.record, "001") or None
        # 001 comes back as a string; normalize it to int, keep None as-is.
        recID = int(recID) if recID is not None else recID
    try:
        assert isinstance(recID, (int, long, type(None))), 'Argument of wrong type!'
    except AssertionError:
        # Report the bad type to the admin, then attempt a best-effort
        # coercion; a non-numeric recID will raise ValueError here.
        register_exception(prefix="recid needs to be an integer in BibFormatObject",
                           alert_admin=True)
        recID = int(recID)
    self.recID = recID
    self.lang = wash_language(ln)
    if search_pattern is None:
        search_pattern = []
    self.search_pattern = search_pattern
    self.output_format = output_format
    self.user_info = user_info
    if self.user_info is None:
        # Fall back to an anonymous/guest user profile.
        from invenio.ext.login.legacy_user import UserInfo
        self.user_info = UserInfo(None)
def log(msg):
    """Logs the given message in the alert engine log.

    Appends a timestamped line to <CFG_LOGDIR>/alertengine.log; any
    failure is registered instead of propagating to the caller.
    """
    try:
        # ``with`` guarantees the handle is closed even if a write
        # fails (previously the handle leaked on write errors).
        with open(CFG_LOGDIR + '/alertengine.log', 'a') as logfile:
            logfile.write(strftime('%Y%m%d%H%M%S#'))
            logfile.write(msg + '\n')
    except Exception:
        register_exception()
def solr_commit():
    """Commit pending changes to the Solr connection, best-effort.

    Failures are registered (alerting the admin) rather than raised,
    since a later periodical task will commit again.
    """
    try:
        # Commits might cause an exception, most likely a
        # timeout while hitting a background merge
        # Changes will then be committed later by the
        # calling (periodical) task
        # Also, autocommits can be used in the solrconfig
        SOLR_CONNECTION.commit()
    except Exception:
        # Was a bare ``except:``, which would also swallow
        # SystemExit/KeyboardInterrupt.
        register_exception(alert_admin=True)
def oai_harvest_get(prefix, baseurl, harvestpath,
                    fro=None, until=None, setspecs=None,
                    user=None, password=None, cert_file=None,
                    key_file=None, method="POST", verb="ListRecords",
                    identifier=""):
    """
    Retrieve OAI records from given repository, with given arguments

    :param prefix: metadata prefix to request
    :param baseurl: base URL of the OAI repository
    :param harvestpath: path where harvested files are stored
    :param fro: lower bound of the datestamp range ('from' OAI argument)
    :param until: upper bound of the datestamp range
    :param setspecs: space-separated OAI set specs to harvest
    :param user: HTTP auth user
    :param password: HTTP auth password
    :param cert_file: SSL client certificate
    :param key_file: SSL client key
    :param method: HTTP method to use
    :param verb: OAI-PMH verb
    :param identifier: single record identifier (GetRecord-style requests)
    """
    try:
        parsed_url = urlparse.urlparse(baseurl)
        addressing_scheme = parsed_url[0]
        network_location = parsed_url[1]
        path = parsed_url[2]
        secure = (addressing_scheme == "https")

        http_param_dict = {'verb': verb,
                           'metadataPrefix': prefix}
        # Only forward the optional OAI arguments that were supplied.
        for param_name, param_value in (('identifier', identifier),
                                        ('from', fro),
                                        ('until', until)):
            if param_value:
                http_param_dict[param_name] = param_value

        sets = None
        if setspecs:
            sets = [oai_set.strip() for oai_set in setspecs.split(' ')]

        return getter.harvest(network_location, path, http_param_dict,
                              method, harvestpath, sets, secure, user,
                              password, cert_file, key_file)
    except (StandardError, getter.InvenioOAIRequestError) as exce:
        register_exception()
        raise Exception("An error occurred while harvesting from %s: %s\n" %
                        (baseurl, str(exce)))
def resolve_doi(req, doi, ln=CFG_SITE_LANG, verbose=0):
    """
    Redirect to given DOI, or display error page when DOI cannot be
    resolved.
    """
    _ = gettext_set_language(ln)
    # Fetch user ID:
    try:
        uid = getUid(req)
    except Error:
        register_exception(req=req, alert_admin=True)
        return page(title=_("Internal Error"),
                    body=create_error_box(req, verbose=verbose, ln=ln),
                    description="%s - Internal Error" % CFG_SITE_NAME,
                    keywords="%s, Internal Error" % CFG_SITE_NAME,
                    language=ln,
                    req=req,
                    navmenuid='search')
    # Resolve DOI
    recids = perform_request_search(p='doi:"%s"' % doi, of="id", verbose=verbose)
    # BUGFIX: the inner comprehension previously reused ``doi`` as its
    # loop variable; Python 2 list comprehensions leak their loop
    # variable into the enclosing scope, so the outer ``doi`` was
    # clobbered before the membership test ran.
    recids = [recid for recid in recids
              if doi.lower() in
              [rec_doi.lower()
               for rec_doi in get_record(recid).get('doi', '') if rec_doi]]

    # Answer
    if len(recids) == 1:
        # Found unique matching record
        return redirect_to_url(req, CFG_SITE_URL + '/' + CFG_SITE_RECORD + '/' + str(recids[0]))
    elif len(recids) == 0:
        # No corresponding record found
        page_body = '<p>' + (_("Sorry, DOI %(x_doi)s could not be resolved.",
                               x_doi=('<strong>' + str(doi) + '</strong>'))) + '</p>'
        if req.header_only:
            # raise-as-call form is valid on both Python 2 and 3
            # (``raise X, y`` is Python-2-only syntax).
            raise apache.SERVER_RETURN(apache.HTTP_NOT_FOUND)
        return page(title=_('DOI "%(x_doi)s" Not Found', x_doi=cgi.escape(doi)),
                    body=page_body,
                    description=(CFG_SITE_NAME + ' - ' + _("Not found") + ': ' + cgi.escape(str(doi))),
                    keywords="%s" % CFG_SITE_NAME,
                    uid=uid,
                    language=ln,
                    req=req,
                    navmenuid='search')
    else:
        # Found multiple matching records
        try:
            raise Exception('DOI "%s" matched multiple records (%s) -- Please check' %
                            (doi, ', '.join([str(recid) for recid in recids])))
        except Exception:
            # Registered only; we still render a disambiguation page.
            register_exception(req=req, alert_admin=True)
        page_body = websearch_templates.tmpl_multiple_dois_found_page(doi, recids, ln)
        return page(title=_('Found multiple records matching DOI %(x_doi)s', x_doi=cgi.escape(doi)),
                    body=page_body,
                    description=(CFG_SITE_NAME + ' - ' + _("Found multiple records matching DOI") + ': ' + cgi.escape(str(doi))),
                    keywords="%s" % CFG_SITE_NAME,
                    uid=uid,
                    language=ln,
                    req=req,
                    navmenuid='search')
def perform_request_delete_group(uid, grpID, confirmed=0, ln=CFG_SITE_LANG):
    """First display confirm message(confirmed=0).
    then(confirmed=1) delete group and all its members
    @param uid: user ID
    @param grpID: ID of the group
    @param confirmed: =1 if confirmed message has been previously displayed
    @param ln: language
    @return: body with warnings
    """
    warnings = []
    infos = []
    _ = gettext_set_language(ln)

    group_infos = db.get_group_infos(grpID)
    user_status = db.get_user_status(uid, grpID)

    # Guard: the group may have been removed in the meantime.
    if not group_infos:
        try:
            raise InvenioWebSessionWarning(_('The group has already been deleted.'))
        except InvenioWebSessionWarning as exc:
            register_exception(stream='warning')
            warnings.append(exc.message)
        return perform_request_groups_display(uid,
                                              infos=infos,
                                              warnings=warnings,
                                              ln=CFG_SITE_LANG)

    # Guard: no membership row means something is wrong in the database.
    if not len(user_status):
        try:
            raise InvenioWebSessionError(_('Sorry, there was an error with the database.'))
        except InvenioWebSessionError as exc:
            register_exception()
            return websession_templates.tmpl_error(exc.message, ln)

    if not confirmed:
        # First pass: ask the user to confirm the deletion.
        return websession_templates.tmpl_confirm_delete(grpID, ln)

    # Confirmed: notify members, then delete the group and memberships.
    group_infos = db.get_group_infos(grpID)
    group_name = group_infos[0][1]
    msg_subjet, msg_body = websession_templates.tmpl_delete_msg(
        group_name=group_name, ln=ln)
    (body, dummy, dummy) = perform_request_send(uid,
                                                msg_to_user="",
                                                msg_to_group=group_name,
                                                msg_subject=msg_subjet,
                                                msg_body=msg_body,
                                                ln=ln)
    db.delete_group_and_members(grpID)
    infos.append(CFG_WEBSESSION_INFO_MESSAGES["GROUP_DELETED"])
    return perform_request_groups_display(uid,
                                          infos=infos,
                                          warnings=warnings,
                                          ln=CFG_SITE_LANG)
def get_uid_from_email(email):
    """Return the uid corresponding to an email.

    Return -1 when the email does not exist, or when the database is
    unreachable (the error is registered in that case).
    """
    try:
        rows = run_sql("SELECT id FROM user WHERE email=%s", (email, ))
    except OperationalError:
        register_exception()
        return -1
    if not rows:
        return -1
    return rows[0][0]
def __call__(self, environ, start_response):
    """WSGI entry point wrapping legacy request dispatching.

    Dispatches the request inside a Flask request context; any uncaught
    exception is registered (alerting the admin) and converted into the
    application's standard error response.
    """
    with self.app.request_context(environ):
        # Make start_response reachable through flask.g for downstream
        # legacy code.
        g.start_response = start_response
        try:
            response = self.app.full_dispatch_request()
        except Exception as error:
            from invenio.ext.logging import register_exception
            register_exception(req=request, alert_admin=True)
            response = self.app.handle_exception(error)
        return response(environ, start_response)
def extract_text(self, version=None, perform_ocr=False, ln='en'):
    """
    Try what is necessary to extract the textual information of a
    document.

    @param version: the version of the document for which text is
        required. If not specified the text will be retrieved from the
        last version.
    @type version: integer
    @param perform_ocr: whether to perform OCR.
    @type perform_ocr: bool
    @param ln: a two letter language code to give as a hint to the OCR
        procedure.
    @type ln: string
    @raise InvenioBibDocFileError: in case of error.
    @note: the text is extracted and cached for later use. Use L{get_text}
        to retrieve it.
    """
    from invenio.legacy.websubmit.file_converter import get_best_format_to_extract_text_from, convert_file, InvenioWebSubmitFileConverterError
    if version is None:
        version = self.get_latest_version()
    docfiles = self.list_version_files(version)
    ## We try to extract text only from original or OCRed documents.
    filenames = [
        docfile.get_full_path() for docfile in docfiles
        if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags
    ]
    try:
        filename = get_best_format_to_extract_text_from(filenames)
    except InvenioWebSubmitFileConverterError:
        ## We fall back on considering all the documents
        filenames = [docfile.get_full_path() for docfile in docfiles]
        try:
            filename = get_best_format_to_extract_text_from(filenames)
        except InvenioWebSubmitFileConverterError:
            # Nothing extractable at all: cache an empty text file.
            # ``with`` closes the handle promptly (the original
            # open(...).write('') leaked it).
            with open(os.path.join(self.basedir, '.text;%i' % version), 'w') as text_file:
                text_file.write('')
            return
    try:
        convert_file(filename,
                     os.path.join(self.basedir, '.text;%i' % version),
                     '.txt',
                     perform_ocr=perform_ocr,
                     ln=ln)
        if version == self.get_latest_version():
            run_sql(
                "UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s",
                (self.id, ))
    except InvenioWebSubmitFileConverterError as e:
        register_exception(
            alert_admin=True,
            prefix="Error in extracting text from bibdoc %i, version %i" %
            (self.id, version))
        # raise-as-call form is valid on both Python 2 and 3
        # (``raise X, y`` is Python-2-only syntax).
        raise InvenioBibDocFileError(str(e))