def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    # The test only runs on a development site, with localhost flagged OK
    # and curl flagged as available.
    if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK):
        return
    if not CFG_HAS_CURL:
        return
    curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
    # Use a context manager so the handle is flushed and closed before
    # curl reads the file.
    with open(curl_input_file, "w") as curl_input:
        curl_input.write(self.marcxml)
    try:
        # Element [1] of the returned tuple is the command's stdout.
        result = run_shell_command(
            '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
            [curl_input_file,
             self.nonce_url,
             make_user_agent_string('BatchUploader')])[1]
        self.failUnless("[INFO]" in result)
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                          [str(current_task)])
        with open(self.callback_result_path) as callback_result:
            results = json.loads(callback_result.read())
        # The failure message now describes the check actually performed.
        self.failUnless('results' in results,
                        '"%s" did not contained "results" key' % results)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        first_result = results['results'][0]
        self.failUnless(first_result['success'])
        self.failUnless(first_result['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
            first_result['marcxml'])
    finally:
        # Always remove the temporary upload file.
        os.remove(curl_input_file)
def get_plots(tarball):
    """Return a list of found and converted plots given a tarball.

    The tarball is unpacked into the scratch directory given by
    get_defaults(), its images are converted, and for every TeX file found
    the captions and contexts of the images are extracted.

    :param tarball: path to the tarball to process
    :return: list with the extracted image data (empty if no TeX file)
    """
    scratch_dir, dummy = get_defaults(tarball, cfg["CFG_TMPDIR"], "")
    dummy, images, tex_sources = untar(tarball, scratch_dir)
    converted_images = convert_images(images)

    plots = []
    if tex_sources is None or tex_sources == []:
        # Its not a tarball: drop the scratch directory again.
        run_shell_command("rm -r %s", (scratch_dir,))
        return plots

    for tex_source in tex_sources:
        # Extract images, captions and labels
        raw_data = extract_captions(tex_source, scratch_dir,
                                    converted_images)
        if not raw_data:
            continue
        # Add proper filepaths and do various cleaning
        cleaned_data = prepare_image_data(raw_data, tex_source,
                                          converted_images)
        # Using prev. extracted info, get contexts for each image found
        plots.extend(extract_context(tex_source, cleaned_data))
    return plots
def get_plots(tarball):
    """Return a list of found and converted plots given a tarball.

    Steps: compute the scratch directory, untar, convert the images, then
    walk over the TeX files collecting caption and context data.

    :param tarball: path to the tarball to process
    :return: list with the extracted image data (empty if no TeX file)
    """
    work_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")
    dummy, found_images, found_tex = untar(tarball, work_dir)
    png_images = convert_images(found_images)

    collected = []
    nothing_to_parse = found_tex is None or found_tex == []
    if nothing_to_parse:
        # Its not a tarball: remove the scratch directory.
        run_shell_command('rm -r %s', (work_dir, ))
    else:
        for tex_path in found_tex:
            # Extract images, captions and labels
            partial = extract_captions(tex_path, work_dir, png_images)
            if partial:
                # Add proper filepaths and do various cleaning
                cleaned = prepare_image_data(partial, tex_path, png_images)
                # Using prev. extracted info, get contexts for each
                # image found
                collected.extend(extract_context(tex_path, cleaned))
    return collected
def generate_preview(f):
    """Generate PNG previews of PDF pages.

    The previews live in ``<instance_path>/previews/<recid>``. If that
    directory cannot be created (it is assumed to exist already, i.e. the
    previews were generated before) its path is returned immediately.

    NOTE(review): after generating the previews the function falls off the
    end and returns None, unlike the early-return path - confirm callers.
    """
    previews_root = os.path.join(current_app.instance_path, "previews")
    try:
        os.mkdir(previews_root)
    except OSError:
        # directory already exists as per docs
        pass

    record_dir = os.path.join(previews_root, str(f.get_recid()))
    try:
        os.mkdir(record_dir)
    except OSError:
        # directory already exists as per docs, preview exists
        return record_dir

    # Step 1: burst the PDF into pg_<n>.pdf files; the literal '%d' is
    # handed over as an argument of the command.
    burst_cmd = "pdftk %s burst output %s/pg_%s.pdf"
    run_shell_command(burst_cmd, args=(get_pdf_path(f), record_dir, '%d'))

    # Step 2: convert every PDF file of the directory into a PNG.
    convert_cmd = '%s -flatten -density 300 %s %s/`basename %s .pdf`.png'
    for entry in os.listdir(record_dir):
        if not entry.endswith(".pdf"):
            continue
        page_pdf = safe_join(record_dir, entry)
        convert_args = (str(cfg["CFG_PATH_CONVERT"]), page_pdf,
                        record_dir, page_pdf)
        run_shell_command(convert_cmd, args=convert_args)
def check_system(self, uid=None):
    """Return an error string if there are problems, an empty string if not.

    The checks run in order and the first failure is reported: RT
    credentials, CFG_BIBCATALOG_SYSTEM, the RT CLI binary, the RT URL and
    finally a test query sent through the RT CLI.

    Side effect: sets the RTUSER and RTSERVER environment variables.

    @param uid: user id whose RT credentials are fetched with
        get_bibcat_from_prefs(); if false, the default RT user is used.
    @return: description of the first problem found, "" when none
    @rtype: string
    """
    if uid:
        rtuid, rtpw = get_bibcat_from_prefs(uid)
    else:
        # Assume default RT user
        rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD

    if not rtuid and not rtpw:
        return "No valid RT user login specified"

    if not CFG_BIBCATALOG_SYSTEM == "RT":
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"

    # Check that you can execute the binary.. this is a safe call unless
    # someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    # The word 'help' is looked for in the standard error of the command.
    if "help" not in myerr:
        return (
            "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI "
            + CFG_BIBCATALOG_SYSTEM_RT_CLI
            + " help did not produce output 'help'"
        )

    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # Construct URL, split RT_URL at //
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith(("http://", "https://")):
        return "CFG_BIBCATALOG_SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    # Split only at the first '//' so that a later '//' in the path cannot
    # break the two-value unpacking.
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//", 1)
    # Assemble by http://user:password@RT_URL
    bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # set as env var
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = bibcatalog_rt_server

    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command(
        "echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\""
    )
    if myerr:
        return "could not connect to " + bibcatalog_rt_server + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = ("matching" in myout) or ("1" in myout)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    return ""
def check_system(self, uid=None):
    """Return an error string if there are problems, an empty string if not.

    Checked in this order, reporting the first failure: RT credentials,
    CFG_BIBCATALOG_SYSTEM, the RT CLI binary, the RT URL, and a test query
    sent through the RT CLI.

    Side effect: sets the RTUSER and RTSERVER environment variables.

    @param uid: user id whose RT credentials are fetched with
        get_bibcat_from_prefs(); if false, the default RT user is used.
    @return: description of the first problem found, '' when none
    @rtype: string
    """
    if uid:
        rtuid, rtpw = get_bibcat_from_prefs(uid)
    else:
        # Assume default RT user
        rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD

    if not rtuid and not rtpw:
        return "No valid RT user login specified"

    if not CFG_BIBCATALOG_SYSTEM == 'RT':
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"

    # Check that you can execute the binary.. this is a safe call unless
    # someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    # The word 'help' is looked for in the standard error of the command.
    if "help" not in myerr:
        return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"

    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # Construct URL, split RT_URL at //
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith(('http://', 'https://')):
        return "CFG_BIBCATALOG_SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    # maxsplit=1: a second '//' further down the URL must not break the
    # two-value unpacking.
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//", 1)
    # Assemble by http://user:password@RT_URL
    bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # set as env var
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = bibcatalog_rt_server

    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command(
        "echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
    if myerr:
        return "could not connect to " + bibcatalog_rt_server + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = ('matching' in myout) or ('1' in myout)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    return ""
def test_simple_insert(self):
    """batchuploader - robotupload simple insert"""
    from invenio.legacy.search_engine import get_record
    # Send the prepared request and check the textual answer.
    response_body = urllib2.urlopen(self.req).read()
    self.failUnless("[INFO]" in response_body)
    # Run bibupload on the most recent task id.
    task_id = get_last_taskid()
    bibupload_cmd = "%s/bibupload %%s" % CFG_BINDIR
    run_shell_command(bibupload_cmd, [str(task_id)])
    # The highest record id must differ from the one stored in last_recid.
    newest_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
    self.failIfEqual(self.last_recid, newest_recid)
    # Check the subfields of the first 245 field of the new record.
    title_subfields = get_record(newest_recid)['245'][0][0]
    self.assertEqual(title_subfields, [('a', 'The title')])
def test_insert_with_callback(self):
    """batchuploader - robotupload insert with callback"""
    result = urllib2.urlopen(self.req_callback).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained [INFO]' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
    # Read the JSON stored at callback_result_path; the context manager
    # closes the handle even if reading fails.
    with open(self.callback_result_path) as callback_result:
        results = json.loads(callback_result.read())
    self.failUnless('results' in results)
    self.assertEqual(len(results['results']), 1)
    first_result = results['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def test_simple_insert(self):
    """batchuploader - robotupload simple insert"""
    # Nothing is checked unless localhost is flagged as OK.
    if not CFG_LOCALHOST_OK:
        return
    from invenio.legacy.search_engine import get_record
    response_body = urllib2.urlopen(self.req).read()
    self.failUnless("[INFO]" in response_body)
    # Run bibupload on the most recent task id.
    task_id = get_last_taskid()
    bibupload_cmd = "%s/bibupload %%s" % cfg['CFG_BINDIR']
    run_shell_command(bibupload_cmd, [str(task_id)])
    # The highest record id must differ from the one stored in last_recid.
    newest_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
    self.failIfEqual(self.last_recid, newest_recid)
    # Check the subfields of the first 245 field of the new record.
    title_subfields = get_record(newest_recid)['245'][0][0]
    self.assertEqual(title_subfields, [('a', 'The title')])
def clean_up(extracted_files_list, image_list):
    """
    Delete the intermediate files that are no longer needed.

    Every entry of extracted_files_list is removed with 'rm' unless it is
    listed in image_list or its path ends with the path separator (i.e. it
    denotes a directory).

    @param: extracted_files_list ([string, string, ...]): list of all
        extracted files
    @param: image_list ([string, string, ...]): list of the images to keep
    """
    for candidate in extracted_files_list:
        if candidate in image_list:
            # an image we want to keep
            continue
        if candidate[-1] == os.sep:
            # a directory entry
            continue
        run_shell_command('rm %s', (candidate,))
def test_insert_with_callback(self):
    """batchuploader - robotupload insert with callback"""
    # Only runs on a development site with localhost flagged OK.
    if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK):
        return
    result = urllib2.urlopen(self.req_callback).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained [INFO]' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                      [str(current_task)])
    # Read the JSON stored at callback_result_path; the context manager
    # closes the handle even if reading fails.
    with open(self.callback_result_path) as callback_result:
        results = json.loads(callback_result.read())
    self.failUnless('results' in results)
    self.assertEqual(len(results['results']), 1)
    first_result = results['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def test_get_defaults(self):
    """plotextractor - get defaults"""
    from invenio.utils.shell import run_shell_command
    from invenio.utils.plotextractor.cli import get_defaults
    sdir_should_be = os.path.join(cfg['CFG_TMPDIR'],
                                  self.arXiv_id + '_plots')
    refno_should_be = "15"  # Note: For ATLANTIS DEMO site
    sdir, refno = get_defaults(tarball=self.tarball, sdir=None,
                               refno_url=cfg['CFG_SITE_URL'])
    if sdir is not None:
        # Hand the path over as an argument so that run_shell_command
        # escapes it, instead of interpolating it into the command line.
        run_shell_command("rm -rf %s", (sdir,))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == refno_should_be,
                    "didn't get correct default reference number")
def test_insert_with_nonce(self):
    """batchuploader - robotupload insert with nonce"""
    # Only runs on a development site with localhost flagged OK.
    if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK):
        return
    result = urllib2.urlopen(self.req_nonce).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained "[INFO]"' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                      [str(current_task)])
    # Read the JSON stored at callback_result_path; the context manager
    # closes the handle even if reading fails.
    with open(self.callback_result_path) as callback_result:
        results = json.loads(callback_result.read())
    self.failUnless('results' in results,
                    '"%s" did not contained "results" key' % results)
    self.assertEqual(len(results['results']), 1)
    self.assertEqual(results['nonce'], "1234")
    first_result = results['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def test_insert_with_oracle(self):
    """batchuploader - robotupload insert with oracle special treatment"""
    import os
    # Start from a clean error log.
    if os.path.exists('/opt/invenio/var/log/invenio.err'):
        os.remove('/opt/invenio/var/log/invenio.err')
    result = urllib2.urlopen(self.req_oracle).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained "[INFO]"' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
    # Read the JSON stored at callback_result_path; the context manager
    # closes the handle even if reading fails.
    with open(self.callback_result_path) as callback_result:
        results = json.loads(callback_result.read())
    self.failUnless('results' in results,
                    '"%s" did not contained "results" key' % results)
    self.assertEqual(len(results['results']), 1)
    first_result = results['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def filter_step(obj, eng):
    """Run an external python script.

    The script path is read from ``f_filter-file`` in the ``arguments`` of
    the ``repository`` entry of ``obj.extra_data``. The record is exported
    as MARC, written to a file named after ``obj.id`` in a directory named
    after ``eng.uuid``, and the script is called with that file as its
    only argument. The outcome is written to ``obj.log``.

    :param obj: workflow object carrying the record and the log
    :param eng: workflow engine; only its ``uuid`` is used
    """
    from invenio_records.api import Record
    from invenio.utils.shell import run_shell_command

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})
    script_name = arguments.get("f_filter-file")
    if not script_name:
        obj.log.error("No script file found!")
        return

    marcxml_value = Record(obj.data.dumps()).legacy_export_as_marc()
    extract_path = os.path.join(
        cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
        str(eng.uuid))
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)

    # Write the MARC export to the file handed over to the script; the
    # context manager guarantees the data is flushed before the script runs.
    marcxmlfile = os.path.join(extract_path, str(obj.id))
    with open(marcxmlfile, 'w') as file_fd:
        file_fd.write(marcxml_value)

    # run_shell_command escapes its args itself; wrapping the placeholder
    # in additional quotes would cancel that escaping for the file path.
    exitcode, cmd_stdout, cmd_stderr = run_shell_command(
        cmd="%s %s", args=(str(script_name), str(marcxmlfile)))
    if exitcode != 0 or cmd_stderr != "":
        obj.log.error(
            "Error while running filtering script on %s\nError:%s"
            % (marcxmlfile, cmd_stderr))
    else:
        obj.log.info(cmd_stdout)
def test_run_cmd_hello_quote(self):
    """shellutils - running simple command with an argument with quote"""
    # The single quote inside the first argument must reach echo verbatim.
    echo_args = ("hel'lo", "world")
    expected = (0, "hel'lo world\n", '')
    actual = run_shell_command("echo %s %s", echo_args)
    self.assertEqual(expected, actual)
def test_insert_with_oracle(self):
    """batchuploader - robotupload insert with oracle special treatment"""
    # Only runs on a development site with localhost flagged OK.
    if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK):
        return
    import os
    # Start from a clean error log.
    if os.path.exists('/opt/invenio/var/log/invenio.err'):
        os.remove('/opt/invenio/var/log/invenio.err')
    result = urllib2.urlopen(self.req_oracle).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained "[INFO]"' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                      [str(current_task)])
    # Read the JSON stored at callback_result_path; the context manager
    # closes the handle even if reading fails.
    with open(self.callback_result_path) as callback_result:
        results = json.loads(callback_result.read())
    self.failUnless('results' in results,
                    '"%s" did not contained "results" key' % results)
    self.assertEqual(len(results['results']), 1)
    first_result = results['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def _run_rt_command(self, command, uid=None):
    """
    Run a RT CLI command as the given user.

    If no user is specified the default RT user
    (CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER) is used. The password is piped
    into the command on standard input.

    Side effect: sets the RTUSER and RTSERVER environment variables.

    @param command: RT CLI command to execute
    @type command: string
    @param uid: the Invenio user id to submit on behalf of. Optional.
    @type uid: int
    @return: standard output from the command given. None, if
        CFG_BIBCATALOG_SYSTEM_RT_URL is not configured.
    @rtype: string
    @raise ValueError: if the command exits with a code greater than 0
    """
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return None
    if uid:
        username, passwd = get_bibcat_from_prefs(uid)
    else:
        username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    # Split only at the first '//' so that a later '//' in the path cannot
    # break the two-value unpacking.
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//", 1)
    # Assemble as http://user:password@RT_URL
    bibcatalog_rt_server = httppart + "//" + username + ":" + passwd + "@" + siteandpath
    # set as env var
    os.environ["RTUSER"] = username
    os.environ["RTSERVER"] = bibcatalog_rt_server
    # Only the password is escaped here; 'command' is used as given.
    passwd = escape_shell_arg(passwd)
    error_code, myout, error_output = run_shell_command("echo " + passwd + " | " + command)
    if error_code > 0:
        raise ValueError('Problem running "%s": %d - %s' % (command, error_code, error_output))
    return myout
def test_get_defaults(self):
    """plotextractor - get defaults"""
    from invenio.utils.shell import run_shell_command
    from invenio.utils.plotextractor.cli import get_defaults
    sdir_should_be = os.path.join(cfg['CFG_TMPDIR'],
                                  self.arXiv_id + '_plots')
    refno_should_be = "15"  # Note: For ATLANTIS DEMO site
    sdir, refno = get_defaults(tarball=self.tarball, sdir=None,
                               refno_url=cfg['CFG_SITE_URL'])
    if sdir is not None:
        # Pass the path as an argument: run_shell_command escapes it,
        # which plain string interpolation does not.
        run_shell_command("rm -rf %s", (sdir, ))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == refno_should_be,
                    "didn't get correct default reference number")
def get_list_of_all_matching_files(basedir, filetypes):
    """
    Crawl through the directory tree rooted at basedir and find all the
    files therein whose 'file' output contains one of the filetypes.

    Each matching file is listed once, even when several filetypes match
    its 'file' output.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained
        in the output of running 'file' on the types of files we're
        looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """
    file_paths = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path, ))
            # any() stops at the first hit, so a file matching several
            # filetypes is not appended more than once.
            if any(filetype in cmd_out for filetype in filetypes):
                file_paths.append(full_path)
    return file_paths
def find_matching_files(basedir, filetypes):
    """Try to find all files matching given filetypes.

    By looking at all the files and filenames in the given directory,
    including subdirectories. A file matches a filetype when the
    lower-cased output of the 'file' command contains it, or when the
    lower-cased filename extension equals it. Each matching file is listed
    once, even if several filetypes match.

    :param basedir: full path to base directory to search in
    :type basedir: string

    :param filetypes: list of filetypes, extensions
    :type filetypes: list

    :return: full paths of the matching files
    :rtype: list
    """
    files_list = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path, ))
            # Computed once per file instead of once per filetype.
            file_description = cmd_out.lower()
            extension = filename.split('.')[-1].lower()
            if any(filetype in file_description or extension == filetype
                   for filetype in filetypes):
                files_list.append(full_path)
    return files_list
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.

    The output of mysqldump is piped through gzip. StandardError is raised
    if the mysqldump binary is missing, or if running the command yields a
    non-zero exit code, any standard output or any standard error.
    """
    output_path = dirname + os.sep + filename
    write_message("... writing %s" % output_path)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        msg = "ERROR: cannot find %s." % cmd
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)

    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s | %s -c " % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME),
            CFG_PATH_GZIP)
    exit_code, cmd_stdout, cmd_stderr = run_shell_command(cmd, None,
                                                          output_path)
    # Each message reports the value that triggered it (the stdout and
    # stderr messages used to print the exit code instead).
    if exit_code:
        msg = "ERROR: mysqldump exit code is %s." % repr(exit_code)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stdout:
        msg = "ERROR: mysqldump stdout is %s." % repr(cmd_stdout)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stderr:
        msg = "ERROR: mysqldump stderr is %s." % repr(cmd_stderr)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
def find_matching_files(basedir, filetypes):
    """Try to find all files matching given filetypes.

    By looking at all the files and filenames in the given directory,
    including subdirectories. A file matches a filetype when the
    lower-cased output of the 'file' command contains it, or when the
    lower-cased filename extension equals it. Each matching file is listed
    once, even if several filetypes match.

    :param basedir: full path to base directory to search in
    :type basedir: string

    :param filetypes: list of filetypes, extensions
    :type filetypes: list

    :return: full paths of the matching files
    :rtype: list
    """
    files_list = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path,)
            )
            # Computed once per file instead of once per filetype.
            file_description = cmd_out.lower()
            extension = filename.split('.')[-1].lower()
            if any(filetype in file_description or extension == filetype
                   for filetype in filetypes):
                files_list.append(full_path)
    return files_list
def get_list_of_all_matching_files(basedir, filetypes):
    """
    Crawl through the directory tree rooted at basedir and find all the
    files therein whose 'file' output contains one of the filetypes.

    Each matching file is listed once, even when several filetypes match
    its 'file' output.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained
        in the output of running 'file' on the types of files we're
        looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """
    file_paths = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command('file %s',
                                                        (full_path,))
            # any() stops at the first hit, so a file matching several
            # filetypes is not appended more than once.
            if any(filetype in cmd_out for filetype in filetypes):
                file_paths.append(full_path)
    return file_paths
def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Use a context manager so the handle is flushed and closed before
    # curl reads the file.
    with open(curl_input_file, "w") as curl_input:
        curl_input.write(self.marcxml)
    try:
        # Element [1] of the returned tuple is the command's stdout.
        result = run_shell_command(
            '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
            [curl_input_file,
             self.nonce_url,
             make_user_agent_string('BatchUploader')])[1]
        self.failUnless("[INFO]" in result)
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                          [str(current_task)])
        with open(self.callback_result_path) as callback_result:
            results = json.loads(callback_result.read())
        # The failure message now describes the check actually performed.
        self.failUnless('results' in results,
                        '"%s" did not contained "results" key' % results)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        first_result = results['results'][0]
        self.failUnless(first_result['success'])
        self.failUnless(first_result['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
            first_result['marcxml'])
    finally:
        # Always remove the temporary upload file.
        os.remove(curl_input_file)
def _list_quota(cls, directory): if not os.path.isdir(directory): return None (ret, output, err) = run_shell_command( "fs listquota {}".format(directory)) if ret == 0: return cls._parse_output(output) return None
def _list_quota(cls, directory): if not os.path.isdir(directory): return None (ret, output, err) = run_shell_command("fs listquota {}".format(directory)) if ret == 0: return cls._parse_output(output) return None
def maxpage(f):
    """Returns the number of pages for PDF records via AJAX

    The count is the second space-separated field of the NumberOfPages
    line of the pdftk 'dump_data' output. A JSON response with maxpage=-1
    is returned when there is no PDF or when the command fails or writes
    to stderr.
    """
    pdf_path = get_pdf_path(f)
    if pdf_path is None:
        return jsonify(maxpage=-1)

    page_count_cmd = '%s %s dump_data output | grep NumberOfPages'
    cmd_args = (str(cfg['CFG_PATH_PDFTK']), pdf_path)
    exit_status, output_std, output_err = run_shell_command(page_count_cmd,
                                                            args=cmd_args)
    succeeded = int(exit_status) == 0 and len(output_err) == 0
    if not succeeded:
        return jsonify(maxpage=-1)
    fields = output_std.strip().split(' ')
    return jsonify(maxpage=int(fields[1]))
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Use a context manager so the handle is flushed and closed before
    # curl reads the file.
    with open(curl_input_file, "w") as curl_input:
        curl_input.write(self.marcxml)
    try:
        ## curl -F 'file=@<marcxml file>' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file,
             self.callback_url,
             self.legacy_url,
             make_user_agent_string('BatchUploader')])
        self.failUnless("[INFO]" in result,
                        '[INFO] not find in results: %s, %s' % (result, err))
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                          [str(current_task)])
        with open(self.callback_result_path) as callback_result:
            results = json.loads(callback_result.read())
        # The failure message now describes the check actually performed.
        self.failUnless('results' in results,
                        '"%s" did not contained "results" key' % results)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        first_result = results['results'][0]
        self.failUnless(first_result['success'])
        self.failUnless(first_result['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
            first_result['marcxml'])
    finally:
        # Always remove the temporary upload file.
        os.remove(curl_input_file)
def maxpage(f):
    """Return number of pages for PDF records via AJAX.

    The count is parsed from the NumberOfPages line of the pdftk
    'dump_data' output; maxpage=-1 is reported when there is no PDF or
    when the command fails or writes to stderr.
    """
    failure = -1
    pdf_file = get_pdf_path(f)
    if pdf_file is not None:
        pdftk_args = (str(cfg["CFG_PATH_PDFTK"]), pdf_file)
        status, stdout_text, stderr_text = run_shell_command(
            "%s %s dump_data output | grep NumberOfPages", args=pdftk_args)
        if int(status) == 0 and len(stderr_text) == 0:
            # the page count is the second space-separated field
            number_field = stdout_text.strip().split(" ")[1]
            return jsonify(maxpage=int(number_field))
    return jsonify(maxpage=failure)
def test_run_cmd_timeout_no_zombie(self):
    """shellutils - running simple command no zombie"""
    # The script is started with a timeout of 5 and must raise Timeout.
    script_cmd = (self.script_path, '15', "THISISATEST")
    self.assertRaises(Timeout, run_process_with_timeout, script_cmd,
                      timeout=5)
    # Afterwards neither the marker argument nor a 'sleep 15' may show up
    # in the process listing.
    process_listing = run_shell_command('ps aux')[1]
    marker_alive = 'THISISATEST' in process_listing
    self.failIf(marker_alive,
                '"THISISATEST" was found in %s' % process_listing)
    sleep_alive = 'sleep 15' in process_listing
    self.failIf(sleep_alive,
                '"sleep 15" was found in %s' % process_listing)
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    # The test only runs on a development site, with localhost flagged OK
    # and curl flagged as available.
    if not (cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK):
        return
    if not CFG_HAS_CURL:
        return
    curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
    # Use a context manager so the handle is flushed and closed before
    # curl reads the file.
    with open(curl_input_file, "w") as curl_input:
        curl_input.write(self.marcxml)
    try:
        ## curl -F 'file=@<marcxml file>' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file,
             self.callback_url,
             self.legacy_url,
             make_user_agent_string('BatchUploader')])
        self.failUnless("[INFO]" in result,
                        '[INFO] not find in results: %s, %s' % (result, err))
        current_task = get_last_taskid()
        run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                          [str(current_task)])
        with open(self.callback_result_path) as callback_result:
            results = json.loads(callback_result.read())
        # The failure message now describes the check actually performed.
        self.failUnless('results' in results,
                        '"%s" did not contained "results" key' % results)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        first_result = results['results'][0]
        self.failUnless(first_result['success'])
        self.failUnless(first_result['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
            first_result['marcxml'])
    finally:
        # Always remove the temporary upload file.
        os.remove(curl_input_file)
def load_submission(doctype, dump, method=None):
    """Insert submission into database. Return tuple(error code, msg)

    The dump is written to a temporary file in CFG_TMPDIR, a backup of the
    current submission is saved next to it, the current submission is
    removed unless the dump already carries DELETE statements, and the
    dump is finally loaded with dbexec.

    @param doctype: submission doctype; also used as prefix of the
        temporary file names
    @param dump: the SQL dump to load, as a string
    @param method: dump method; guessed from the dump when not given,
        falling back to CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
    @return: (0, messages) on success, (1, messages) if dbexec failed
    """
    # NOT TESTED
    messages = []

    def guess_dump_method(dump):
        """Guess which method was used to dump this file (i.e. if it
        contains all the submission rows or not)"""
        match_obj = re_method_pattern.search(dump)
        if match_obj:
            return match_obj.group('method')
        else:
            return None

    def guess_dump_has_delete_statements(dump):
        """Guess if given submission dump already contain delete
        statements"""
        return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower()

    if not method:
        method = guess_dump_method(dump)
        if method is None:
            method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
            messages.append(
                "WARNING: method could not be guessed. Using method %s" %
                method)
        else:
            messages.append("Used method %s to load data" % method)

    (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR)
    # Write through the descriptor returned by mkstemp: opening the path a
    # second time would leave that descriptor open for good.
    with os.fdopen(dump_code, 'w') as dump_fd:
        dump_fd.write(dump)

    # We need to remove the submission. But let's create a backup first.
    submission_backup = dump_submission(doctype, method)
    submission_backup_path = "%s_db_dump%s.sql" % (
        doctype, time.strftime("%Y%m%d_%H%M%S"))
    with open(os.path.join(CFG_TMPDIR, submission_backup_path), "w") as fd:
        fd.write(submission_backup)

    if not guess_dump_has_delete_statements(dump):
        remove_submission(doctype, method)

    # Load the dump
    (exit_code, out_msg, err_msg) = run_shell_command(
        "%s/bin/dbexec < %s", (CFG_PREFIX, os.path.abspath(dump_path)))
    if exit_code:
        messages.append("ERROR: failed to load submission:" + err_msg)
        return (1, messages)

    messages.append("Submission loaded. Previous submission saved to %s" %
                    os.path.join(CFG_TMPDIR, submission_backup_path))
    return (0, messages)
def get_text_snippets(textfile_path, patterns, nb_chars, max_snippets):
    """
    Extract text snippets around 'patterns' from the file found at
    'textfile_path'. The snippets are meant to look similar to results of
    popular Internet search engines: using "..." between snippets.
    For empty patterns it returns ""

    @param textfile_path: path of the text file to search in
    @param patterns: list of patterns to look for
    @param nb_chars: size hint handed over to cut_out_snippet()
    @param max_snippets: maximum number of snippets in the result
    @return: the highlighted snippets joined with "..."
    """
    # TODO: - distinguish the beginning of sentences and make the snippets
    #         start there
    #       - optimize finding patterns - first search for patterns
    #         appearing next to each other, secondly look for each pattern
    #         not for first occurrences of any pattern
    if not patterns:
        return ""

    # rule of thumb in order to catch nb_chars; floor division keeps the
    # value an integer, as grep's -C option requires
    max_lines = nb_chars // 40 + 2

    # Produce the big snippets from which the real snippets will be cut out
    cmd = "grep -i -C%s -m%s"
    cmdargs = [str(max_lines), str(max_snippets)]
    for pattern in patterns:
        cmd += " -e %s"
        cmdargs.append(" " + pattern)
    cmd += " %s"
    cmdargs.append(textfile_path)
    (dummy1, output, dummy2) = run_shell_command(cmd, cmdargs)
    # a fact to keep in mind with this call to grep is that if patterns
    # appear in two contiguous lines, they will not be separated by '--'
    # and therefore treated as one 'big snippet'
    big_snippets = output.split("--")

    # cut the snippets to match the nb_chars parameter
    result = [cut_out_snippet(big_snippet, patterns, nb_chars)
              for big_snippet in big_snippets]

    # combine snippets
    out = ""
    count = 0
    for snippet in result:
        if count >= max_snippets:
            break
        if not snippet:
            continue
        if out:
            out += "..."
        out += highlight(snippet, patterns, whole_word_matches=True)
        # Count the snippets actually used so that max_snippets is
        # enforced (the counter used to stay at 0).
        count += 1
    return out
def _grep_version_from_executable(path_to_exec, version_regexp):
    """Try to detect a program version.

    Run 'strings' on the binary PATH_TO_EXEC and grep its output for
    VERSION_REGEXP. Among the matching lines the longest one wins (the
    first one in case of a tie).

    Return program version as a string. Return empty string if not
    succeeded.
    """
    from invenio.utils.shell import run_shell_command

    if not os.path.exists(path_to_exec):
        return ""
    dummy1, grep_output, dummy2 = run_shell_command(
        "strings %s | grep %s", (path_to_exec, version_regexp))
    if not grep_output:
        return ""
    # the longest the better; the leading "" keeps the result empty when
    # no line is longer than the empty string
    candidates = [""] + grep_output.split("\n")
    return max(candidates, key=len)
def load_submission(doctype, dump, method=None):
    """Insert submission into database. Return tuple(error code, msg)

    The dump is written to a temporary file in CFG_TMPDIR, a backup of the
    current submission is saved next to it, the current submission is
    removed unless the dump already carries DELETE statements, and the
    dump is finally loaded with dbexec.

    @param doctype: submission doctype; also used as prefix of the
        temporary file names
    @param dump: the SQL dump to load, as a string
    @param method: dump method; guessed from the dump when not given,
        falling back to CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
    @return: (0, messages) on success, (1, messages) if dbexec failed
    """
    # NOT TESTED
    messages = []

    def guess_dump_method(dump):
        """Guess which method was used to dump this file (i.e. if it
        contains all the submission rows or not)"""
        match_obj = re_method_pattern.search(dump)
        if match_obj:
            return match_obj.group('method')
        else:
            return None

    def guess_dump_has_delete_statements(dump):
        """Guess if given submission dump already contain delete
        statements"""
        return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower()

    if not method:
        method = guess_dump_method(dump)
        if method is None:
            method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
            messages.append("WARNING: method could not be guessed. Using method %s" % method)
        else:
            messages.append("Used method %s to load data" % method)

    (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR)
    # mkstemp hands back an already open descriptor; wrap it instead of
    # opening the path again, otherwise the descriptor is never closed.
    with os.fdopen(dump_code, 'w') as dump_fd:
        dump_fd.write(dump)

    # We need to remove the submission. But let's create a backup first.
    submission_backup = dump_submission(doctype, method)
    submission_backup_path = "%s_db_dump%s.sql" % (doctype, time.strftime("%Y%m%d_%H%M%S"))
    with open(os.path.join(CFG_TMPDIR, submission_backup_path), "w") as fd:
        fd.write(submission_backup)

    if not guess_dump_has_delete_statements(dump):
        remove_submission(doctype, method)

    # Load the dump
    (exit_code, out_msg, err_msg) = run_shell_command(
        "%s/bin/dbexec < %s", (CFG_PREFIX, os.path.abspath(dump_path)))
    if exit_code:
        messages.append("ERROR: failed to load submission:" + err_msg)
        return (1, messages)

    messages.append("Submission loaded. Previous submission saved to %s" %
                    os.path.join(CFG_TMPDIR, submission_backup_path))
    return (0, messages)
def version(separator='\n', formatting='{version} [{executable}]'):
    """Try to detect the installed Apache version(s).

    Locates ``httpd``/``apache`` executables with ``locate`` and greps
    inside each binary for a line starting with ``Apache/``.

    @param separator: string used to join the formatted entries; pass None
        to get the entries back as a list instead
    @param formatting: ``str.format`` template receiving the ``version``
        and ``executable`` keywords for every hit
    @return: the formatted entries joined with ``separator`` (an empty
        string when nothing was found), or the list of entries when
        ``separator`` is None
    """
    from invenio.utils.shell import run_shell_command
    out = []
    dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache")
    for apache in cmd_out.split("\n"):
        # Raw string: "\/" is not a valid escape in a normal literal and
        # triggers a warning on recent interpreters. The value is unchanged.
        apache_version = _grep_version_from_executable(apache, r'^Apache\/')
        if apache_version:
            out.append(
                formatting.format(version=apache_version, executable=apache))
    if separator is None:
        return out
    return separator.join(out)
def version(separator='\n', formatting='{version} [{executable}]'):
    """Report the Apache version found in every located executable.

    ``locate bin/httpd bin/apache`` provides the candidate paths; each one
    is grepped for a line beginning with ``Apache/``. Candidates that yield
    no version are ignored.

    @param separator: glue placed between the formatted entries, or None
        to receive the entries as a list
    @param formatting: ``str.format`` template; it is given the keyword
        arguments ``version`` and ``executable``
    @return: a single joined string (empty if nothing matched), or a list
        of formatted entries when ``separator`` is None
    """
    from invenio.utils.shell import run_shell_command

    dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache")
    out = []
    for apache in cmd_out.split("\n"):
        # The pattern is a raw string so that the backslash reaches grep
        # untouched without relying on an invalid Python escape sequence.
        apache_version = _grep_version_from_executable(apache, r'^Apache\/')
        if not apache_version:
            continue
        out.append(formatting.format(version=apache_version,
                                     executable=apache))
    return out if separator is None else separator.join(out)
def convert_images(image_list):
    """
    Work out the type of every image extracted from the tarball and convert
    the ones that are not already PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): the PNG files: originals that
        were already PNG plus the successfully converted ones
    """
    png_marker = 'PNG image'
    png_files = []

    for candidate in image_list:
        # directories are not images
        if os.path.isdir(candidate):
            continue

        # FIXME: here and everywhere else in the plot extractor library the
        # run shell command statements should be (1) called with timeout in
        # order to prevent runaway imagemagick conversions; (2) the
        # arguments should be passed properly so that they are escaped.
        _rc, file_report, _err = run_shell_command('file %s', (candidate,))
        if png_marker in file_report:
            png_files.append(candidate)
            continue

        # Anything else is handed to ImageMagick, on the assumption that it
        # copes with every type we meet (EPS, JPG, PS and PSTEX for sure).
        target = get_converted_image_name(candidate)
        try:
            _rc, _out, convert_err = run_process_with_timeout(
                ['convert', candidate, target])
            # A produced output file counts as success even if convert
            # wrote something to stderr.
            if convert_err == '' or os.path.exists(target):
                png_files.append(target)
            else:
                write_message('convert failed on ' + candidate)
        except Timeout:
            write_message('convert timed out on ' + candidate)

    return png_files
def convert_images(image_list):
    """
    Determine the type of each image extracted from the tarball and turn
    every non-PNG one into a PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): the image files that ended up in
        PNG format (already-PNG inputs and successful conversions)
    """
    png_output_contains = 'PNG image'
    result = []

    # Directories are never images, so they are dropped up front.
    regular_files = (path for path in image_list if not os.path.isdir(path))

    for source_image in regular_files:
        # FIXME: here and everywhere else in the plot extractor library the
        # run shell command statements should be (1) called with timeout in
        # order to prevent runaway imagemagick conversions; (2) the
        # arguments should be passed properly so that they are escaped.
        dummy1, type_description, dummy2 = run_shell_command(
            'file %s', (source_image, ))

        if png_output_contains in type_description:
            result.append(source_image)
        else:
            # We assume ImageMagick converts whatever we are faced with;
            # EPS->PNG, JPG->PNG, PS->PNG and PSTEX->PNG are known to work.
            png_name = get_converted_image_name(source_image)
            convert_cmd = ['convert', source_image, png_name]
            try:
                dummy1, dummy2, stderr_text = run_process_with_timeout(
                    convert_cmd)
                # Here any stderr output is treated as a failed conversion.
                if stderr_text != '':
                    write_message('convert failed on ' + source_image)
                else:
                    result.append(png_name)
            except Timeout:
                write_message('convert timed out on ' + source_image)

    return result
def read_metadata_local(inputfile, verbose):
    """
    Extract metadata from a file by parsing the output of ``pdftk
    <inputfile> dump_data``.

    ``InfoKey``/``InfoValue`` line pairs become one dictionary entry each;
    every other line containing a colon is stored as ``name: value``.
    Lines without a colon are ignored.

    @param inputfile: path to the file to inspect
    @type inputfile: string
    @param verbose: verbosity (not used by this function)
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    cmd = CFG_PATH_PDFTK + ' %s dump_data'
    (exit_status, output_std, output_err) = \
        run_shell_command(cmd, args=(inputfile,))

    metadata_dict = {}
    key = None
    for metadata_line in output_std.splitlines():
        stripped_line = metadata_line.strip()
        if stripped_line.startswith("InfoKey"):
            # partition() yields an empty key when the colon is missing,
            # rather than failing with IndexError on a malformed line.
            key = metadata_line.partition(':')[2].strip()
        elif stripped_line.startswith("InfoValue"):
            value = metadata_line.partition(':')[2].strip()
            # FIXME: interpret "ModDate"/"CreationDate" values as dates?
            if key:
                metadata_dict[key] = value
                # A key is consumed by the first value that follows it.
                key = None
        else:
            try:
                custom_key, custom_value = metadata_line.split(':', 1)
            except ValueError:
                # No colon: most probably not a relevant line
                continue
            metadata_dict[custom_key.strip()] = custom_value.strip()
    return metadata_dict
def check_for_gzip(tfile):
    """
    Was that tarball also gzipped? Let's find out!

    When ``file`` reports gzip data, the content is gunzipped into a file
    named ``tmp.tar`` located in the same directory as ``tfile``.

    @param: file (string): the name of the object (so we can gunzip, if
        that's necessary)

    @output: a gunzipped file in the directory of choice, if that's
        necessary

    @return new_file (string): The name of the file after gunzipping or the
        original name of the file if that wasn't necessary (or if
        gunzipping failed)
    """
    gzip_contains = 'gzip compressed data'
    dummy1, cmd_out, dummy2 = run_shell_command('file %s', (tfile, ))

    if cmd_out.find(gzip_contains) == -1:
        return tfile

    # We have a gzip! gunzip only accepts names ending in .gz, so work on a
    # suitably named scratch copy.
    gz_copy = tfile + '.tar.gz'
    run_shell_command('cp %s %s', (tfile, gz_copy))
    new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
    run_shell_command('touch %s', (new_dest, ))
    dummy1, cmd_out, cmd_err = run_shell_command('gunzip -c %s',
                                                 (gz_copy, ))
    if cmd_err != '':
        write_message('Error while gunzipping ' + tfile)
        # The scratch copy is useless now; do not leave it behind.
        run_shell_command('rm %s', (gz_copy, ))
        return tfile

    # Named tar_out (not "tarfile") to avoid shadowing the stdlib module.
    with open(new_dest, 'w') as tar_out:
        tar_out.write(cmd_out)

    run_shell_command('rm %s', (gz_copy, ))
    return new_dest
def check_for_gzip(tfile):
    """
    Detect whether the given tarball is gzip-compressed and, if so,
    decompress it.

    The decompressed data is written to ``tmp.tar`` in the directory that
    contains ``tfile``.

    @param: file (string): the name of the object (so we can gunzip, if
        that's necessary)

    @output: a gunzipped file in the directory of choice, if that's
        necessary

    @return new_file (string): The name of the file after gunzipping, or
        the original name when the file was not gzipped or could not be
        gunzipped
    """
    gzip_contains = 'gzip compressed data'
    dummy1, cmd_out, dummy2 = run_shell_command('file %s', (tfile,))

    if cmd_out.find(gzip_contains) > -1:
        # gunzip insists on a .gz suffix, hence the renamed scratch copy.
        scratch_copy = tfile + '.tar.gz'
        run_shell_command('cp %s %s', (tfile, scratch_copy))
        try:
            new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
            run_shell_command('touch %s', (new_dest,))
            dummy1, cmd_out, cmd_err = run_shell_command('gunzip -c %s',
                                                         (scratch_copy,))
            if cmd_err != '':
                write_message('Error while gunzipping ' + tfile)
                return tfile

            # "with" closes the handle even if the write fails; the local
            # is not called "tarfile" so the stdlib module stays reachable.
            with open(new_dest, 'w') as unzipped:
                unzipped.write(cmd_out)
            return new_dest
        finally:
            # Remove the scratch copy on success AND on failure.
            run_shell_command('rm %s', (scratch_copy,))

    return tfile
def _find_and_run_js_test_files():
    """Find and run all the JavaScript test files.

    Every ``*_tests.js`` file in ``CFG_WEBDIR/js`` is run through
    JsTestDriver using the ``.conf`` file of the same base name.

    @return: number of test files whose output lacks "Fails: 0"
    """
    from invenio.utils.shell import run_shell_command

    js_dir = CFG_WEBDIR + "/js"
    driver_dir = CFG_PREFIX + "/lib/java/js-test-driver"
    failed_files = 0

    for entry in os.listdir(js_dir):
        stem, suffix = os.path.splitext(entry)
        is_test_file = suffix == '.js' and stem.endswith('_tests')
        if not is_test_file:
            continue

        print("Found test file %s. Running tests... " % (stem + suffix))
        command = "java -jar %s/JsTestDriver.jar --config %s --tests all" % \
            (driver_dir, js_dir + "/" + stem + '.conf')
        exitcode_, stdout, stderr_ = run_shell_command(cmd=command)
        print(stdout)
        # The driver output is the only success indicator looked at.
        if "Fails: 0" not in stdout:
            failed_files += 1

    print(failed_files)
    return failed_files
def _run_rt_command(self, command, uid=None):
    """
    Run an RT CLI command as the given user.

    If no user is specified the default RT user is used. The credentials
    are exported through the RTUSER and RTSERVER environment variables and
    the password is piped to the command on standard input.

    @param command: RT CLI command to execute
    @type command: string

    @param uid: the Invenio user id to submit on behalf of. Optional.
    @type uid: int

    @return: standard output from the command given; None when the RT URL
        or the credentials are not configured
    @rtype: string

    @raise ValueError: when the command exits with a positive status
    """
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return None
    if uid:
        username, passwd = get_bibcat_from_prefs(uid)
    else:
        username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    if username is None or passwd is None:
        # Missing credentials: give up as documented instead of failing
        # with a TypeError while concatenating the server URL below.
        return None

    # Split on the first "//" only, so a URL whose path happens to contain
    # another "//" still unpacks into exactly two parts.
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//", 1)
    bibcatalog_rt_server = httppart + "//" + username + ":" + passwd + "@" + siteandpath

    # set as env var
    os.environ["RTUSER"] = username
    os.environ["RTSERVER"] = bibcatalog_rt_server

    passwd = escape_shell_arg(passwd)
    error_code, myout, error_output = run_shell_command("echo " + passwd + " | " + command)
    if error_code > 0:
        raise ValueError('Problem running "%s": %d - %s' %
                         (command, error_code, error_output))
    return myout
def filter_step(obj, eng):
    """Run an external filter script on the record's MARCXML.

    The script path is read from the ``f_filter-file`` repository argument
    in ``obj.extra_data``. The record is exported as MARCXML into a file
    named after ``obj.id`` inside ``CFG_TMPSHAREDDIR/<engine uuid>``, and
    the script is called with that file as its only argument.

    The outcome is only logged on ``obj.log``; nothing is returned.
    """
    from invenio.modules.records.api import Record
    from invenio.utils.shell import run_shell_command

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})
    script_name = arguments.get("f_filter-file")
    if not script_name:
        obj.log.error("No script file found!")
        return

    marcxml_value = Record(obj.data.dumps()).legacy_export_as_marc()
    extract_path = os.path.join(
        cfg['CFG_TMPSHAREDDIR'],
        str(eng.uuid)
    )
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)

    # Dump the MARCXML where the filter script can read it
    marcxmlfile = extract_path + os.sep + str(obj.id)
    with open(marcxmlfile, 'w') as file_fd:
        file_fd.write(marcxml_value)

    # run_shell_command() shell-escapes every entry of ``args`` itself (the
    # same convention as the other call sites in this code base), so the
    # placeholders must stay bare: wrapping one in extra quotes would
    # cancel the escaping for paths with spaces or metacharacters.
    exitcode, cmd_stdout, cmd_stderr = run_shell_command(
        cmd="%s %s",
        args=(str(script_name), str(marcxmlfile)))
    if exitcode != 0 or cmd_stderr != "":
        obj.log.error(
            "Error while running filtering script on %s\nError:%s"
            % (marcxmlfile, cmd_stderr)
        )
    else:
        obj.log.info(cmd_stdout)
def _plot_extract(obj, eng):
    """
    Perform the plot extraction step for one harvested record.

    When 'latex' is among the configured extraction sources, the record's
    tarball is harvested (or reused from ``obj.extra_data["_result"]``),
    unpacked, its images converted, and captions/contexts extracted from
    the TeX files. The resulting FFT data is appended to
    ``obj.data['fft']`` and reported through ``obj.add_task_result``.

    @raise workflows_error.WorkflowError: when no tarball could be
        harvested for the record
    """
    # Download tarball for each harvested/converted record, then run
    # plotextractor. Update converted xml files with generated xml or add
    # it for upload.
    bibtask.task_sleep_now_if_required()

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository_arguments = obj.extra_data["_repository"]["arguments"]
    if 'p_extraction-source' not in repository_arguments:
        p_extraction_source = plotextractor_types
    else:
        p_extraction_source = repository_arguments['p_extraction-source']

    if not isinstance(p_extraction_source, list):
        p_extraction_source = [p_extraction_source]

    if 'latex' in p_extraction_source:
        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            # turn oaiharvest_23_1_20110214161632_converted -> oaiharvest_23_1_material
            # to let harvested material in same folder structure
            extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid)
            tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"])
            # Test for failure BEFORE the str() conversion: str(None) is
            # the string 'None', which would never compare equal to None.
            if tarball is None:
                raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" %
                                                        (obj.data["system_number_external"]["value"], extract_path)),
                                                    eng.uuid, id_object=obj.id)
            tarball = str(tarball)
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        sub_dir, refno = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        # Defaults used when extraction times out: an empty image list keeps
        # convert_images() and len() below from failing on None.
        tex_files = None
        image_list = []
        try:
            extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

        converted_image_list = convert_images(image_list)
        eng.log.info('converted %d of %d images found for %s' % (len(converted_image_list),
                                                                 len(image_list),
                                                                 os.path.basename(tarball)))
        extracted_image_data = []
        if tex_files == [] or tex_files is None:
            eng.log.error('%s is not a tarball' % (os.path.split(tarball)[-1],))
            run_shell_command('rm -r %s', (sub_dir,))
        else:
            for tex_file in tex_files:
                # Extract images, captions and labels
                partly_extracted_image_data = extract_captions(tex_file, sub_dir,
                                                               converted_image_list)
                if partly_extracted_image_data:
                    # Add proper filepaths and do various cleaning
                    cleaned_image_data = prepare_image_data(partly_extracted_image_data,
                                                            tex_file, converted_image_list)
                    # Using prev. extracted info, get contexts for each image found
                    extracted_image_data.extend((extract_context(tex_file, cleaned_image_data)))

        if extracted_image_data:
            extracted_image_data = remove_dups(extracted_image_data)
            create_contextfiles(extracted_image_data)
            marc_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
            marc_xml += create_MARC(extracted_image_data, tarball, None)
            marc_xml += "\n</collection>"

            if marc_xml:
                # We store the path to the directory the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = records_api.create_record(marc_xml, master_format="marc").dumps()
                try:
                    obj.data['fft'].append(new_dict_representation["fft"])
                except KeyError:
                    obj.data['fft'] = [new_dict_representation['fft']]

                obj.add_task_result("filesfft", new_dict_representation["fft"])
                obj.add_task_result("number_picture_converted", len(converted_image_list))
                obj.add_task_result("number_of_picture_total", len(image_list))
def write_metadata_local(inputfile, outputfile, metadata_dictionary, verbose):
    """
    Metadata write method: takes the .pdf as input and creates a new one
    with the new info, using ``pdftk ... update_info``.

    When ``metadata_dictionary`` is empty the function switches to an
    interactive mode and asks for tag/value pairs on standard input. In
    that mode it returns early, without producing any output file, when
    the user aborts, quits before entering anything, or the prompt fails.

    @param inputfile: path to the pdf
    @type inputfile: string
    @param outputfile: path to the resulting pdf
    @type outputfile: string
    @param verbose: verbosity
    @type verbose: int
    @param metadata_dictionary: metadata information to update inputfile
    @type metadata_dictionary: dict
    @raise InvenioWebSubmitFileMetadataRuntimeError: when the updated pdf
        was not produced or could not be moved to ``outputfile``
    """
    # Take the file name (0 base, 1 name, 2 ext)
    filename = decompose_file(inputfile)[1]

    # Print pdf metadata
    if verbose > 1:
        print('Metadata information in the PDF file ' + filename + ': \n')
        try:
            # NOTE(review): inputfile is concatenated into the shell command
            # without escaping here, unlike the run_shell_command call below.
            os.system(CFG_PATH_PDFTK + ' ' + inputfile + ' dump_data')
        except Exception:
            print('Problem with inputfile to PDFTK')

    # Info file for pdftk. The descriptor returned by mkstemp is closed at
    # once and the path is re-opened for writing just below.
    (fd, path_to_info) = tempfile.mkstemp(prefix="wsm_pdf_plugin_info_", \
                                          dir=CFG_TMPDIR)
    os.close(fd)

    file_in = open(path_to_info, 'w')
    if verbose > 5:
        print("Saving PDFTK info file to %s" % path_to_info)

    # User interaction to form the info file
    # Main Case: Dictionary received through option -d
    if not metadata_dictionary == {}:
        for tag in metadata_dictionary:
            line = 'InfoKey: ' + tag + '\nInfoValue: ' + \
                   metadata_dictionary[tag] + '\n'
            if verbose > 0:
                print(line)
            file_in.writelines(line)
    else:
        # Interactive mode.
        # NOTE(review): every "return" inside this branch leaves file_in
        # open and the temporary info file on disk.
        data_modified = False
        user_input = 'user_input'
        print("Entering interactive mode. Choose what you want to do:")
        while (user_input):
            # The prompt only offers "abort" once something has been entered
            if not data_modified:
                try:
                    user_input = raw_input('[w]rite / [q]uit\n')
                except:
                    print("Aborting")
                    return
            else:
                try:
                    user_input = raw_input(
                        '[w]rite / [q]uit and apply / [a]bort \n')
                except:
                    print("Aborting")
                    return
            if user_input == 'q':
                # Quitting with nothing entered means there is nothing to
                # apply; otherwise leave the loop and run pdftk.
                if not data_modified:
                    return
                break
            elif user_input == 'w':
                try:
                    tag = raw_input('Tag to update:\n')
                    value = raw_input('With value:\n')
                except:
                    print("Aborting")
                    return
                # Write to info file
                line = 'InfoKey: ' + tag + '\nInfoValue: ' + value + '\n'
                data_modified = True
                file_in.writelines(line)
            elif user_input == 'a':
                return
            else:
                print("Invalid option: ")
    file_in.close()

    # Temporary destination for the pdf produced by pdftk
    (fd, pdf_temp_path) = tempfile.mkstemp(prefix="wsm_pdf_plugin_pdf_", \
                                           dir=CFG_TMPDIR)
    os.close(fd)

    # Now we call pdftk tool to update the info on a pdf
    cmd_pdftk = '%s %s update_info %s output %s'
    (exit_status, output_std, output_err) = \
        run_shell_command(cmd_pdftk,
                          args=(CFG_PATH_PDFTK, inputfile, path_to_info,
                                pdf_temp_path))
    if verbose > 5:
        print(output_std, output_err)

    # NOTE(review): pdf_temp_path was just created by mkstemp, so this
    # existence test presumably passes even when pdftk failed; verify.
    if os.path.exists(pdf_temp_path):
        # Move to final destination if exist
        try:
            shutil.move(pdf_temp_path, outputfile)
        except Exception as err:
            raise InvenioWebSubmitFileMetadataRuntimeError("Could not move %s to %s" % \
                                                           (pdf_temp_path, outputfile))
    else:
        # Something bad happened
        raise InvenioWebSubmitFileMetadataRuntimeError(
            "Could not update metadata " + output_err)
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs. If they
    'contain' only the TeX file, then they are just that file.), then we
    untar it if so and decide which of its constituents are the TeX file
    and which are the images.

    @param: tarball (string): the name of the tar file from arXiv
    @param: dir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex): all extracted files,
        the ones that look like images and the ones that look like TeX
        sources. ``([], [], None)`` is returned when the file is not a tar
        archive or tar reported an error, and ``([], [], [])`` when the
        archive holds no TeX candidate.
    """
    tarball = check_for_gzip(original_tarball)
    dummy1, cmd_out, cmd_err = run_shell_command('file %s', (tarball, ))
    tarball_output = 'tar archive'
    if re.search(tarball_output, cmd_out) is None:
        run_shell_command('rm %s', (tarball, ))
        return ([], [], None)

    cmd_list = ['tar', 'xvf', tarball, '-C', sdir]
    try:
        dummy1, cmd_out, cmd_err = run_process_with_timeout(cmd_list)
    finally:
        # check_for_gzip() may have produced a gunzipped scratch copy;
        # remove it whether tar succeeded, failed or timed out.
        if original_tarball != tarball:
            run_shell_command('rm %s', (tarball, ))

    if cmd_err != '':
        return ([], [], None)

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'

    file_list = []
    image_list = []
    might_be_tex = []

    # "tar xvf" lists one extracted member per line
    for extracted_file in cmd_out.split('\n'):
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, file_report, dummy2 = run_shell_command('file %s',
                                                        (extracted_file, ))
        # Only matches AFTER the "name:" prefix printed by ``file`` count,
        # so that a keyword inside the file name itself is ignored.
        colon_position = file_report.find(':')

        # is it TeX?
        if file_report.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_report.lower().find(image_output_contains) > colon_position \
                or \
                file_report.lower().find(eps_output_contains) > colon_position \
                or \
                file_report.find(ps_output_contains) > colon_position:
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        else:
            if extracted_file.split('.')[-1].lower() == tex_file_extension:
                # we might have tex source!
                might_be_tex.append(extracted_file)
            elif extracted_file.split('.')[-1] in ['eps', 'png',
                                                   'ps', 'jpg', 'pdf']:
                # we might have an image!
                image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])
    return (file_list, image_list, might_be_tex)
def cli_upload(req, file_content=None, mode=None, callback_url=None, nonce=None, special_treatment=None):
    """
    Robot interface for uploading MARC files.

    Checks, in this order, the client IP, the user agent, the upload mode
    and the uploaded body; stores the body in a temporary file in
    CFG_TMPSHAREDDIR; checks that the client may submit to the targeted
    collection; validates the file with ``xmlmarclint``; and finally
    submits a ``bibupload`` task.

    @param req: request object; its content type and status are set here
    @param file_content: the upload, either an object with ``read()`` or a
        form field exposing ``filename`` and ``value``
    @param mode: bibupload mode; must be one of PERMITTED_MODES
        ('--insertorreplace' is accepted as an alias of '-ir')
    @param callback_url: optional, forwarded as ``--callback-url``
    @param nonce: optional, forwarded as ``--nonce``
    @param special_treatment: optional, forwarded as ``--special-treatment``
    @return: the result of ``_write(req, msg)``, where msg starts with
        "[ERROR]" or "[INFO]"
    """
    req.content_type = "text/plain"
    # check IP and useragent:
    if not _get_client_authorized_collections(_get_client_ip(req)):
        msg = "[ERROR] Sorry, client IP %s cannot use the service." % _get_client_ip(req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)
    if not _check_client_useragent(req):
        msg = "[ERROR] Sorry, the %s useragent cannot use the service." % _get_useragent(req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)

    arg_mode = mode
    if not arg_mode:
        msg = "[ERROR] Please specify upload mode to use."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)
    if arg_mode == '--insertorreplace':
        arg_mode = '-ir'
    if arg_mode not in PERMITTED_MODES:
        msg = "[ERROR] Invalid upload mode."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)

    arg_file = file_content
    if hasattr(arg_file, 'read'):
        ## We've been passed a readable file, e.g. req
        arg_file = arg_file.read()
        if not arg_file:
            msg = "[ERROR] Please provide a body to your request."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
    else:
        if not arg_file:
            msg = "[ERROR] Please specify file body to input."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
        if hasattr(arg_file, "filename"):
            arg_file = arg_file.value
        else:
            msg = "[ERROR] 'file' parameter must be a (single) file"
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)

    # write temporary file:
    (fd, filename) = tempfile.mkstemp(prefix="batchupload_" + \
                                      time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                                      dir=CFG_TMPSHAREDDIR)

    filedesc = os.fdopen(fd, 'w')
    filedesc.write(arg_file)
    filedesc.close()

    # check if this client can run this file:
    client_ip = _get_client_ip(req)
    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if '*' not in permitted_dbcollids:  # wildcard
        allow = _check_client_can_submit_file(client_ip, filename, req, 0)
        if not allow:
            msg = "[ERROR] Cannot submit such a file from this IP. (Wrong collection.)"
            _log(msg)
            req.status = HTTP_FORBIDDEN
            return _write(req, msg)

    # check validity of marcxml: only the exit status of xmlmarclint is
    # looked at. The paths go through ``args`` so that they are
    # shell-escaped instead of being pasted into the command line.
    xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
    xmlmarclint_exit_code, dummy1, dummy2 = run_shell_command(
        '%s %s', (xmlmarclint_path, filename))
    if xmlmarclint_exit_code != 0:
        msg = "[ERROR] MARCXML is not valid."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)

    args = ['bibupload', "batchupload", arg_mode, filename]
    # run upload command
    if callback_url:
        args += ["--callback-url", callback_url]
        # nonce and special treatment only travel with a callback URL
        if nonce:
            args += ["--nonce", nonce]
        if special_treatment:
            args += ["--special-treatment", special_treatment]
    task_low_level_submission(*args)
    msg = "[INFO] %s" % ' '.join(args)
    _log(msg)
    return _write(req, msg)
def dump_database(dump_path, host=CFG_DATABASE_HOST, port=CFG_DATABASE_PORT, \
                  user=CFG_DATABASE_USER, passw=CFG_DATABASE_PASS, \
                  name=CFG_DATABASE_NAME, params=None, compress=False, \
                  ignore_tables=None):
    """
    Dump Invenio database into SQL file located at DUMP_PATH.

    Will perform the command to mysqldump with the given host configuration
    and user credentials. The dump is first written to ``DUMP_PATH.part``
    and renamed to DUMP_PATH only when mysqldump exits successfully.

    Optional mysqldump parameters can also be passed. Otherwise, a default
    set of parameters will be used.

    @param dump_path: path on the filesystem to save the dump to.
    @type dump_path: string

    @param host: hostname of mysql database node to connect to.
    @type host: string

    @param port: port of mysql database node to connect to
    @type port: string

    @param user: username to connect with
    @type user: string

    @param passw: password to connect to with
    @type passw: string

    @param name: name of mysql database node to dump
    @type name: string

    @param params: command line parameters to pass to mysqldump. Optional.
    @type params: string

    @param compress: should the dump be compressed through gzip?
    @type compress: bool

    @param ignore_tables: list of tables to ignore in the dump
    @type ignore: list of string

    @raise StandardError: when mysqldump is not installed or exits with a
        non-zero status
    """
    write_message("... writing %s" % (dump_path,))

    partial_dump_path = dump_path + ".part"

    # Is mysqldump installed or in the right path?
    cmd_prefix = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd_prefix):
        raise StandardError("%s is not installed." % (cmd_prefix))

    if not params:
        # No parameters set, lets use the default ones.
        params = " --skip-opt --add-drop-table --add-locks --create-options" \
                 " --quick --extended-insert --set-charset --disable-keys" \
                 " --lock-tables=false --max_allowed_packet=2G "

    if ignore_tables:
        # --ignore-table expects db_name.tbl_name, so the tables must be
        # qualified with the database actually being dumped (``name``).
        # The leading space keeps the first option apart from
        # caller-supplied ``params`` that do not end in whitespace.
        params += " " + " ".join([escape_shell_arg("--ignore-table=%s.%s" % (name, table))
                                  for table in ignore_tables])

    # NOTE(review): the password is passed on the command line and the full
    # command is logged below at verbosity 2.
    dump_cmd = "%s %s " \
               " --host=%s --port=%s --user=%s --password=%s %s" % \
               (cmd_prefix, \
                params, \
                escape_shell_arg(host), \
                escape_shell_arg(str(port)), \
                escape_shell_arg(user), \
                escape_shell_arg(passw), \
                escape_shell_arg(name))

    if compress:
        # PIPESTATUS is a bash feature: run the pipeline through bash and
        # exit with mysqldump's status rather than gzip's.
        dump_cmd = "%s | %s -cf; exit ${PIPESTATUS[0]}" % \
                   (dump_cmd, \
                    CFG_PATH_GZIP)
        dump_cmd = "bash -c %s" % (escape_shell_arg(dump_cmd),)

    write_message(dump_cmd, verbose=2)

    exit_code, stdout, stderr = run_shell_command(dump_cmd, None, partial_dump_path)

    if exit_code:
        raise StandardError("ERROR: mysqldump exit code is %s. stderr: %s stdout: %s" % \
                            (repr(exit_code), \
                             repr(stderr), \
                             repr(stdout)))
    else:
        os.rename(partial_dump_path, dump_path)
        write_message("... completed writing %s" % (dump_path,))
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs. If they
    'contain' only the TeX file, then they are just that file.), then we
    untar it if so and decide which of its constituents are the TeX file
    and which are the images.

    @param: tarball (string): the name of the tar file from arXiv
    @param: dir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex): all extracted files,
        the ones that look like images and the ones that look like TeX
        sources. ``([], [], None)`` is returned when the file is not a tar
        archive, and ``([], [], [])`` when the archive holds no TeX
        candidate.
    """
    if not tarfile.is_tarfile(original_tarball):
        return ([], [], None)

    tarball = tarfile.open(original_tarball)
    try:
        # NOTE(review): extractall() writes member paths as they are stored
        # in the archive; since the tarballs come from an external source,
        # absolute or "../" member names could end up outside sdir.
        tarball.extractall(sdir)
        member_names = tarball.getnames()
    finally:
        # Always release the archive handle (it used to stay open).
        tarball.close()

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'

    file_list = []
    image_list = []
    might_be_tex = []

    for extracted_file in member_names:
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, cmd_out, dummy2 = run_shell_command('file %s',
                                                    (extracted_file,))
        # Only matches AFTER the "name:" prefix printed by ``file`` count,
        # so that a keyword inside the file name itself is ignored.
        colon_position = cmd_out.find(':')

        # is it TeX?
        if cmd_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif cmd_out.lower().find(image_output_contains) > colon_position \
                or \
                cmd_out.lower().find(eps_output_contains) > colon_position \
                or \
                cmd_out.find(ps_output_contains) > colon_position:
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        else:
            if extracted_file.split('.')[-1].lower() == tex_file_extension:
                # we might have tex source!
                might_be_tex.append(extracted_file)
            elif extracted_file.split('.')[-1] in ['eps', 'png',
                                                   'ps', 'jpg', 'pdf']:
                # we might have an image!
                image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])
    return (file_list, image_list, might_be_tex)
def Move_Files_to_Storage(parameters, curdir, form, user_info=None):
    """
    The function moves files received from the standard submission's
    form through file input element(s). The document are assigned a
    'doctype' (or category) corresponding to the file input element
    (eg. a file uploaded throught 'DEMOPIC_FILE' will go to
    'DEMOPIC_FILE' doctype/category).

    Websubmit engine builds the following file organization in the
    directory curdir/files:

                  curdir/files
                        |
      _____________________________________________________________________
            |                                   |                          |
      ./file input 1 element's name      ./file input 2 element's name    ....
         (for eg. 'DEMOART_MAILFILE')       (for eg. 'DEMOART_APPENDIX')
         |                                     |
      test1.pdf                             test2.pdf

    There is only one instance of all possible extension(pdf, gz...) in
    each part otherwise we may encounter problems when renaming files.

    + parameters['rename']: if given, all the files in curdir/files
      are renamed. parameters['rename'] is of the form:
      <PA>elemfilename[re]</PA>* where re is an regexp to select(using
      re.sub) what part of the elem file has to be selected.
      e.g: <PA>file:TEST_FILE_RN</PA>

    + parameters['documenttype']: if given, other formats are created.
      It has 2 possible values:
        - if "picture" icon in gif format is created
        - if "fulltext" ps, gz .... formats are created

    + parameters['paths_and_suffixes']: directories to look into and
      corresponding suffix to add to every file inside. It must have
      the same structure as a Python dictionnary of the following form
      {'FrenchAbstract':'french', 'EnglishAbstract':''}

      The keys are the file input element name from the form <=>
      directories in curdir/files. The values associated are the
      suffixes which will be added to all the files in
      e.g. curdir/files/FrenchAbstract

    + parameters['iconsize'] need only if 'icon' is selected in
      parameters['documenttype']

    + parameters['paths_and_restrictions']: the restrictions to apply
      to each uploaded file. The parameter must have the same
      structure as a Python dictionnary of the following form:
      {'DEMOART_APPENDIX':'restricted'}
      Files not specified in this parameter are not restricted.
      The specified restrictions can include a variable that can be
      replaced at runtime, for eg:
      {'DEMOART_APPENDIX':'restricted to <PA>file:SuE</PA>'}

    + parameters['paths_and_doctypes']: if a doctype is specified,
      the file will be saved under the 'doctype/collection' instead
      of under the default doctype/collection given by the name
      of the upload element that was used on the websubmit interface.
      to configure the doctype in websubmit, enter the value as in a
      dictionnary, for eg:
      {'PATHS_SWORD_UPL' : 'PUSHED_TO_ARXIV'} -> from
      Demo_Export_Via_Sword [DEMOSWR] Document Types

    Once all files are attached, ``bibdocfile --fix-marc`` is run for the
    record and its cached 'HB' format is deleted. The function always
    returns an empty string.
    """

    # sysno (the record ID) is read from a module-level global
    global sysno
    paths_and_suffixes = parameters['paths_and_suffixes']
    paths_and_restrictions = parameters['paths_and_restrictions']
    rename = parameters['rename']
    documenttype = parameters['documenttype']
    # Several comma-separated icon sizes may be given
    iconsizes = parameters['iconsize'].split(',')
    paths_and_doctypes = parameters['paths_and_doctypes']

    ## Create an instance of BibRecDocs for the current recid(sysno)
    bibrecdocs = BibRecDocs(sysno)

    # The three "paths_and_*" parameters arrive as strings and are turned
    # into dictionaries here
    paths_and_suffixes = get_dictionary_from_string(paths_and_suffixes)
    paths_and_restrictions = get_dictionary_from_string(paths_and_restrictions)
    paths_and_doctypes = get_dictionary_from_string(paths_and_doctypes)

    ## Go through all the directories specified in the keys
    ## of parameters['paths_and_suffixes']
    for path in paths_and_suffixes.keys():
        ## Check if there is a directory for the current path
        if os.path.exists("%s/files/%s" % (curdir, path)):
            ## Retrieve the restriction to apply to files in this
            ## directory
            restriction = paths_and_restrictions.get(path, '')
            # Replace every <PA>...</PA> tag through get_pa_tag_content
            restriction = re.sub('<PA>(?P<content>[^<]*)</PA>',
                                 get_pa_tag_content, restriction)

            ## Go through all the files in curdir/files/path
            for current_file in os.listdir("%s/files/%s" % (curdir, path)):
                ## retrieve filename and extension
                dummy, filename, extension = decompose_file(current_file)
                # Make sure a non-empty extension starts with a dot
                if extension and extension[0] != ".":
                    extension = '.' + extension
                # The configured suffix goes between name and extension
                if len(paths_and_suffixes[path]) != 0:
                    extension = "_%s%s" % (paths_and_suffixes[path], extension)
                ## Build the new file name if rename parameter has been given
                if rename:
                    filename = re.sub('<PA>(?P<content>[^<]*)</PA>', \
                                      get_pa_tag_content, \
                                      parameters['rename'])

                if rename or len(paths_and_suffixes[path]) != 0:
                    ## Rename the file
                    try:
                        # Write the log rename_cmd
                        fd = open("%s/rename_cmd" % curdir, "a+")
                        fd.write("%s/files/%s/%s" % (curdir, path, current_file) + " to " +\
                                 "%s/files/%s/%s%s" % (curdir, path, filename, extension) + "\n\n")
                        ## Rename
                        os.rename("%s/files/%s/%s" % (curdir, path, current_file), \
                                  "%s/files/%s/%s%s" % (curdir, path, filename, extension))

                        fd.close()
                        ## Save the new name in a text file in curdir so that
                        ## the new filename can be used by templates to created the recmysl
                        fd = open("%s/%s_RENAMED" % (curdir, path), "w")
                        fd.write("%s%s" % (filename, extension))
                        fd.close()
                    except OSError as err:
                        # NOTE(review): if os.rename fails, the rename_cmd
                        # log handle opened above is not closed.
                        msg = "Cannot rename the file.[%s]"
                        msg %= str(err)
                        raise InvenioWebSubmitFunctionWarning(msg)
                fullpath = "%s/files/%s/%s%s" % (curdir, path, filename, extension)
                ## Check if there is any existing similar file
                if not bibrecdocs.check_file_exists(fullpath, extension):
                    # The doctype defaults to the upload element name
                    bibdoc = bibrecdocs.add_new_file(
                        fullpath,
                        doctype=paths_and_doctypes.get(path, path),
                        never_fail=True)
                    bibdoc.set_status(restriction)
                    ## Fulltext
                    if documenttype == "fulltext":
                        additionalformats = createRelatedFormats(fullpath)
                        if len(additionalformats) > 0:
                            for additionalformat in additionalformats:
                                try:
                                    bibrecdocs.add_new_format(additionalformat)
                                except InvenioBibDocFileError:
                                    # A format that cannot be added is skipped
                                    pass
                    ## Icon
                    elif documenttype == "picture":
                        # The first icon successfully added becomes the
                        # default icon; later sizes get their own subformat
                        has_added_default_icon_subformat_p = False
                        for iconsize in iconsizes:
                            try:
                                iconpath, iconname = create_icon({
                                    'input-file': fullpath,
                                    'icon-scale': iconsize,
                                    'icon-name': None,
                                    'icon-file-format': None,
                                    'multipage-icon': False,
                                    'multipage-icon-delay': 100,
                                    'verbosity': 0,
                                })
                            except Exception as e:
                                # Report the failure and try the next size
                                register_exception(
                                    prefix=
                                    'Impossible to create icon for %s (record %s)'
                                    % (fullpath, sysno),
                                    alert_admin=True)
                                continue
                            iconpath = os.path.join(iconpath, iconname)
                            docname = decompose_file(fullpath)[1]
                            try:
                                mybibdoc = bibrecdocs.get_bibdoc(docname)
                            except InvenioBibDocFileError:
                                mybibdoc = None
                            if iconpath is not None and mybibdoc is not None:
                                try:
                                    # Strip the characters > < ^ ! from the
                                    # size to build the subformat suffix
                                    icon_suffix = iconsize.replace(
                                        '>', '').replace('<', '').replace(
                                            '^', '').replace('!', '')
                                    if not has_added_default_icon_subformat_p:
                                        mybibdoc.add_icon(iconpath)
                                        has_added_default_icon_subformat_p = True
                                    else:
                                        mybibdoc.add_icon(
                                            iconpath,
                                            subformat=
                                            CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT
                                            + "-" + icon_suffix)
                                    ## Save the new icon filename in a text file in curdir so that
                                    ## it can be used by templates to created the recmysl
                                    try:
                                        # NOTE(review): the flag is always
                                        # True at this point (set just above
                                        # or on an earlier iteration), so
                                        # the "<path>_ICON" branch is never
                                        # taken.
                                        if not has_added_default_icon_subformat_p:
                                            fd = open(
                                                "%s/%s_ICON" % (curdir, path),
                                                "w")
                                        else:
                                            fd = open(
                                                "%s/%s_ICON_%s" %
                                                (curdir, path,
                                                 iconsize + '_' + icon_suffix),
                                                "w")
                                        fd.write(os.path.basename(iconpath))
                                        fd.close()
                                    except OSError as err:
                                        msg = "Cannot store icon filename.[%s]"
                                        msg %= str(err)
                                        raise InvenioWebSubmitFunctionWarning(
                                            msg)
                                except InvenioBibDocFileError as e:
                                    # Most probably icon already existed.
                                    pass
                            elif mybibdoc is not None:
                                mybibdoc.delete_icon()

    # Update the MARC
    bibdocfile_bin = os.path.join(CFG_BINDIR, 'bibdocfile --yes-i-know')
    run_shell_command(bibdocfile_bin + " --fix-marc --recid=%s",
                      (str(sysno), ))

    # Delete the HB BibFormat cache in the DB, so that the fulltext
    # links do not point to possible dead files
    run_sql(
        "DELETE LOW_PRIORITY from bibfmt WHERE format='HB' AND id_bibrec=%s",
        (sysno, ))

    return ""
def _cli_upload_reject(req, status, msg):
    """Log msg, set status on req and write msg as the response body.

    @param req: request object whose ``status`` attribute is set
    @param status: HTTP status code to report
    @param msg: error message, both logged and sent to the client
    @return: whatever ``_write(req, msg)`` returns, so that the caller can
        return it directly
    """
    _log(msg)
    req.status = status
    return _write(req, msg)


def cli_upload(req, file_content=None, mode=None, callback_url=None, nonce=None, special_treatment=None):
    """
    Robot interface for uploading MARC files

    The request is validated step by step (client IP, user agent, upload
    mode, presence of a body, allowed collections, MARCXML validity).
    The first failing check logs an "[ERROR] ..." message, sets an HTTP
    error status on req and writes the message back to the client.
    When every check passes, a bibupload task is submitted and an
    "[INFO] ..." line listing the submitted arguments is written back.

    @param req: request object; its ``content_type`` (and, on error, its
        ``status``) are set here
    @param file_content: either an object with a ``read()`` method (e.g.
        req itself) whose content is the record data, or an object with
        ``filename`` and ``value`` attributes whose ``value`` is the
        record data
    @param mode: upload mode; '--insertorreplace' is accepted as an alias
        of '-ir'; the resulting value must be in PERMITTED_MODES
    @param callback_url: if given, forwarded as ``--callback-url``
    @param nonce: forwarded as ``--nonce``, only when callback_url is given
    @param special_treatment: forwarded as ``--special-treatment``, only
        when callback_url is given
    @return: the return value of ``_write(req, msg)``
    """
    req.content_type = "text/plain"

    # check IP and useragent:
    # (IP and authorized collections are looked up once and reused for the
    # collection check further down)
    client_ip = _get_client_ip(req)
    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if not permitted_dbcollids:
        return _cli_upload_reject(
            req, HTTP_FORBIDDEN,
            "[ERROR] Sorry, client IP %s cannot use the service." % client_ip)
    if not _check_client_useragent(req):
        return _cli_upload_reject(
            req, HTTP_FORBIDDEN,
            "[ERROR] Sorry, the %s useragent cannot use the service."
            % _get_useragent(req))

    # check upload mode:
    arg_mode = mode
    if not arg_mode:
        return _cli_upload_reject(
            req, HTTP_BAD_REQUEST,
            "[ERROR] Please specify upload mode to use.")
    if arg_mode == '--insertorreplace':
        arg_mode = '-ir'
    if arg_mode not in PERMITTED_MODES:
        return _cli_upload_reject(
            req, HTTP_BAD_REQUEST,
            "[ERROR] Invalid upload mode.")

    # extract the record data to upload:
    arg_file = file_content
    if hasattr(arg_file, 'read'):
        ## We've been passed a readable file, e.g. req
        arg_file = arg_file.read()
        if not arg_file:
            return _cli_upload_reject(
                req, HTTP_BAD_REQUEST,
                "[ERROR] Please provide a body to your request.")
    elif not arg_file:
        return _cli_upload_reject(
            req, HTTP_BAD_REQUEST,
            "[ERROR] Please specify file body to input.")
    elif hasattr(arg_file, "filename"):
        arg_file = arg_file.value
    else:
        return _cli_upload_reject(
            req, HTTP_BAD_REQUEST,
            "[ERROR] 'file' parameter must be a (single) file")

    # write temporary file:
    (fd, filename) = tempfile.mkstemp(
        prefix="batchupload_" +
        time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
        dir=CFG_TMPSHAREDDIR)
    # the context manager closes the descriptor even if write() fails
    with os.fdopen(fd, 'w') as filedesc:
        filedesc.write(arg_file)

    # check if this client can run this file:
    if '*' not in permitted_dbcollids:  # '*' is a wildcard: skip the check
        if not _check_client_can_submit_file(client_ip, filename, req, 0):
            return _cli_upload_reject(
                req, HTTP_FORBIDDEN,
                "[ERROR] Cannot submit such a file from this IP. (Wrong collection.)")

    # check validity of marcxml
    # (the file name is passed as an argument so that run_shell_command
    # escapes it; the first element of the result is the exit code)
    xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
    exit_code = run_shell_command(xmlmarclint_path + ' %s', (filename,))[0]
    if exit_code != 0:
        return _cli_upload_reject(
            req, HTTP_BAD_REQUEST,
            "[ERROR] MARCXML is not valid.")

    # run upload command
    args = ['bibupload', "batchupload", arg_mode, filename]
    if callback_url:
        args += ["--callback-url", callback_url]
        if nonce:
            args += ["--nonce", nonce]
        if special_treatment:
            args += ["--special-treatment", special_treatment]
    task_low_level_submission(*args)
    msg = "[INFO] %s" % ' '.join(args)
    _log(msg)
    return _write(req, msg)