def test_insert_via_curl(self):
    """batchuploader - robotupload insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Write the MARCXML payload and close the handle explicitly
    # (the original `open(...).write(...)` leaked the file descriptor).
    fdesc = open(curl_input_file, "w")
    try:
        fdesc.write(self.marcxml)
    finally:
        fdesc.close()
    try:
        result = run_shell_command(
            '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
            [curl_input_file, self.nonce_url,
             make_user_agent_string('BatchUploader')])[1]
        self.failUnless("[INFO]" in result)
        current_task = get_last_taskid()
        # Run the queued bibupload task so the callback file gets written.
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        callback_file = open(self.callback_result_path)
        try:
            results = json.loads(callback_file.read())
        finally:
            callback_file.close()
        self.failUnless('results' in results,
                        '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        self.failUnless(results['results'][0]['success'])
        self.failUnless(results['results'][0]['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>"""
            in results['results'][0]['marcxml'],
            results['results'][0]['marcxml'])
    finally:
        os.remove(curl_input_file)
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
    # Write the MARCXML payload and close the handle explicitly
    # (the original `open(...).write(...)` leaked the file descriptor).
    fdesc = open(curl_input_file, "w")
    try:
        fdesc.write(self.marcxml)
    finally:
        fdesc.close()
    try:
        ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file, self.callback_url, self.legacy_url,
             make_user_agent_string('BatchUploader')])
        self.failUnless(
            "[INFO]" in result,
            '[INFO] not find in results: %s, %s' % (result, err))
        current_task = get_last_taskid()
        # Run the queued bibupload task so the callback file gets written.
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        callback_file = open(self.callback_result_path)
        try:
            results = json.loads(callback_file.read())
        finally:
            callback_file.close()
        self.failUnless('results' in results,
                        '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results['results']), 1)
        self.assertEqual(results['nonce'], "1234")
        self.failUnless(results['results'][0]['success'])
        self.failUnless(results['results'][0]['recid'] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>"""
            in results['results'][0]['marcxml'],
            results['results'][0]['marcxml'])
    finally:
        os.remove(curl_input_file)
def test_legacy_insert_via_curl(self):
    """batchuploader - robotupload legacy insert via CLI curl"""
    curl_input_file = os.path.join(CFG_TMPDIR, "curl_test.xml")
    # Write the MARCXML payload and close the handle explicitly
    # (the original `open(...).write(...)` leaked the file descriptor).
    fdesc = open(curl_input_file, "w")
    try:
        fdesc.write(self.marcxml)
    finally:
        fdesc.close()
    try:
        ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
        code, result, err = run_shell_command(
            "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
            [curl_input_file, self.callback_url, self.legacy_url,
             make_user_agent_string("BatchUploader")],
        )
        self.failUnless("[INFO]" in result,
                        "[INFO] not find in results: %s, %s" % (result, err))
        current_task = get_last_taskid()
        # Run the queued bibupload task so the callback file gets written.
        run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
        callback_file = open(self.callback_result_path)
        try:
            results = json.loads(callback_file.read())
        finally:
            callback_file.close()
        self.failUnless("results" in results,
                        '"%s" did not contained [INFO]' % result)
        self.assertEqual(len(results["results"]), 1)
        self.assertEqual(results["nonce"], "1234")
        self.failUnless(results["results"][0]["success"])
        self.failUnless(results["results"][0]["recid"] > 0)
        self.failUnless(
            """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"],
            results["results"][0]["marcxml"],
        )
    finally:
        os.remove(curl_input_file)
def test_get_defaults(self):
    """plotextractor - get defaults"""
    sdir_should_be = os.path.join(CFG_TMPDIR, self.arXiv_id + "_plots")
    refno_should_be = "15"  # Note: For ATLANTIS DEMO site
    sdir, refno = get_defaults(tarball=self.tarball, sdir=None,
                               refno_url=CFG_SITE_URL)
    # Identity comparison against None (was `sdir != None`).
    if sdir is not None:
        run_shell_command("rm -rf %s" % (sdir,))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == refno_should_be,
                    "didn't get correct default reference number")
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False,
                   upload_plots=True, force=False, squash=False):
    '''
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one of
        the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else the
        upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): re-create the MARC output even if it already
        exists on disk
    @param: squash (boolean): write output into the shared SQUASHED_FILE
        instead of a per-record MARC file

    @return: None
    '''
    sub_dir, refno = get_defaults(tarball, sdir)
    if not squash:
        marc_name = os.path.join(sub_dir, refno + '.xml')
    else:
        marc_name = os.path.join(sdir, SQUASHED_FILE)
    if (force or not os.path.exists(marc_name)) and not squash:
        # truncate/create the per-record MARC file
        open(marc_name, 'w').close()
    if xtract_text:
        extract_text(tarball)
    image_list, tex_files = untar(tarball, sub_dir)
    # `not tex_files` covers both the empty-list and None cases
    # (was `tex_files == [] or tex_files == None`).
    if not tex_files:
        write_message(os.path.split(tarball)[-1] + ' is not a tarball')
        # pass the path as an escaped argument instead of concatenating it
        # into the shell command
        run_shell_command('rm -r %s', (sub_dir,))
        return
    converted_image_list = convert_images(image_list)
    images_and_captions_and_labels = [['', '', []]]
    for tex_file in tex_files:
        images_and_captions_and_labels.extend(
            extract_captions(tex_file, sub_dir, converted_image_list))
    marc_name = create_MARC(images_and_captions_and_labels, tex_files[0],
                            refno, converted_image_list, marc_name)
    if marc_name is not None and not squash:
        write_message('generated ' + marc_name)
        if upload_plots:
            upload_to_site(marc_name)
    clean_up(image_list)
    write_message('work complete on ' + os.path.split(tarball)[-1])
def extract_plots_from_latex(url_tarball):
    """Download the tarball at url_tarball, run the plotextractor CLI on
    it and return the path to the generated plots MARCXML file.

    @param url_tarball: URL of the LaTeX source tarball to process
    @return: path of the produced <tarball>_plots/<name>.xml file
    """
    # download_url returns a local prefix; append the original file name.
    tarball = download_url(url_tarball)
    path, name = os.path.split(url_tarball)
    tarball += name
    # Pass the tarball path as an escaped argument instead of string
    # concatenation (paths with spaces or shell metacharacters used to
    # break -- or worse, be interpreted by -- the shell).
    run_shell_command(CFG_BINDIR + '/plotextractor -t %s', (tarball,))
    path, name = os.path.split(tarball)
    plotextracted_xml_path = tarball + '_plots/' + name + '.xml'
    return plotextracted_xml_path
def test_simple_insert(self):
    """batchuploader - robotupload simple insert"""
    # Fire the prepared request and make sure the server acknowledged it.
    response = urllib2.urlopen(self.req).read()
    self.failUnless("[INFO]" in response)
    # Execute the queued bibupload task.
    task_id = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(task_id)])
    # A brand-new record must have appeared in the database.
    newest_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
    self.failIfEqual(self.last_recid, newest_recid)
    uploaded_record = get_record(newest_recid)
    self.assertEqual(uploaded_record['245'][0][0], [('a', 'The title')])
def check_system(self, uid=None):
    """Return an error string describing any problem with the RT setup,
    or "" when everything looks sane.

    @param uid: optional Invenio user id whose RT credentials to use;
        when omitted the configured default RT user is used.
    """
    if uid:
        rtuid, rtpw = get_bibcat_from_prefs(uid)
    else:
        # Assume default RT user
        rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    if not rtuid and not rtpw:
        return "No valid RT user login specified"
    if not CFG_BIBCATALOG_SYSTEM == 'RT':
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
    # Check that you can execute the binary.. this is a safe call unless
    # someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    helpfound = False
    if myerr.count("help") > 0:
        helpfound = True
    if not helpfound:
        return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # Construct URL, split RT_URL at //
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
       not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
        # FIX: the message used to say CFG_BIBCATALOG__SYSTEM_RT_URL
        # (double underscore), pointing users at a non-existent variable.
        return "CFG_BIBCATALOG_SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    # Assemble by http://user:password@RT_URL
    bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # The RT CLI reads its credentials from these environment variables.
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = bibcatalog_rt_server
    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command("echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
    if len(myerr) > 0:
        return "could not connect to " + bibcatalog_rt_server + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    return ""
def test_get_defaults(self):
    """plotextractor - get defaults (inspirebeta reference URL)"""
    sdir = None
    sdir_should_be = os.path.join(CFG_TMPDIR, self.arXiv_id + '_plots')
    sdir, refno = get_defaults(self.tarball, sdir, "http://inspirebeta.net")
    # Identity comparison against None (was `sdir != None`).
    if sdir is not None:
        run_shell_command("rm -rf %s" % (sdir,))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == "812227",
                    "didn't get correct default reference number")
def clean_up(image_list):
    '''
    Removes all the intermediate stuff.

    @param: image_list ([string, string, ...]): the images to remove

    NOTE: when running this for later upload, it's not a good idea to
    remove the converted images!
    '''
    return  # FIXME do not delete image files before upload
    # The loop below is unreachable while the early return above is in
    # place; kept so re-enabling clean-up is a one-line change.  The path
    # is now passed as an escaped argument to avoid shell problems with
    # odd filenames.
    for image_file in image_list:
        run_shell_command('rm %s', (image_file,))
def test_get_defaults(self):
    """plotextractor - get defaults"""
    sdir = None
    sdir_should_be = os.path.join(CFG_TMPDIR, self.arXiv_id + '_plots')
    sdir, refno = get_defaults(self.tarball, sdir, "http://inspirehep.net")
    # Identity comparison against None (was `sdir != None`).
    if sdir is not None:
        run_shell_command("rm -rf %s" % (sdir,))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == "812227",
                    "didn't get correct default reference number")
def test_insert_with_callback(self):
    """batchuploader - robotupload insert with callback"""
    # Submit the request that registers a callback URL.
    body = urllib2.urlopen(self.req_callback).read()
    self.failUnless("[INFO]" in body,
                    '"%s" did not contained [INFO]' % body)
    # Execute the queued bibupload task so the callback fires.
    task_id = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(task_id)])
    callback_data = json.loads(open(self.callback_result_path).read())
    self.failUnless('results' in callback_data)
    self.assertEqual(len(callback_data['results']), 1)
    first_result = callback_data['results'][0]
    self.failUnless(first_result['success'])
    self.failUnless(first_result['recid'] > 0)
    self.failUnless(
        """<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
        first_result['marcxml'])
def clean_up(extracted_files_list, image_list):
    """
    Removes all the intermediate stuff.

    @param: extracted_files_list ([string, string, ...]): list of all
        extracted files
    @param: image_list ([string, string, ...]): list of the images to keep
    """
    keep = set(image_list)
    for path in extracted_files_list:
        # Skip images we want to keep, and directories (paths whose last
        # character is the path separator).
        if path in keep or path[-1] == os.sep:
            continue
        run_shell_command('rm %s', (path,))
def check_system(self, uid):
    """return an error string if there are problems"""
    # Resolve the RT credentials stored in the user's Invenio preferences.
    user_pref = invenio.webuser.get_user_preferences(uid)
    if not user_pref.has_key('bibcatalog_username'):
        return "user " + str(uid) + " has no bibcatalog_username"
    rtuid = user_pref['bibcatalog_username']
    if not user_pref.has_key('bibcatalog_password'):
        return "user " + str(uid) + " has no bibcatalog_password"
    rtpw = user_pref['bibcatalog_password']
    # Verify the module-level RT configuration.
    if not CFG_BIBCATALOG_SYSTEM == 'RT':
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
    # check that you can execute it.. this is a safe call unless someone
    # can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    helpfound = False
    if myerr.count("help") > 0:
        helpfound = True
    if not helpfound:
        return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # construct.. split RT_URL at //
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
       not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
        return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    # Assemble http://user:password@RT_URL for the CLI.
    BIBCATALOG_RT_SERVER = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # set as env var -- the RT CLI reads its credentials from these
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command("echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
    if len(myerr) > 0:
        return "could not connect to " + BIBCATALOG_RT_SERVER + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    if not CFG_BIBCATALOG_QUEUES:
        return "CFG_BIBCATALOG_QUEUES not defined or empty"
    (username, dummy) = get_bibcat_from_prefs(uid)
    if (username is None):
        return "Cannot find user preference bibcatalog_username for uid " + str( uid)
    return ""
def check_system(self, uid=None):
    """return an error string if there are problems"""
    # When a uid is given use that user's stored RT credentials,
    # otherwise fall back to the configured default RT account.
    if uid:
        rtuid, rtpw = get_bibcat_from_prefs(uid)
    else:
        # Assume default RT user
        rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    if not rtuid and not rtpw:
        return "No valid RT user login specified"
    if not CFG_BIBCATALOG_SYSTEM == 'RT':
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
    # Check that you can execute the binary.. this is a safe call unless
    # someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    helpfound = False
    if myerr.count("help") > 0:
        helpfound = True
    if not helpfound:
        return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # Construct URL, split RT_URL at //
    # NOTE(review): the message below says CFG_BIBCATALOG__SYSTEM_RT_URL
    # (double underscore) -- looks like a typo in the error text.
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
       not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
        return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    # Assemble by http://user:password@RT_URL
    bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # set as env var -- the RT CLI reads its credentials from these
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = bibcatalog_rt_server
    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command("echo "+rtpw+" | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
    if len(myerr) > 0:
        return "could not connect to " + bibcatalog_rt_server + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    return ""
def test_get_defaults(self):
    """plotextractor - get defaults"""
    from invenio.shellutils import run_shell_command
    from invenio.plotextractor import get_defaults
    sdir_should_be = os.path.join(CFG_TMPDIR, self.arXiv_id + '_plots')
    refno_should_be = "15"  # Note: For ATLANTIS DEMO site
    sdir, refno = get_defaults(tarball=self.tarball, sdir=None,
                               refno_url=CFG_SITE_URL)
    # Identity comparison against None (was `sdir != None`).
    if sdir is not None:
        run_shell_command("rm -rf %s" % (sdir,))
    self.assertTrue(sdir == sdir_should_be,
                    "didn't get correct default scratch dir")
    self.assertTrue(refno == refno_should_be,
                    "didn't get correct default reference number")
def _check_enough_ram():
    """
    Return True if there is enough RAM, i.e. if free outputs more than
    1G of ram.
    """
    from invenio.shellutils import run_shell_command
    try:
        # Parse `free` output: second line, second column is total RAM
        # in KB, so compare against 1024 * 1024 KB == 1 GB.
        return int(re.sub(r'\s+', ' ',
                          run_shell_command("free")[1].splitlines()[1]
                          ).split(' ')[1]) > 1024 * 1024
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    except Exception:
        ## Are we really on Linux? Maybe on a BSD system?
        try:
            return int(run_shell_command("sysctl -n hw.memsize")[1]) > 1024 * 1024
        except Exception:
            # Still no luck
            return False
def check_system(self, uid):
    """return an error string if there are problems"""
    # Resolve the RT credentials stored in the user's Invenio preferences.
    user_pref = invenio.webuser.get_user_preferences(uid)
    if not user_pref.has_key('bibcatalog_username'):
        return "user " + str(uid) + " has no bibcatalog_username"
    rtuid = user_pref['bibcatalog_username']
    if not user_pref.has_key('bibcatalog_password'):
        return "user " + str(uid) + " has no bibcatalog_password"
    rtpw = user_pref['bibcatalog_password']
    # Verify the module-level RT configuration.
    if not CFG_BIBCATALOG_SYSTEM == 'RT':
        return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
    if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
    if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
        return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
    # check that you can execute it.. this is a safe call unless someone
    # can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
    dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
    helpfound = False
    if myerr.count("help") > 0:
        helpfound = True
    if not helpfound:
        return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
    # construct.. split RT_URL at //
    if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
       not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
        return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    # Assemble http://user:password@RT_URL for the CLI.
    BIBCATALOG_RT_SERVER = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
    # set as env var -- the RT CLI reads its credentials from these
    os.environ["RTUSER"] = rtuid
    os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
    # try to talk to RT server
    # this is a safe call since rtpw is the only variable in it, and it is escaped
    rtpw = escape_shell_arg(rtpw)
    dummy, myout, myerr = run_shell_command("echo "+rtpw+" | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
    if len(myerr) > 0:
        return "could not connect to " + BIBCATALOG_RT_SERVER + " " + myerr
    # finally, check that there is some sane output like tickets or 'No matching result'
    saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
    if not saneoutput:
        return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
    if not CFG_BIBCATALOG_QUEUES:
        return "CFG_BIBCATALOG_QUEUES not defined or empty"
    (username, dummy) = get_bibcat_from_prefs(uid)
    if (username is None):
        return "Cannot find user preference bibcatalog_username for uid "+str(uid)
    return ""
def test_insert_with_oracle(self):
    """batchuploader - robotupload insert with oracle special treatment"""
    # Removed the redundant function-local `import os` -- os is already
    # used at module level throughout this file.
    # NOTE(review): hard-coded Invenio log path; presumably should be
    # derived from the installation's log directory -- confirm.
    if os.path.exists('/opt/invenio/var/log/invenio.err'):
        os.remove('/opt/invenio/var/log/invenio.err')
    result = urllib2.urlopen(self.req_oracle).read()
    self.failUnless("[INFO]" in result,
                    '"%s" did not contained "[INFO]"' % result)
    current_task = get_last_taskid()
    run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
    results = json.loads(open(self.callback_result_path).read())
    self.failUnless('results' in results,
                    '"%s" did not contained "results" key' % results)
    self.assertEqual(len(results['results']), 1)
    self.failUnless(results['results'][0]['success'])
    self.failUnless(results['results'][0]['recid'] > 0)
    self.failUnless("""<subfield code="a">Doe, John</subfield>"""
                    in results['results'][0]['marcxml'],
                    results['results'][0]['marcxml'])
def convert_images(image_list):
    '''
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    '''
    png_output_contains = 'PNG image data'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue
        # FIXME: the run_shell_command calls should be made with a timeout
        # in order to prevent runaway imagemagick conversions.  Arguments
        # are now passed as parameters so they are properly escaped
        # (previously they were concatenated into the command string).
        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if cmd_out.find(png_output_contains) > -1:
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)
            dummy1, cmd_out, cmd_err = run_shell_command(
                'convert %s %s', (image_file, converted_image_file))
            if cmd_err == '':
                ret_list.append(converted_image_file)
            else:
                write_message('convert failed on ' + image_file)
    return ret_list
def _normalize_article_dir_with_dtd(self, path):
    """
    main.xml from Elsevier assume the existence of a local DTD.
    This procedure install the DTDs next to the main.xml file
    and normalize it using xmllint in order to resolve all namespaces
    and references.
    """
    if exists(join(path, 'resolved_main.xml')):
        return
    if 'art520' in open(join(path, 'main.xml')).read():
        ZipFile(CFG_ELSEVIER_ART520_PATH).extractall(path)
        for filename in listdir(join(path, 'art520')):
            rename(join(path, 'art520', filename), join(path, filename))
    elif 'art501' in open(join(path, 'main.xml')).read():
        ZipFile(CFG_ELSEVIER_ART501_PATH).extractall(path)
        for filename in listdir(join(path, 'art501')):
            rename(join(path, 'art501', filename), join(path, filename))
    else:
        # FIX: the message used to say "si520 or si501" (the code checks
        # art520/art501) and was missing a space after the path.
        message = "It looks like the path " + path
        message += " does not contain an art520 or art501 main.xml file"
        self.logger.error(message)
        raise ValueError(message)
    # Pass the file paths as escaped arguments instead of concatenating
    # them into the command string (paths with spaces used to break
    # xmllint invocation).
    dummy, dummy, cmd_err = run_shell_command(
        "xmllint --format --loaddtd %s --output %s",
        (join(path, 'main.xml'), join(path, 'resolved_main.xml')))
    if cmd_err:
        message = "Error in cleaning %s: %s" % (
            join(path, 'main.xml'), cmd_err)
        self.logger.error(message)
        raise ValueError(message)
def _run_rt_command(self, command, uid=None):
    """
    Execute an RT CLI command, authenticated as the given Invenio user
    (or as the configured default RT account when no uid is given).

    Should any of the configuration parameters be missing this function
    will return None. Otherwise it will return the standard output from
    the CLI command.

    @param command: RT CLI command to execute
    @type command: string

    @param uid: the Invenio user id to submit on behalf of. Optional.
    @type uid: int

    @return: standard output from the command given. None, if any errors.
    @rtype: string
    """
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return None
    if uid:
        username, passwd = get_bibcat_from_prefs(uid)
    else:
        username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    # Build http://user:password@host/path from the configured RT URL.
    scheme, location = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    server_url = scheme + "//" + username + ":" + passwd + "@" + location
    # The RT CLI reads its credentials from these environment variables.
    os.environ["RTUSER"] = username
    os.environ["RTSERVER"] = server_url
    passwd = escape_shell_arg(passwd)
    error_code, myout, error_output = run_shell_command(
        "echo " + passwd + " | " + command)
    if error_code > 0:
        raise ValueError('Problem running "%s": %d - %s'
                         % (command, error_code, error_output))
    return myout
def ticket_set_attribute(self, uid, ticketid, attribute, new_value):
    """change the ticket's attribute. Returns 1 on success, 0 on failure"""
    # check that the attribute is accepted..
    if attribute not in BibCatalogSystem.TICKET_ATTRIBUTES:
        return 0
    # we cannot change read-only values.. including text that is an
    # attachment. pity
    if attribute in ['creator', 'date', 'ticketid', 'url_close',
                     'url_display', 'recordid', 'text']:
        return 0
    setme = ""
    # The attribute values are mutually exclusive, so an elif chain
    # replaces the original run of independent ifs.
    if attribute == 'priority':
        try:
            dummy = int(new_value)
        # narrowed from a bare except: only conversion failures mean an
        # invalid priority
        except (ValueError, TypeError):
            return 0
        setme = "set Priority=" + str(new_value)
    elif attribute == 'subject':
        subject = escape_shell_arg(new_value)
        setme = "set Subject='" + subject + "'"
    elif attribute == 'owner':
        # convert from invenio to RT
        ownerprefs = invenio.webuser.get_user_preferences(new_value)
        if not ownerprefs.has_key("bibcatalog_username"):
            return 0
        owner = escape_shell_arg(ownerprefs["bibcatalog_username"])
        setme = " set owner='" + owner + "'"
    elif attribute == 'status':
        setme = " set status='" + escape_shell_arg(new_value) + "'"
    elif attribute == 'queue':
        setme = " set queue='" + escape_shell_arg(new_value) + "'"
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return 0
    # make sure ticketid is numeric
    try:
        dummy = int(ticketid)
    except (ValueError, TypeError):
        return 0
    (username, passwd) = get_bibcat_from_prefs(uid)
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
    # set as env var -- the RT CLI reads its credentials from these
    os.environ["RTUSER"] = username
    os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
    passwd = escape_shell_arg(passwd)
    # make a call. safe since passwd and all variables in 'setme' have
    # been escaped
    dummy, myout, dummyerr = run_shell_command(
        "echo " + passwd + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI
        + " edit ticket/" + str(ticketid) + setme)
    # RT prints a line containing 'updated' on success.
    for line in myout.split("\n"):
        if line.count('updated') > 0:
            return 1
    return 0
def test_run_cmd_timeout_no_zombie(self):
    """shellutils - running simple command no zombie"""
    # Removed the unused local `t1 = time.time()` left over from an
    # earlier timing assertion.
    self.assertRaises(Timeout, run_process_with_timeout,
                      (self.script_path, '15', "THISISATEST"), timeout=5)
    # After the timeout no trace of the child process may remain.
    ps_output = run_shell_command('ps aux')[1]
    self.failIf('THISISATEST' in ps_output)
    self.failIf('sleep' in ps_output)
def convert_xml_using_saxon(source_file, template_file):
    """
    Tries to convert given source file (full path) using XSLT 2.0 Java
    libraries. Looks for given XSLT stylesheet/template file (relative
    path) in CFG_BIBCONVERT_XSL_PATH.

    Path to converted file is derived from DOI in the same directory as
    source as decided inside the template file. For example:
    /path/to/sourcedir/10.1103_PhysRevA.87.052320.xml

    @raise: APSHarvesterConversionError if Java saxon9he-xslt returns error.

    @return: None on success.
    """
    # FIX: the docstring used to claim "@return: True on success" but the
    # function never returns a value; success is signalled by not raising.
    if not os.path.isabs(template_file):
        template_file = CFG_BIBCONVERT_XSL_PATH + os.sep + template_file
    source_directory = os.path.dirname(source_file)
    # Run saxon from the source directory so relative output paths chosen
    # by the stylesheet land next to the source file.
    command = "cd %s && saxon9he-xslt -s:%s -xsl:%s -dtd:off" % \
              (source_directory, source_file, template_file)
    exit_code, stdout_buffer, stderr_buffer = run_shell_command(cmd=command)
    if exit_code or stdout_buffer or stderr_buffer:
        # Error may have happened
        raise APSHarvesterConversionError("%s: %s\nOut:%s"
                                          % (exit_code, stderr_buffer,
                                             stdout_buffer))
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    exit_code, cmd_out, cmd_err = run_shell_command(
        cmd, None, dirname + os.sep + filename)
    if exit_code:
        write_message("ERROR: mysqldump exit code is %s." % repr(exit_code),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if cmd_out:
        # FIX: this branch used to print repr of the exit code instead of
        # the actual stdout contents.
        write_message("ERROR: mysqldump stdout is %s." % repr(cmd_out),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if cmd_err:
        # FIX: this branch used to print repr of the exit code instead of
        # the actual stderr contents.
        write_message("ERROR: mysqldump stderr is %s." % repr(cmd_err),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
def _run_rt_command(self, command, uid=None):
    """
    This function will run a RT CLI command as given user. If no user is
    specified the default RT user will be used, if configured.

    Should any of the configuration parameters be missing this function
    will return None. Otherwise it will return the standard output from
    the CLI command.

    @param command: RT CLI command to execute
    @type command: string

    @param uid: the Invenio user id to submit on behalf of. Optional.
    @type uid: int

    @return: standard output from the command given. None, if any errors.
    @rtype: string
    """
    if not CFG_BIBCATALOG_SYSTEM_RT_URL:
        return None
    # When a uid is given use that user's RT credentials, otherwise fall
    # back to the configured default RT account.
    if uid:
        username, passwd = get_bibcat_from_prefs(uid)
    else:
        username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
        passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
    # Build http://user:password@host/path from the configured RT URL.
    httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
    BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
    # set as env var -- the RT CLI reads its credentials from these
    os.environ["RTUSER"] = username
    os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
    # The password is piped to the CLI on stdin; it is shell-escaped first.
    passwd = escape_shell_arg(passwd)
    error_code, myout, dummyerr = run_shell_command("echo "+passwd+" | " + command)
    if error_code > 0:
        raise ValueError, 'Problem running "%s": %d' % (command, error_code)
    return myout
def get_list_of_all_matching_files(basedir, filetypes):
    """
    Walk the directory tree rooted at basedir and collect every file whose
    `file` command output mentions one of the requested filetypes.

    @param: basedir (string): the directory where we want to start crawling

    @param: filetypes ([string, string]): something that will be contained
        in the output of running 'file' on the types of files we're looking
        for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """
    matches = []
    for current_dir, _subdirs, names in os.walk(basedir):
        for name in names:
            candidate = os.path.join(current_dir, name)
            _rc, file_output, _err = run_shell_command("file %s", (candidate,))
            for wanted_type in filetypes:
                if file_output.find(wanted_type) > -1:
                    matches.append(candidate)
    return matches
def _normalize_article_dir_with_dtd(self, path):
    """
    main.xml from Elsevier assume the existence of a local DTD.
    This procedure install the DTDs next to the main.xml file
    and normalize it using xmllint in order to resolve all namespaces
    and references.
    """
    if exists(join(path, 'resolved_main.xml')):
        return
    if 'art520' in open(join(path, 'main.xml')).read():
        ZipFile(CFG_ELSEVIER_ART520_PATH).extractall(path)
        for filename in listdir(join(path, 'art520')):
            rename(join(path, 'art520', filename), join(path, filename))
    elif 'art501' in open(join(path, 'main.xml')).read():
        ZipFile(CFG_ELSEVIER_ART501_PATH).extractall(path)
        for filename in listdir(join(path, 'art501')):
            rename(join(path, 'art501', filename), join(path, filename))
    else:
        self.logger.error("It looks like the path %s does not contain an art520 or art501 main.xml file" % path)
        raise ValueError("It looks like the path %s does not contain an art520 or art501 main.xml file" % path)
    cmd_exit_code, cmd_out, cmd_err = run_shell_command(
        "xmllint --format --loaddtd %s --output %s",
        (join(path, 'main.xml'), join(path, 'resolved_main.xml')))
    if cmd_err:
        # FIX: the log message used to name 'issue.xml' although the file
        # being cleaned is 'main.xml'.
        self.logger.error("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err))
        raise ValueError("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err))
def get_list_of_all_matching_files(basedir, filetypes): """ This function uses the os module in order tocrawl through the directory tree rooted at basedir and find all the files therein that include filetype in their 'file' output. Returns a list of absolute paths to all files. @param: basedir (string): the directory where we want to start crawling @param: filetypes ([string, string]): something that will be contained in the output of running 'file' on the types of files we're looking for @return: file_paths ([string, string, ...]): a list of full paths to the files that we discovered """ file_paths = [] for dirpath, dummy0, filenames in os.walk(basedir): for filename in filenames: full_path = os.path.join(dirpath, filename) dummy1, cmd_out, dummy2 = run_shell_command('file %s', (full_path,)) for filetype in filetypes: if cmd_out.find(filetype) > -1: file_paths.append(full_path) return file_paths
def _normalize_article_dir_with_dtd(self, path): """ TODO: main.xml from Springer assume the existence of a local DTD. This procedure install the DTDs next to the main.xml file and normalize it using xmllint in order to resolve all namespaces and references. """ files = [filename for filename in listdir(path) if "nlm.xml" in filename] if not files: files = [filename for filename in listdir(path) if ".xml.scoap" in filename] if exists(join(path, 'resolved_main.xml')): return if 'JATS-archivearticle1.dtd' in open(join(path, files[0])).read(): path_normalized = mkdtemp(prefix="scoap3_normalized_jats_", dir=CFG_TMPSHAREDDIR) ZipFile(CFG_SPRINGER_JATS_PATH).extractall(path_normalized) elif 'A++V2.4.dtd' in open(join(path, files[0])).read(): path_normalized = mkdtemp(prefix="scoap3_normalized_app_", dir=CFG_TMPSHAREDDIR) ZipFile(CFG_SPRINGER_AV24_PATH).extractall(path_normalized) else: self.logger.error("It looks like the path %s does not contain an JATS-archivearticle1.dtd nor A++V2.4.dtd XML file." % path) raise ValueError("It looks like the path %s does not contain an JATS-archivearticle1.dtd nor A++V2.4.dtd XML file." % path) print >> sys.stdout, "Normalizing %s" % (files[0],) cmd_exit_code, cmd_out, cmd_err = run_shell_command("xmllint --format --loaddtd %s --output %s", (join(path, files[0]), join(path_normalized, 'resolved_main.xml'))) if cmd_err: self.logger.error("Error in cleaning %s: %s" % (join(path, 'issue.xml'), cmd_err)) raise ValueError("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err)) self.articles_normalized.append(path_normalized)
def test_get_defaults(self): """plotextractor - get defaults""" sdir_should_be = os.path.join(CFG_TMPSHAREDDIR, self.arXiv_id + '_plots') refno_should_be = "15" # Note: For ATLANTIS DEMO site sdir, refno = get_defaults(tarball=self.tarball, sdir=None, refno_url=CFG_SITE_URL) if sdir is not None: run_shell_command("rm -rf %s" % sdir) self.assertTrue( sdir == sdir_should_be, "didn't get correct default scratch dir: expected %s found %s" % (sdir_should_be, sdir)) self.assertTrue(refno == refno_should_be, "didn't get correct default reference number")
def find_matching_files(basedir, filetypes): """ This functions tries to find all files matching given filetypes by looking at all the files and filenames in the given directory, including subdirectories. @param basedir: full path to base directory to search in @type basedir: string @param filetypes: list of filetypes, extensions @type filetypes: list @return: exitcode and any error messages as: (exitcode, err_msg) @rtype: tuple """ files_list = [] for dirpath, dummy0, filenames in os.walk(basedir): for filename in filenames: full_path = os.path.join(dirpath, filename) dummy1, cmd_out, dummy2 = run_shell_command( 'file %s', (full_path, )) for filetype in filetypes: if cmd_out.lower().find(filetype) > -1: files_list.append(full_path) elif filename.split('.')[-1].lower() == filetype: files_list.append(full_path) return files_list
def rotate_image(filename, line, sdir, image_list): ''' Given a filename and a line, figure out what it is that the author wanted to do wrt changing the rotation of the image and convert the file so that this rotation is reflected in its presentation. @param: filename (string): the name of the file as specified in the TeX @param: line (string): the line where the rotate command was found @output: the image file rotated in accordance with the rotate command @return: True if something was rotated ''' file_loc = get_image_location(filename, sdir, image_list) degrees = re.findall('(angle=[-\\d]+|rotate=[-\\d]+)', line) if len(degrees) < 1: return False degrees = degrees[0].split('=')[-1].strip() if file_loc == None or file_loc == 'ERROR' or\ not re.match('-*\\d+', degrees): return False degrees = str(0-int(degrees)) cmd = 'mogrify -rotate ' + degrees + ' ' + file_loc dummy, dummy, cmd_err = run_shell_command(cmd) if cmd_err != '': return True else: return True
def _dump_database(dirname, filename): """ Dump Invenio database into SQL file called FILENAME living in DIRNAME. """ write_message("... writing %s" % dirname + os.sep + filename) cmd = CFG_PATH_MYSQL + 'dump' if not os.path.exists(cmd): msg = "ERROR: cannot find %s." % cmd write_message(msg, stream=sys.stderr) raise StandardError(msg) cmd += " --skip-opt --add-drop-table --add-locks --create-options " \ " --quick --extended-insert --set-charset --disable-keys " \ " --host=%s --user=%s --password=%s %s | %s -c " % \ (escape_shell_arg(CFG_DATABASE_HOST), escape_shell_arg(CFG_DATABASE_USER), escape_shell_arg(CFG_DATABASE_PASS), escape_shell_arg(CFG_DATABASE_NAME), CFG_PATH_GZIP) dummy1, dummy2, dummy3 = run_shell_command(cmd, None, dirname + os.sep + filename) if dummy1: msg = "ERROR: mysqldump exit code is %s." % repr(dummy1) write_message(msg, stream=sys.stderr) raise StandardError(msg) if dummy2: msg = "ERROR: mysqldump stdout is %s." % repr(dummy1) write_message(msg, stream=sys.stderr) raise StandardError(msg) if dummy3: msg = "ERROR: mysqldump stderr is %s." % repr(dummy1) write_message(msg, stream=sys.stderr) raise StandardError(msg)
def find_matching_files(basedir, filetypes): """ This functions tries to find all files matching given filetypes by looking at all the files and filenames in the given directory, including subdirectories. @param basedir: full path to base directory to search in @type basedir: string @param filetypes: list of filetypes, extensions @type filetypes: list @return: exitcode and any error messages as: (exitcode, err_msg) @rtype: tuple """ files_list = [] for dirpath, dummy0, filenames in os.walk(basedir): for filename in filenames: full_path = os.path.join(dirpath, filename) dummy1, cmd_out, dummy2 = run_shell_command('file %s', (full_path,)) for filetype in filetypes: if cmd_out.lower().find(filetype) > -1: files_list.append(full_path) elif filename.split('.')[-1].lower() == filetype: files_list.append(full_path) return files_list
def test_insert_with_nonce(self): """batchuploader - robotupload insert with nonce""" result = urllib2.urlopen(self.req_nonce).read() self.failUnless("[INFO]" in result, '"%s" did not contained "[INFO]"' % result) current_task = get_last_taskid() run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)]) results = json.loads(open(self.callback_result_path).read()) self.failUnless("results" in results, '"%s" did not contained "results" key' % results) self.assertEqual(len(results["results"]), 1) self.assertEqual(results["nonce"], "1234") self.failUnless(results["results"][0]["success"]) self.failUnless(results["results"][0]["recid"] > 0) self.failUnless( """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"], results["results"][0]["marcxml"], )
def convert_xml_using_saxon(source_file, template_file): """ Tries to convert given source file (full path) using XSLT 2.0 Java libraries. Looks for given XSLT stylesheet/template file (relative path) in CFG_BIBCONVERT_XSL_PATH. Path to converted file is derived from DOI in the same directory as source as decided inside the template file. For example: /path/to/sourcedir/10.1103_PhysRevA.87.052320.xml @raise: APSHarvesterConversionError if Java saxon9he-xslt returns error. @return: True on success. """ if not os.path.isabs(template_file): template_file = CFG_BIBCONVERT_XSL_PATH + os.sep + template_file source_directory = os.path.dirname(source_file) command = "cd %s && saxon9he-xslt -s:%s -xsl:%s -dtd:off" % \ (source_directory, source_file, template_file) exit_code, stdout_buffer, stderr_buffer = run_shell_command(cmd=command) if exit_code or stdout_buffer or stderr_buffer: # Error may have happened raise APSHarvesterConversionError( "%s: %s\nOut:%s" % (exit_code, stderr_buffer, stdout_buffer))
def test_run_cmd_hello_quote(self): """shellutils - running simple command with an argument with quote""" self.assertEqual((0, "hel'lo world\n", ''), run_shell_command("echo %s %s", ( "hel'lo", "world", )))
def extract_plots_from_latex_and_pdf(url_tarball, url_pdf): tarball = download_url(url_tarball) path, name = os.path.split(url_tarball) #run_shell_command('cp ' + tarball + ' ' + tarball + '_arxiv' + name) tarball_with_arxiv_extension = tarball + '_arxiv' + name os.rename(tarball, tarball_with_arxiv_extension) run_shell_command(CFG_BINDIR + '/plotextractor -t ' + tarball_with_arxiv_extension) pdf = download_url(url_pdf) run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf) path, name = os.path.split(tarball_with_arxiv_extension) plotextracted_xml_path = tarball_with_arxiv_extension + '_plots/' + name + '.xml' plotextracted_pdf_path = pdf + '.extracted/extracted.json' return plotextracted_xml_path, plotextracted_pdf_path
def ticket_submit(self, uid=None, subject="", recordid=-1, text="", queue="", priority="", owner="", requestor=""): """creates a ticket. return ticket num on success, otherwise None""" if not CFG_BIBCATALOG_SYSTEM_RT_URL: return None if uid: username, passwd = get_bibcat_from_prefs(uid) else: username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//") BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath #set as env var os.environ["RTUSER"] = username os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER queueset = "" textset = "" priorityset = "" ownerset = "" subjectset = "" requestorset = "" if subject: subjectset = " subject=" + escape_shell_arg(subject) recidset = " CF-RecordID=" + escape_shell_arg(str(recordid)) if text: textset = " text=" + escape_shell_arg(text) if priority: priorityset = " priority=" + escape_shell_arg(str(priority)) if queue: queueset = " queue=" + escape_shell_arg(queue) if requestor: requestorset = " requestor=" + escape_shell_arg(requestor) if owner: #get the owner name from prefs ownerprefs = invenio.webuser.get_user_preferences(owner) if ownerprefs.has_key("bibcatalog_username"): owner = ownerprefs["bibcatalog_username"] ownerset = " owner=" + escape_shell_arg(owner) #make a command.. note that all set 'set' parts have been escaped command = CFG_BIBCATALOG_SYSTEM_RT_CLI + " create -t ticket set " + subjectset + recidset + \ queueset + textset + priorityset + ownerset + requestorset passwd = escape_shell_arg(passwd) #make a call.. passwd and command have been escaped (see above) dummy, myout, dummyerr = run_shell_command("echo "+passwd+" | " + command) inum = -1 for line in myout.split("\n"): if line.count(' ') > 0: stuff = line.split(' ') try: inum = int(stuff[2]) except: pass if inum > 0: return inum return None
def test_insert_via_curl(self): """batchuploader - robotupload insert via CLI curl""" curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml') open(curl_input_file, "w").write(self.marcxml) try: result = run_shell_command('/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"', [curl_input_file, self.nonce_url, make_user_agent_string('BatchUploader')])[1] self.failUnless("[INFO]" in result) current_task = get_last_taskid() run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)]) results = json.loads(open(self.callback_result_path).read()) self.failUnless('results' in results, '"%s" did not contained [INFO]' % result) self.assertEqual(len(results['results']), 1) self.assertEqual(results['nonce'], "1234") self.failUnless(results['results'][0]['success']) self.failUnless(results['results'][0]['recid'] > 0) self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml']) finally: os.remove(curl_input_file)
def ticket_submit(self, uid, subject, recordid, text="", queue="", priority="", owner=""): """creates a ticket. return ticket num on success, otherwise None""" if not CFG_BIBCATALOG_SYSTEM_RT_URL: return None (username, passwd) = get_bibcat_from_prefs(uid) httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//") BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath #set as env var os.environ["RTUSER"] = username os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER queueset = "" textset = "" priorityset = "" ownerset = "" subjectset = "" if subject: subjectset = " subject=" + escape_shell_arg(subject) recidset = " CF-RecordID=" + escape_shell_arg(str(recordid)) if text: textset = " text=" + escape_shell_arg(text) if priority: priorityset = " priority=" + escape_shell_arg(str(priority)) if queue: queueset = " queue=" + escape_shell_arg(queue) if owner: #get the owner name from prefs ownerprefs = invenio.webuser.get_user_preferences(owner) if ownerprefs.has_key("bibcatalog_username"): owner = ownerprefs["bibcatalog_username"] ownerset = " owner=" + escape_shell_arg(owner) #make a command.. note that all set 'set' parts have been escaped command = CFG_BIBCATALOG_SYSTEM_RT_CLI + " create -t ticket set " + subjectset + recidset + \ queueset + textset + priorityset + ownerset passwd = escape_shell_arg(passwd) #make a call.. passwd and command have been escaped (see above) dummy, myout, dummyerr = run_shell_command("echo " + passwd + " | " + command) inum = -1 for line in myout.split("\n"): if line.count(' ') > 0: stuff = line.split(' ') try: inum = int(stuff[2]) except: pass if inum > 0: return inum return None
def get_my_pid(process, args=''): if sys.platform.startswith('freebsd'): command = "ps -o pid,args | grep '%s %s' | grep -v 'grep' | sed -n 1p" % (process, args) else: command = "ps -C %s o '%%p%%a' | grep '%s %s' | grep -v 'grep' | sed -n 1p" % (process, process, args) answer = run_shell_command(command)[1].strip() if answer == '': answer = 0 else: answer = answer[:answer.find(' ')] return int(answer)
def load_submission(doctype, dump, method=None): "Insert submission into database. Return tuple(error code, msg)" # NOT TESTED messages = [] def guess_dump_method(dump): """Guess which method was used to dump this file (i.e. if it contains all the submission rows or not)""" match_obj = re_method_pattern.search(dump) if match_obj: return match_obj.group('method') else: return None def guess_dump_has_delete_statements(dump): """Guess if given submission dump already contain delete statements""" return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower() if not method: method = guess_dump_method(dump) if method is None: method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD messages.append( "WARNING: method could not be guessed. Using method %s" % method) else: messages.append("Used method %s to load data" % method) (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR) dump_fd = open(dump_path, 'w') dump_fd.write(dump) dump_fd.close() # We need to remove the submission. But let's create a backup first. submission_backup = dump_submission(doctype, method) submission_backup_path = "%s_db_dump%s.sql" % ( doctype, time.strftime("%Y%m%d_%H%M%S")) fd = file(os.path.join(CFG_TMPDIR, submission_backup_path), "w") fd.write(submission_backup) fd.close() if not guess_dump_has_delete_statements(dump): remove_submission(doctype, method) # Load the dump (exit_code, out_msg, err_msg) = run_shell_command("%s/bin/dbexec < %s", (CFG_PREFIX, os.path.abspath(dump_path))) if exit_code: messages.append("ERROR: failed to load submission:" + err_msg) return (1, messages) messages.append("Submission loaded. Previous submission saved to %s" % os.path.join(CFG_TMPDIR, submission_backup_path)) return (0, messages)
def get_text_snippets(textfile_path, patterns, nb_chars, max_snippets): """ Extract text snippets around 'patterns' from the file found at 'textfile_path'. The snippets are meant to look similar to results of popular Internet search engines: using " ... " between snippets. For empty patterns it returns "" """ """ TODO: - distinguish the beginning of sentences and make the snippets start there - optimize finding patterns - first search for patterns apperaing next to each other, secondly look for each patten not for first occurances of any pattern """ if len(patterns) == 0: return "" max_lines = nb_chars / 40 + 2 # rule of thumb in order to catch nb_chars # Produce the big snippets from which the real snippets will be cut out cmd = "grep -i -C%s -m%s" cmdargs = [str(max_lines), str(max_snippets)] for p in patterns: cmd += " -e %s" cmdargs.append(" " + p) cmd += " %s" cmdargs.append(textfile_path) (dummy1, output, dummy2) = run_shell_command(cmd, cmdargs) # a fact to keep in mind with this call to grep is that if patterns appear # in two contigious lines, they will not be separated by '--' and therefore # treated as one 'big snippet' result = [] big_snippets = output.split("--") # cut the snippets to match the nb_words_around parameter precisely: for s in big_snippets: small_snippet = cut_out_snippet(s, patterns, nb_chars) result.append(small_snippet) # combine snippets out = "" count = 0 for snippet in result: if snippet and count < max_snippets: if out: out += "..." out += highlight(snippet, patterns, whole_word_matches=True) return out
def prettify_xml(filepath): """ Will prettify an XML file for better readability. Returns the new, pretty, file. """ new_filename = "%s_pretty.xml" % (os.path.splitext(filepath)[0],) cmd = "xmllint --format %s" % (filepath,) exit_code, std_out, err_msg = run_shell_command(cmd=cmd, filename_out=new_filename) if exit_code: write_message("\nError caught: %s" % (err_msg,)) task_update_status("CERROR") return return new_filename
def version(separator='\n'): """ Try to detect Apache version by localizing httpd or apache executables and grepping inside binaries. Return list of all found Apache versions and paths. (For a given executable, the returned format is 'apache_version [apache_path]'.) Return empty list if no success. """ from invenio.inveniocfg import _grep_version_from_executable from invenio.shellutils import run_shell_command out = [] dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache") for apache in cmd_out.split("\n"): apache_version = _grep_version_from_executable(apache, '^Apache\/') if apache_version: out.append("%s [%s]" % (apache_version, apache)) return separator.join(out)
def task_run_core(recid): pdf = look_for_fulltext(recid) write_message('pdf: %s' % pdf) if pdf: tmpfd, tmppath = mkstemp(prefix="plotextractor-", suffix=".pdf") try: # tmpfd is being closed by copyfileobj copyfileobj(open(pdf), os.fdopen(tmpfd,'w')) (exit_code, output_buffer,stderr_output_buffer) = run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + tmppath) plotextracted_pdf_path = tmppath + ".extracted/extracted.json" code, output_vector, extracted = merging_articles(None, plotextracted_pdf_path) finally: os.remove(tmppath) try: id_fulltext = get_fieldvalues([recid], "037_a")[0] except IndexError: id_fulltext = "" create_MARCXML(output_vector, id_fulltext, code, extracted, write_file=True)
def convert_images(image_list): """ Here we figure out the types of the images that were extracted from the tarball and determine how to convert them into PNG. @param: image_list ([string, string, ...]): the list of image files extracted from the tarball in step 1 @return: image_list ([str, str, ...]): The list of image files when all have been converted to PNG format. """ png_output_contains = 'PNG image' ret_list = [] for image_file in image_list: if os.path.isdir(image_file): continue # FIXME: here and everywhere else in the plot extractor # library the run shell command statements should be (1) # called with timeout in order to prevent runaway imagemagick # conversions; (2) the arguments should be passed properly so # that they are escaped. dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,)) if cmd_out.find(png_output_contains) > -1: ret_list.append(image_file) else: # we're just going to assume that ImageMagick can convert all # the image types that we may be faced with # for sure it can do EPS->PNG and JPG->PNG and PS->PNG # and PSTEX->PNG converted_image_file = get_converted_image_name(image_file) try: dummy1, cmd_out, cmd_err = run_process_with_timeout('convert %s %s'\ % (image_file, \ converted_image_file), shell = True) if cmd_err == '': ret_list.append(converted_image_file) else: write_message('convert failed on ' + image_file) except Timeout: write_message('convert timed out on ' + image_file) return ret_list
def process_pdf(pdf, id): write_message("process pdf") (exit_code, output_buffer, stderr_output_buffer) = run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf) plotextracted_pdf_path = pdf + ".extracted/extracted.json" (code, message, dummy, list_of_figures_from_pdf) = getFigureVectors('', plotextracted_pdf_path) extracted = pdf + ".extracted" # Create MARCXML from json file # @param extracted - output file with the MARCXML marc_path = create_MARCXML(list_of_figures_from_pdf, id, code, extracted, True) write_message("end process pdf") now = datetime.datetime.now() stderr_output_buffer = "[" + str( now ) + "]: The Pdf extractor for the file " + pdf + " has an error. The traceback:\n" + stderr_output_buffer return (exit_code, stderr_output_buffer, plotextracted_pdf_path, marc_path)
def read_metadata_local(inputfile, verbose): """ Metadata extraction from many kind of files @param inputfile: path to the image @type inputfile: string @param verbose: verbosity @type verbose: int @rtype: dict @return: dictionary with metadata """ cmd = CFG_PATH_PDFTK + ' %s dump_data' (exit_status, output_std, output_err) = \ run_shell_command(cmd, args=(inputfile,)) metadata_dict = {} key = None value = None for metadata_line in output_std.splitlines(): if metadata_line.strip().startswith("InfoKey"): key = metadata_line.split(':', 1)[1].strip() elif metadata_line.strip().startswith("InfoValue"): value = metadata_line.split(':', 1)[1].strip() if key in ["ModDate", "CreationDate"]: # FIXME: Interpret these dates? try: pass #value = datetime.strptime(value, "D:%Y%m%d%H%M%S%Z") except: pass if key: metadata_dict[key] = value key = None else: try: custom_key, custom_value = metadata_line.split(':', 1) metadata_dict[custom_key.strip()] = custom_value.strip() except: # Most probably not relevant line pass return metadata_dict