def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     # Write the test MARCXML to a temp file; a with-block closes the
     # handle deterministically instead of leaking it via open().write().
     with open(curl_input_file, "w") as fdesc:
         fdesc.write(self.marcxml)
     try:
         # PUT the file to the nonce URL; run_shell_command escapes the
         # %s-substituted arguments. Index [1] is the command's stdout.
         result = run_shell_command(
             '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
             [
                 curl_input_file, self.nonce_url,
                 make_user_agent_string('BatchUploader')
             ])[1]
         self.failUnless("[INFO]" in result)
         # Run the bibupload task that the robotupload request queued.
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                           [str(current_task)])
         # The server reports the outcome to the callback; read it back.
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         self.failUnless('results' in results,
                         '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in results['results'][0]['marcxml'],
             results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     # Close the temp-file handle deterministically instead of leaking it
     # through open(...).write(...).
     with open(curl_input_file, "w") as fdesc:
         fdesc.write(self.marcxml)
     try:
         ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command(
             "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
             [
                 curl_input_file, self.callback_url,
                 self.legacy_url,
                 make_user_agent_string('BatchUploader')
             ])
         self.failUnless(
             "[INFO]" in result,
             '[INFO] not find in results: %s, %s' % (result, err))
         # Run the queued bibupload task, then inspect the callback file.
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                           [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         self.failUnless('results' in results,
                         '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>"""
             in results['results'][0]['marcxml'],
             results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, "curl_test.xml")
     # Close the temp-file handle deterministically instead of leaking it
     # through open(...).write(...).
     with open(curl_input_file, "w") as fdesc:
         fdesc.write(self.marcxml)
     try:
         ## curl -F '[email protected]' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command(
             "/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s",
             [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string("BatchUploader")],
         )
         self.failUnless("[INFO]" in result, "[INFO] not find in results: %s, %s" % (result, err))
         # Run the queued bibupload task, then inspect the callback file.
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
         with open(self.callback_result_path) as callback_file:
             results = json.loads(callback_file.read())
         self.failUnless("results" in results, '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results["results"]), 1)
         self.assertEqual(results["nonce"], "1234")
         self.failUnless(results["results"][0]["success"])
         self.failUnless(results["results"][0]["recid"] > 0)
         self.failUnless(
             """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"],
             results["results"][0]["marcxml"],
         )
     finally:
         os.remove(curl_input_file)
 def test_get_defaults(self):
     """plotextractor - get defaults"""
     # Expected values for the ATLANTIS demo site.
     expected_sdir = os.path.join(CFG_TMPDIR, self.arXiv_id + "_plots")
     expected_refno = "15"  # Note: For ATLANTIS DEMO site
     sdir, refno = get_defaults(tarball=self.tarball, sdir=None, refno_url=CFG_SITE_URL)
     # Clean up the scratch dir the call may have created.
     if sdir is not None:
         run_shell_command("rm -rf %s" % (sdir,))
     self.assertTrue(sdir == expected_sdir, "didn't get correct default scratch dir")
     self.assertTrue(refno == expected_refno, "didn't get correct default reference number")
# Example #5
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
                   upload_plots=True, force=False, squash=False):
    '''
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one
        of the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): true iff the per-record MARC file should be
        recreated even when it already exists
    @param: squash (boolean): true iff output goes into the shared
        SQUASHED_FILE in sdir instead of a per-record XML file

    @return: None
    '''

    sub_dir, refno = get_defaults(tarball, sdir)

    if not squash:
        marc_name = os.path.join(sub_dir, refno + '.xml')
    else:
        marc_name = os.path.join(sdir, SQUASHED_FILE)

    # Truncate/create only the per-record file; the squashed file is shared
    # across records and must not be truncated here.
    if (force or not os.path.exists(marc_name)) and not squash:
        open(marc_name, 'w').close()

    if xtract_text:
        extract_text(tarball)

    image_list, tex_files = untar(tarball, sub_dir)
    # Covers [], None and any other empty result (avoids a later
    # IndexError on tex_files[0] for empty non-list returns).
    if not tex_files:
        write_message(os.path.split(tarball)[-1] + ' is not a tarball')
        # Pass the path as an argument so run_shell_command escapes it.
        run_shell_command('rm -r %s', (sub_dir,))
        return

    converted_image_list = convert_images(image_list)

    # Seed with one empty entry, matching the format extract_captions emits.
    images_and_captions_and_labels = [['', '', []]]
    for tex_file in tex_files:
        images_and_captions_and_labels.extend(extract_captions(tex_file,
                                               sub_dir, converted_image_list))

    marc_name = create_MARC(images_and_captions_and_labels, tex_files[0],
                            refno, converted_image_list, marc_name)
    if marc_name is not None and not squash:
        write_message('generated ' + marc_name)
        if upload_plots:
            upload_to_site(marc_name)

    clean_up(image_list)

    write_message('work complete on ' + os.path.split(tarball)[-1])
# Example #6
def extract_plots_from_latex(url_tarball):
    """
    Download the tarball behind url_tarball, run the plotextractor CLI on
    it, and return the path of the MARCXML file it is expected to produce.

    @param url_tarball: URL of the LaTeX source tarball to process
    @return: path '<tarball>_plots/<name>.xml' written by plotextractor
        (existence of the file is not verified here)
    """
    tarball = download_url(url_tarball)
    path, name = os.path.split(url_tarball)
    # NOTE(review): this assumes download_url returns a path prefix to
    # which the remote file name must be appended -- confirm its contract.
    tarball += name

    # Pass the tarball path as an argument so run_shell_command escapes it.
    run_shell_command(CFG_BINDIR + '/plotextractor -t %s', (tarball,))

    path, name = os.path.split(tarball)
    plotextracted_xml_path = tarball + '_plots/' + name + '.xml'
    return plotextracted_xml_path
 def test_simple_insert(self):
     """batchuploader - robotupload simple insert"""
     # Fire the prepared robotupload request and check the server reply.
     response = urllib2.urlopen(self.req).read()
     self.failUnless("[INFO]" in response)
     # Execute the bibupload task the request queued.
     task_id = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(task_id)])
     # A new record must have appeared, carrying the expected title.
     newest_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
     self.failIfEqual(self.last_recid, newest_recid)
     rec = get_record(newest_recid)
     self.assertEqual(rec['245'][0][0], [('a', 'The title')])
# Example #8
    def check_system(self, uid=None):
        """Return an error string if the RT setup is broken, '' when healthy.

        Validates the RT credentials, the CFG_BIBCATALOG_* configuration,
        the RT CLI binary, and connectivity to the RT server, in that order.

        @param uid: optional Invenio user id; when omitted the configured
            default RT account is used
        @return: empty string on success, otherwise a description of the
            first problem found
        """
        if uid:
            rtuid, rtpw = get_bibcat_from_prefs(uid)
        else:
            # Assume default RT user
            rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD

        # NOTE(review): this only triggers when BOTH are missing; 'or' may
        # have been intended -- confirm before tightening.
        if not rtuid and not rtpw:
            return "No valid RT user login specified"

        if not CFG_BIBCATALOG_SYSTEM == 'RT':
            return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
        if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
        if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
            # BUGFIX: grammar in the message ("does not exists" -> "does not exist").
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exist"

        # Check that you can execute the binary.. this is a safe call unless someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
        dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                                " help")
        helpfound = False
        if myerr.count("help") > 0:
            helpfound = True
        if not helpfound:
            return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"

        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
        # Construct URL, split RT_URL at //
        if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
           not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
            # BUGFIX: the message referred to a non-existent setting
            # "CFG_BIBCATALOG__SYSTEM_RT_URL" (double underscore).
            return "CFG_BIBCATALOG_SYSTEM_RT_URL does not start with 'http://' or 'https://'"
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        # Assemble by http://user:password@RT_URL
        bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath

        # The RT CLI reads its user and server from environment variables.
        os.environ["RTUSER"] = rtuid
        os.environ["RTSERVER"] = bibcatalog_rt_server

        # Try to talk to the RT server; safe since rtpw is the only variable
        # in the command and it is escaped.
        rtpw = escape_shell_arg(rtpw)
        dummy, myout, myerr = run_shell_command("echo " + rtpw + " | " +
                                                CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                                " ls \"Subject like 'F00'\"")
        if len(myerr) > 0:
            return "could not connect to " + bibcatalog_rt_server + " " + myerr
        # Finally, check that there is some sane output like tickets or 'No matching result'
        saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
        if not saneoutput:
            return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
        return ""
# Example #9
    def test_get_defaults(self):
        # Expected scratch directory derives from the arXiv id.
        expected_dir = os.path.join(CFG_TMPDIR, self.arXiv_id + '_plots')

        scratch_dir, refno = get_defaults(self.tarball, None,
                                          "http://inspirebeta.net")
        # Remove the scratch dir the call may have created.
        if scratch_dir is not None:
            run_shell_command("rm -rf %s" % (scratch_dir,))
        self.assertTrue(scratch_dir == expected_dir,
                        "didn't get correct default scratch dir")
        self.assertTrue(refno == "812227",
                        "didn't get correct default reference number")
 def test_simple_insert(self):
     """batchuploader - robotupload simple insert"""
     # Submit the prepared request and verify the server acknowledged it.
     reply = urllib2.urlopen(self.req).read()
     self.failUnless("[INFO]" in reply)
     # Drain the queued bibupload task.
     queued_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                       [str(queued_task)])
     # The upload must have produced a brand-new record.
     latest_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
     self.failIfEqual(self.last_recid, latest_recid)
     uploaded = get_record(latest_recid)
     self.assertEqual(uploaded['245'][0][0], [('a', 'The title')])
# Example #11
def clean_up(image_list):
    '''
    Removes all the intermediate stuff.

    @param: image_list ([string, string, ...]): the images to remove

    NOTE: when running this for later upload, it's not a good idea to
        remove the converted images!
    '''
    # Deliberately disabled: deleting the images here would break the
    # later upload step.
    return # FIXME do not delete image files before upload
    # Unreachable while the early return above is in place.
    for img in image_list:
        run_shell_command('rm ' + img)
    def test_get_defaults(self):
        """plotextractor - get defaults"""
        expected_scratch_dir = os.path.join(CFG_TMPDIR,
                                            self.arXiv_id + '_plots')
        expected_refno = "812227"

        scratch_dir, refno = get_defaults(self.tarball, None,
                                          "http://inspirehep.net")
        # Drop the scratch dir the call may have created.
        if scratch_dir is not None:
            run_shell_command("rm -rf %s" % (scratch_dir,))
        self.assertTrue(scratch_dir == expected_scratch_dir,
                        "didn't get correct default scratch dir")
        self.assertTrue(refno == expected_refno,
                        "didn't get correct default reference number")
 def test_insert_with_callback(self):
     """batchuploader - robotupload insert with callback"""
     # Submit the request variant that registers a callback URL.
     response = urllib2.urlopen(self.req_callback).read()
     self.failUnless("[INFO]" in response,
                     '"%s" did not contained [INFO]' % response)
     # Run the queued bibupload task.
     task_id = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(task_id)])
     # Inspect what the server posted back to the callback endpoint.
     callback_data = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in callback_data)
     self.assertEqual(len(callback_data['results']), 1)
     first_result = callback_data['results'][0]
     self.failUnless(first_result['success'])
     self.failUnless(first_result['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
                     first_result['marcxml'])
# Example #14
def clean_up(extracted_files_list, image_list):
    """
    Removes all the intermediate stuff.

    @param: extracted_files_list ([string, string, ...]): list of all extracted files
    @param: image_list ([string, string, ...]): list of the images to keep

    """
    for extracted_file in extracted_files_list:
        # Remove everything that is not in the image_list or is not a directory.
        # The explicit truthiness check also skips empty strings, which the
        # old extracted_file[-1] indexing crashed on with IndexError.
        if extracted_file and \
           extracted_file not in image_list and \
           not extracted_file.endswith(os.sep):
            # Parameterized call so the path is shell-escaped.
            run_shell_command('rm %s', (extracted_file,))
 def check_system(self, uid):
     """Return an error string describing RT setup problems, or '' if OK.

     Checks, in order: the user's bibcatalog credentials, the
     CFG_BIBCATALOG_* configuration, that the RT CLI binary exists and
     responds to 'help', and that the RT server answers a trivial query.

     @param uid: Invenio user id whose bibcatalog credentials are used
     @return: empty string when everything works, otherwise a message
         describing the first problem found
     """
     user_pref = invenio.webuser.get_user_preferences(uid)
     if not user_pref.has_key('bibcatalog_username'):
         return "user " + str(uid) + " has no bibcatalog_username"
     rtuid = user_pref['bibcatalog_username']
     if not user_pref.has_key('bibcatalog_password'):
         return "user " + str(uid) + " has no bibcatalog_password"
     rtpw = user_pref['bibcatalog_password']
     if not CFG_BIBCATALOG_SYSTEM == 'RT':
         return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
     if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
         return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
     if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
         return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
     #check that you can execute it.. this is a safe call unless someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
     dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                             " help")
     # The RT CLI prints its usage (containing 'help') on stderr.
     helpfound = False
     if myerr.count("help") > 0:
         helpfound = True
     if not helpfound:
         return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
     if not CFG_BIBCATALOG_SYSTEM_RT_URL:
         return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
     #construct.. split RT_URL at //
     if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
        not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
         return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
     # Rebuild the URL as scheme://user:password@host/path for the CLI.
     httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
     BIBCATALOG_RT_SERVER = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
     #set as env var
     os.environ["RTUSER"] = rtuid
     os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
     #try to talk to RT server
     #this is a safe call since rtpw is the only variable in it, and it is escaped
     rtpw = escape_shell_arg(rtpw)
     dummy, myout, myerr = run_shell_command("echo " + rtpw + " | " +
                                             CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                             " ls \"Subject like 'F00'\"")
     if len(myerr) > 0:
         return "could not connect to " + BIBCATALOG_RT_SERVER + " " + myerr
     #finally, check that there is some sane output like tickets or 'No matching result'
     saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
     if not saneoutput:
         return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
     if not CFG_BIBCATALOG_QUEUES:
         return "CFG_BIBCATALOG_QUEUES not defined or empty"
     (username, dummy) = get_bibcat_from_prefs(uid)
     if (username is None):
         return "Cannot find user preference bibcatalog_username for uid " + str(
             uid)
     return ""
 def test_insert_with_callback(self):
     """batchuploader - robotupload insert with callback"""
     # Submit the request that registers a callback URL with the server.
     server_reply = urllib2.urlopen(self.req_callback).read()
     self.failUnless("[INFO]" in server_reply,
                     '"%s" did not contained [INFO]' % server_reply)
     # Execute the bibupload task that was queued for us.
     queued_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                       [str(queued_task)])
     # The callback payload reports exactly one successful record.
     payload = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in payload)
     self.assertEqual(len(payload['results']), 1)
     outcome = payload['results'][0]
     self.failUnless(outcome['success'])
     self.failUnless(outcome['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in outcome['marcxml'],
                     outcome['marcxml'])
# Example #17
    def check_system(self, uid=None):
        """Verify the RT backend end to end; return '' if OK, else an error string."""
        # Resolve credentials: per-user when a uid was given, otherwise the
        # configured default RT account.
        if uid:
            rtuid, rtpw = get_bibcat_from_prefs(uid)
        else:
            rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        if not rtuid and not rtpw:
            return "No valid RT user login specified"

        if CFG_BIBCATALOG_SYSTEM != 'RT':
            return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
        if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
        if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"

        # The binary must at least mention 'help' (on stderr) when asked for
        # help -- a safe call unless CFG_BIBCATALOG_SYSTEM_RT_CLI is faked.
        _, cli_out, cli_err = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
        if cli_err.count("help") == 0:
            return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"

        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
        if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
           not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
            return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
        # Rebuild the URL as scheme://user:password@host/path.
        scheme_part, host_part = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        rt_server_url = scheme_part + "//" + rtuid + ":" + rtpw + "@" + host_part

        # The RT CLI reads its target server and user from the environment.
        os.environ["RTUSER"] = rtuid
        os.environ["RTSERVER"] = rt_server_url

        # Safe call: rtpw is the only variable in the command and is escaped.
        rtpw = escape_shell_arg(rtpw)
        _, cli_out, cli_err = run_shell_command("echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
        if len(cli_err) > 0:
            return "could not connect to " + rt_server_url + " " + cli_err
        # Expect either matching tickets or a 'No matching result'-style reply.
        if cli_out.count('matching') == 0 and cli_out.count('1') == 0:
            return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + cli_out + " instead of 'matching' or '1'"
        return ""
# Example #18
 def test_get_defaults(self):
     """plotextractor - get defaults"""
     from invenio.shellutils import run_shell_command
     from invenio.plotextractor import get_defaults
     # Expected values for the ATLANTIS demo site.
     expected_sdir = os.path.join(CFG_TMPDIR, self.arXiv_id + '_plots')
     expected_refno = "15" # Note: For ATLANTIS DEMO site
     sdir, refno = get_defaults(tarball=self.tarball, sdir=None, refno_url=CFG_SITE_URL)
     # Remove the scratch directory the call may have created.
     if sdir is not None:
         run_shell_command("rm -rf %s" % (sdir,))
     self.assertTrue(sdir == expected_sdir,
                     "didn't get correct default scratch dir")
     self.assertTrue(refno == expected_refno,
                     "didn't get correct default reference number")
# Example #19
def _check_enough_ram():
    """
    Return if there is enough RAM, i.e. if free outputs more than 1G of ram.

    Tries the Linux 'free' output first, then falls back to the BSD/macOS
    'sysctl -n hw.memsize' probe; returns False when neither works.
    """
    from invenio.shellutils import run_shell_command
    try:
        # Row 1, column 1 of 'free' is total memory in KiB; compare to 1 GiB.
        return int(re.sub(r'\s+', ' ', run_shell_command("free")[1].splitlines()[1]).split(' ')[1]) > 1024 * 1024
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        ## Are we really on Linux? Maybe on a BSD system?
        try:
            # NOTE(review): hw.memsize reports bytes, so this threshold is
            # 1 MiB, not 1 GiB as in the branch above -- confirm intent.
            return int(run_shell_command("sysctl -n hw.memsize")[1]) > 1024 * 1024
        except Exception:
            # Still no luck
            return False
# Example #20
def _check_enough_ram():
    """
    Return if there is enough RAM, i.e. if free outputs more than 1G of ram.

    Tries the Linux 'free' output first, then falls back to the BSD/macOS
    'sysctl -n hw.memsize' probe; returns False when neither works.
    """
    from invenio.shellutils import run_shell_command
    try:
        # Row 1, column 1 of 'free' is total memory in KiB; compare to 1 GiB.
        return int(re.sub(r'\s+', ' ', run_shell_command("free")[1].splitlines()[1]).split(' ')[1]) > 1024 * 1024
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        ## Are we really on Linux? Maybe on a BSD system?
        try:
            # NOTE(review): hw.memsize reports bytes, so this threshold is
            # 1 MiB, not 1 GiB as in the branch above -- confirm intent.
            return int(run_shell_command("sysctl -n hw.memsize")[1]) > 1024 * 1024
        except Exception:
            # Still no luck
            return False
 def check_system(self, uid):
     """Return an error string describing RT setup problems, or '' if OK.

     Checks, in order: the user's bibcatalog credentials, the
     CFG_BIBCATALOG_* configuration, that the RT CLI binary exists and
     responds to 'help', and that the RT server answers a trivial query.

     @param uid: Invenio user id whose bibcatalog credentials are used
     @return: empty string when everything works, otherwise a message
         describing the first problem found
     """
     user_pref = invenio.webuser.get_user_preferences(uid)
     if not user_pref.has_key('bibcatalog_username'):
         return "user " + str(uid) + " has no bibcatalog_username"
     rtuid = user_pref['bibcatalog_username']
     if not user_pref.has_key('bibcatalog_password'):
         return "user " + str(uid) + " has no bibcatalog_password"
     rtpw = user_pref['bibcatalog_password']
     if not CFG_BIBCATALOG_SYSTEM == 'RT':
         return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
     if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
         return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
     if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
         return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"
     #check that you can execute it.. this is a safe call unless someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
     dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
     # The RT CLI prints its usage (containing 'help') on stderr.
     helpfound = False
     if myerr.count("help") > 0:
         helpfound = True
     if not helpfound:
         return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"
     if not CFG_BIBCATALOG_SYSTEM_RT_URL:
         return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
     #construct.. split RT_URL at //
     if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
        not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
         return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
     # Rebuild the URL as scheme://user:password@host/path for the CLI.
     httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
     BIBCATALOG_RT_SERVER = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath
     #set as env var
     os.environ["RTUSER"] = rtuid
     os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
     #try to talk to RT server
     #this is a safe call since rtpw is the only variable in it, and it is escaped
     rtpw = escape_shell_arg(rtpw)
     dummy, myout, myerr = run_shell_command("echo "+rtpw+" | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")
     if len(myerr) > 0:
         return "could not connect to " + BIBCATALOG_RT_SERVER + " " + myerr
     #finally, check that there is some sane output like tickets or 'No matching result'
     saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
     if not saneoutput:
         return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
     if not CFG_BIBCATALOG_QUEUES:
         return "CFG_BIBCATALOG_QUEUES not defined or empty"
     (username, dummy) = get_bibcat_from_prefs(uid)
     if (username is None):
         return "Cannot find user preference bibcatalog_username for uid "+str(uid)
     return ""
 def test_insert_with_oracle(self):
     """batchuploader - robotupload insert with oracle special treatment"""
     import os
     # Start from a clean error log so failures are attributable.
     error_log = '/opt/invenio/var/log/invenio.err'
     if os.path.exists(error_log):
         os.remove(error_log)
     response = urllib2.urlopen(self.req_oracle).read()
     self.failUnless("[INFO]" in response,
                     '"%s" did not contained "[INFO]"' % response)
     # Execute the queued bibupload task.
     task_id = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(task_id)])
     # The callback payload must report one successful record.
     callback_data = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in callback_data,
                     '"%s" did not contained "results" key' % callback_data)
     self.assertEqual(len(callback_data['results']), 1)
     first_result = callback_data['results'][0]
     self.failUnless(first_result['success'])
     self.failUnless(first_result['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in first_result['marcxml'],
                     first_result['marcxml'])
 def test_insert_with_oracle(self):
     """batchuploader - robotupload insert with oracle special treatment"""
     import os
     # Remove any stale error log before submitting.
     if os.path.exists('/opt/invenio/var/log/invenio.err'):
         os.remove('/opt/invenio/var/log/invenio.err')
     reply = urllib2.urlopen(self.req_oracle).read()
     self.failUnless("[INFO]" in reply,
                     '"%s" did not contained "[INFO]"' % reply)
     # Drain the queued bibupload task, then read the callback payload.
     queued_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR,
                       [str(queued_task)])
     payload = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in payload,
                     '"%s" did not contained "results" key' % payload)
     self.assertEqual(len(payload['results']), 1)
     outcome = payload['results'][0]
     self.failUnless(outcome['success'])
     self.failUnless(outcome['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in outcome['marcxml'],
                     outcome['marcxml'])
# Example #24
def convert_images(image_list):
    '''
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    '''

    png_output_contains = 'PNG image data'

    ret_list = []

    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # Filenames are now passed as parameters so run_shell_command
        # shell-escapes them (this addresses the escaping half of the old
        # FIXME; adding timeouts against runaway ImageMagick conversions
        # is still TODO).
        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if cmd_out.find(png_output_contains) > -1:
            # Already a PNG -- keep as-is.
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)

            dummy1, cmd_out, cmd_err = run_shell_command(
                'convert %s %s', (image_file, converted_image_file))
            if cmd_err == '':
                ret_list.append(converted_image_file)
            else:
                write_message('convert failed on ' + image_file)

    return ret_list
# Example #25
 def _normalize_article_dir_with_dtd(self, path):
     """
     main.xml from Elsevier assume the existence of a local DTD.
     This procedure install the DTDs next to the main.xml file
     and normalize it using xmllint in order to resolve all namespaces
     and references.

     @param path: directory containing the Elsevier package's main.xml
     @raise ValueError: when main.xml matches neither the art520 nor the
         art501 schema, or when xmllint reports an error
     """
     # Already normalized by a previous run -- nothing to do.
     if exists(join(path, 'resolved_main.xml')):
         return
     # Install the DTD bundle matching the article schema version, moving
     # its files up next to main.xml.
     if 'art520' in open(join(path, 'main.xml')).read():
         ZipFile(CFG_ELSEVIER_ART520_PATH).extractall(path)
         for filename in listdir(join(path, 'art520')):
             rename(join(path, 'art520', filename), join(path, filename))
     elif 'art501' in open(join(path, 'main.xml')).read():
         ZipFile(CFG_ELSEVIER_ART501_PATH).extractall(path)
         for filename in listdir(join(path, 'art501')):
             rename(join(path, 'art501', filename), join(path, filename))
     else:
         message = "It looks like the path " + path
         message += "does not contain an si520 or si501 main.xml file"
         self.logger.error(message)
         raise ValueError(message)
     # --loaddtd makes xmllint resolve entities/references against the DTDs
     # installed above; the normalized output goes to resolved_main.xml.
     command = "xmllint --format --loaddtd " + join(path, 'main.xml')
     command += " --output " + join(path, 'resolved_main.xml')
     dummy, dummy, cmd_err = run_shell_command(command)
     if cmd_err:
         message = "Error in cleaning %s: %s" % (
             join(path, 'main.xml'), cmd_err)
         self.logger.error(message)
         raise ValueError(message)
# Example #26
    def _run_rt_command(self, command, uid=None):
        """
        This function will run a RT CLI command as given user. If no user is specified
        the default RT user will be used, if configured.

        Returns None when CFG_BIBCATALOG_SYSTEM_RT_URL is not configured.
        Otherwise it returns the standard output from the CLI command.

        @param command: RT CLI command to execute
        @type command: string

        @param uid: the Invenio user id to submit on behalf of. Optional.
        @type uid: int

        @return: standard output from the command given; None when the RT
            URL is not configured.
        @rtype: string

        @raise ValueError: when the command exits with a non-zero code.
        """
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        if uid:
            username, passwd = get_bibcat_from_prefs(uid)
        else:
            username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        # Rebuild the URL as scheme://user:password@host/path for the CLI.
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        bibcatalog_rt_server = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        #set as env var
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = bibcatalog_rt_server
        # The password is piped to the command on stdin, shell-escaped.
        passwd = escape_shell_arg(passwd)
        error_code, myout, error_output = run_shell_command("echo " + passwd + " | " + command)
        if error_code > 0:
            raise ValueError('Problem running "%s": %d - %s' %
                             (command, error_code, error_output))
        return myout
    def ticket_set_attribute(self, uid, ticketid, attribute, new_value):
        """change the ticket's attribute. Returns 1 on success, 0 on failure

        @param uid: Invenio user id whose RT credentials are used
        @param ticketid: numeric RT ticket id
        @param attribute: one of BibCatalogSystem.TICKET_ATTRIBUTES
        @param new_value: the value to set
        @return: 1 when RT reports the ticket was updated, else 0
        """
        #check that the attribute is accepted..
        if attribute not in BibCatalogSystem.TICKET_ATTRIBUTES:
            return 0
        #we cannot change read-only values.. including text that is an attachment. pity
        if attribute in ['creator', 'date', 'ticketid', 'url_close', 'url_display', 'recordid', 'text']:
            return 0
        # Build the " set <field>=<value>" suffix for the RT CLI. Every
        # branch starts with a leading space: without it the command was
        # glued together as "ticket/<id>set ..." (BUGFIX for the
        # 'priority' and 'subject' branches, which lacked the space).
        setme = ""
        if (attribute == 'priority'):
            try:
                dummy = int(new_value)
            except (ValueError, TypeError):
                # Priority must be numeric.
                return 0
            setme = " set Priority=" + str(new_value)
        if (attribute == 'subject'):
            subject = escape_shell_arg(new_value)
            setme = " set Subject='" + subject + "'"

        if (attribute == 'owner'):
            #convert from invenio to RT
            ownerprefs = invenio.webuser.get_user_preferences(new_value)
            if not ownerprefs.has_key("bibcatalog_username"):
                return 0
            else:
                owner = escape_shell_arg(ownerprefs["bibcatalog_username"])
            setme = " set owner='" + owner + "'"

        if (attribute == 'status'):
            setme = " set status='" + escape_shell_arg(new_value) + "'"

        if (attribute == 'queue'):
            setme = " set queue='" + escape_shell_arg(new_value) + "'"

        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return 0
        #make sure ticketid is numeric
        try:
            dummy = int(ticketid)
        except (ValueError, TypeError):
            return 0
        (username, passwd) = get_bibcat_from_prefs(uid)
        # Rebuild the URL as scheme://user:password@host/path for the CLI.
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        #set as env var
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
        passwd = escape_shell_arg(passwd)
        #make a call. safe since passwd and all variables in 'setme' have been escaped
        dummy, myout, dummyerr = run_shell_command("echo " + passwd + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " edit ticket/" + str(ticketid) + setme)
        # RT acknowledges a successful edit with a line containing 'updated'.
        respOK = False
        mylines = myout.split("\n")
        for line in mylines:
            if line.count('updated') > 0:
                respOK = True
        if respOK:
            return 1
        return 0
# Example #28
 def test_run_cmd_timeout_no_zombie(self):
     """shellutils - running simple command no zombie"""
     t1 = time.time()
     # The helper must raise Timeout after 5s instead of waiting 15s.
     self.assertRaises(Timeout, run_process_with_timeout,
                       (self.script_path, '15', "THISISATEST"), timeout=5)
     # Neither the marker process nor its sleep child may survive.
     process_listing = run_shell_command('ps aux')[1]
     self.failIf('THISISATEST' in process_listing)
     self.failIf('sleep' in process_listing)
# Example #29
def convert_xml_using_saxon(source_file, template_file):
    """
    Try to convert the given source file (full path) using XSLT 2.0 Java
    libraries.

    Looks for the given XSLT stylesheet/template file (relative path) in
    CFG_BIBCONVERT_XSL_PATH.

    Path to the converted file is derived from the DOI in the same directory
    as the source, as decided inside the template file.

    For example: /path/to/sourcedir/10.1103_PhysRevA.87.052320.xml

    @raise: APSHarvesterConversionError if Java saxon9he-xslt returns error.

    @return: True on success.
    """
    # Resolve a relative stylesheet path against the configured XSL directory.
    if not os.path.isabs(template_file):
        template_file = CFG_BIBCONVERT_XSL_PATH + os.sep + template_file
    # Run saxon from the source directory so relative output paths land there.
    command = "cd %s && saxon9he-xslt -s:%s -xsl:%s -dtd:off" % (
        os.path.dirname(source_file), source_file, template_file)
    exit_code, stdout_buffer, stderr_buffer = run_shell_command(cmd=command)
    # saxon is expected to be completely silent on success, so any output
    # (or a non-zero exit code) is treated as a conversion failure.
    if exit_code or stdout_buffer or stderr_buffer:
        raise APSHarvesterConversionError("%s: %s\nOut:%s" %
                                          (exit_code,
                                          stderr_buffer,
                                          stdout_buffer))
Пример #30
0
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.

    On any failure (missing mysqldump binary, non-zero exit code, or
    unexpected output) the task status is set to ERROR and the process
    exits with code 1.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    # All connection parameters are shell-escaped before interpolation.
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    # stdout is redirected into the dump file, so any stdout/stderr that
    # still comes back signals a problem.
    exit_code, cmd_stdout, cmd_stderr = run_shell_command(
        cmd, None, dirname + os.sep + filename)
    if exit_code:
        write_message("ERROR: mysqldump exit code is %s." % repr(exit_code),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if cmd_stdout:
        # BUGFIX: previously reported the exit code instead of stdout here.
        write_message("ERROR: mysqldump stdout is %s." % repr(cmd_stdout),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if cmd_stderr:
        # BUGFIX: previously reported the exit code instead of stderr here.
        write_message("ERROR: mysqldump stderr is %s." % repr(cmd_stderr),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    def _run_rt_command(self, command, uid=None):
        """
        This function will run a RT CLI command as given user. If no user is specified
        the default RT user will be used, if configured.

        Should any of the configuration parameters be missing this function will return
        None. Otherwise it will return the standard output from the CLI command.

        @param command: RT CLI command to execute
        @type command: string

        @param uid: the Invenio user id to submit on behalf of. Optional.
        @type uid: int

        @return: standard output from the command given. None, if any errors.
        @rtype: string

        @raise ValueError: if the RT CLI command exits with a non-zero code.
        """
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        if uid:
            username, passwd = get_bibcat_from_prefs(uid)
        else:
            username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        # Embed the credentials into the server URL for the RT CLI.
        BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        # The RT CLI reads its connection parameters from environment vars.
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
        # Escape the password before echoing it into the command pipe.
        passwd = escape_shell_arg(passwd)
        error_code, myout, dummyerr = run_shell_command("echo "+passwd+" | " + command)
        if error_code > 0:
            # Call-style raise: valid on both Python 2 and Python 3
            # (the old "raise ValueError, msg" form is py2-only).
            raise ValueError('Problem running "%s": %d' % (command, error_code))
        return myout
Пример #32
0
def get_list_of_all_matching_files(basedir, filetypes):
    """
    Crawl the directory tree rooted at basedir using the os module and
    find all the files therein whose 'file' output includes one of the
    given filetypes.  Returns a list of absolute paths to all files.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained in
        the output of running 'file' on the types of files we're looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """
    matches = []
    for current_dir, dummy_subdirs, names in os.walk(basedir):
        for name in names:
            candidate = os.path.join(current_dir, name)
            # Classify the file by content via the 'file' utility; the
            # path is passed as an escaped argument.
            file_output = run_shell_command("file %s", (candidate,))[1]
            # NOTE: a file matching several filetypes is (deliberately)
            # appended once per match, mirroring the historic behaviour.
            for wanted in filetypes:
                if file_output.find(wanted) > -1:
                    matches.append(candidate)
    return matches
Пример #33
0
    def _normalize_article_dir_with_dtd(self, path):
        """
        main.xml from Elsevier assume the existence of a local DTD.
        This procedure install the DTDs next to the main.xml file
        and normalize it using xmllint in order to resolve all namespaces
        and references.

        @param path: directory containing the article's main.xml
        @raise ValueError: if no known DTD flavour is found or xmllint fails.
        """
        if exists(join(path, 'resolved_main.xml')):
            # Already normalized: nothing to do.
            return
        # Read main.xml only once instead of once per DTD-flavour check.
        main_xml_content = open(join(path, 'main.xml')).read()
        if 'art520' in main_xml_content:
            dtd_archive, dtd_subdir = CFG_ELSEVIER_ART520_PATH, 'art520'
        elif 'art501' in main_xml_content:
            dtd_archive, dtd_subdir = CFG_ELSEVIER_ART501_PATH, 'art501'
        else:
            self.logger.error("It looks like the path %s does not contain an art520 or art501 main.xml file" % path)
            raise ValueError("It looks like the path %s does not contain an art520 or art501 main.xml file" % path)
        # Install the matching DTD files next to main.xml.
        ZipFile(dtd_archive).extractall(path)
        for filename in listdir(join(path, dtd_subdir)):
            rename(join(path, dtd_subdir, filename), join(path, filename))

        cmd_exit_code, cmd_out, cmd_err = run_shell_command("xmllint --format --loaddtd %s --output %s", (join(path, 'main.xml'), join(path, 'resolved_main.xml')))
        if cmd_err:
            # BUGFIX: the log message previously named 'issue.xml' although
            # the file being cleaned is main.xml.
            self.logger.error("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err))
            raise ValueError("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err))
def get_list_of_all_matching_files(basedir, filetypes):
    """
    This function uses the os module in order to crawl
    through the directory tree rooted at basedir and find all the files
    therein that include filetype in their 'file' output.  Returns a list
    of absolute paths to all files.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained in
        the output of running 'file' on the types of files we're looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """

    file_paths = []

    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            # Classify by content with the 'file' utility; the path is
            # passed as an argument so it gets shell-escaped.
            dummy1, cmd_out, dummy2 = run_shell_command('file %s', (full_path,))
            # NOTE(review): a file matching several filetypes is appended
            # once per match — confirm duplicates are acceptable to callers.
            for filetype in filetypes:
                if cmd_out.find(filetype) > -1:
                    file_paths.append(full_path)

    return file_paths
Пример #35
0
    def _normalize_article_dir_with_dtd(self, path):
        """
        TODO: main.xml from Springer assume the existence of a local DTD.
        This procedure install the DTDs next to the main.xml file
        and normalize it using xmllint in order to resolve all namespaces
        and references.

        Appends the directory holding the normalized copy to
        self.articles_normalized.
        """
        # Prefer the "*nlm.xml" file; fall back to "*.xml.scoap" files.
        files = [filename for filename in listdir(path) if "nlm.xml" in filename]
        if not files:
                files = [filename for filename in listdir(path)  if ".xml.scoap" in filename]
        if exists(join(path, 'resolved_main.xml')):
            # Already normalized: nothing to do.
            return

        # Pick the DTD package matching the flavour referenced in the XML.
        # NOTE(review): raises IndexError if `files` is empty — confirm
        # callers guarantee at least one candidate file.
        if 'JATS-archivearticle1.dtd' in open(join(path, files[0])).read():
            path_normalized = mkdtemp(prefix="scoap3_normalized_jats_", dir=CFG_TMPSHAREDDIR)
            ZipFile(CFG_SPRINGER_JATS_PATH).extractall(path_normalized)
        elif 'A++V2.4.dtd' in open(join(path, files[0])).read():
            path_normalized = mkdtemp(prefix="scoap3_normalized_app_", dir=CFG_TMPSHAREDDIR)
            ZipFile(CFG_SPRINGER_AV24_PATH).extractall(path_normalized)
        else:
            self.logger.error("It looks like the path %s does not contain an JATS-archivearticle1.dtd nor A++V2.4.dtd XML file." % path)
            raise ValueError("It looks like the path %s does not contain an JATS-archivearticle1.dtd nor A++V2.4.dtd XML file." % path)
        print >> sys.stdout, "Normalizing %s" % (files[0],)
        # Resolve namespaces/entities against the freshly extracted DTDs.
        cmd_exit_code, cmd_out, cmd_err = run_shell_command("xmllint --format --loaddtd %s --output %s", (join(path, files[0]), join(path_normalized, 'resolved_main.xml')))
        if cmd_err:
            self.logger.error("Error in cleaning %s: %s" % (join(path, 'issue.xml'), cmd_err))
            raise ValueError("Error in cleaning %s: %s" % (join(path, 'main.xml'), cmd_err))
        self.articles_normalized.append(path_normalized)
Пример #36
0
 def test_get_defaults(self):
     """plotextractor - get defaults"""
     # Expected scratch dir is derived from the arXiv id of the fixture.
     sdir_should_be = os.path.join(CFG_TMPSHAREDDIR,
                                   self.arXiv_id + '_plots')
     refno_should_be = "15"  # Note: For ATLANTIS DEMO site
     sdir, refno = get_defaults(tarball=self.tarball,
                                sdir=None,
                                refno_url=CFG_SITE_URL)
     # Clean up the scratch dir that get_defaults may have created
     # before asserting, so a failure does not leave litter behind.
     if sdir is not None:
         run_shell_command("rm -rf %s" % sdir)
     self.assertTrue(
         sdir == sdir_should_be,
         "didn't get correct default scratch dir: expected %s found %s" %
         (sdir_should_be, sdir))
     self.assertTrue(refno == refno_should_be,
                     "didn't get correct default reference number")
Пример #37
0
def find_matching_files(basedir, filetypes):
    """
    Try to find all files matching the given filetypes by looking at
    all the files and filenames in the given directory, including
    subdirectories.

    @param basedir: full path to base directory to search in
    @type basedir: string

    @param filetypes: list of filetypes, extensions
    @type filetypes: list

    @return: exitcode and any error messages as: (exitcode, err_msg)
    @rtype: tuple
    """
    matches = []
    for root, dummy_dirs, names in os.walk(basedir):
        for name in names:
            candidate = os.path.join(root, name)
            # Content-based classification via the 'file' utility.
            file_output = run_shell_command('file %s', (candidate, ))[1].lower()
            extension = name.split('.')[-1].lower()
            # A candidate matches either by 'file' output or by extension.
            for wanted in filetypes:
                if file_output.find(wanted) > -1:
                    matches.append(candidate)
                elif extension == wanted:
                    matches.append(candidate)
    return matches
Пример #38
0
def rotate_image(filename, line, sdir, image_list):
    '''
    Given a filename and a line, figure out what it is that the author
    wanted to do wrt changing the rotation of the image and convert the
    file so that this rotation is reflected in its presentation.

    @param: filename (string): the name of the file as specified in the TeX
    @param: line (string): the line where the rotate command was found

    @output: the image file rotated in accordance with the rotate command
    @return: True if something was rotated
    '''

    file_loc = get_image_location(filename, sdir, image_list)
    degrees = re.findall('(angle=[-\\d]+|rotate=[-\\d]+)', line)

    if len(degrees) < 1:
        # No angle=/rotate= directive on this line: nothing to do.
        return False

    degrees = degrees[0].split('=')[-1].strip()

    if file_loc == None or file_loc == 'ERROR' or\
            not re.match('-*\\d+', degrees):
        return False

    # Invert the sign: the rotation direction requested in the TeX source
    # is the opposite of mogrify's rotation direction.
    degrees = str(0-int(degrees))

    # Pass the arguments as a sequence so run_shell_command escapes them,
    # instead of interpolating an unescaped path into the command string.
    dummy, dummy, cmd_err = run_shell_command('mogrify -rotate %s %s',
                                              (degrees, file_loc))

    # BUGFIX: both branches used to return True; report failure when
    # mogrify produced output on stderr.
    if cmd_err != '':
        return False
    return True
Пример #39
0
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.

    The dump is piped through gzip.  Raises StandardError on any
    mysqldump failure (missing binary, non-zero exit code, or
    unexpected output).
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        msg = "ERROR: cannot find %s." % cmd
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)

    # All connection parameters are shell-escaped before interpolation;
    # the output is compressed through gzip on the fly.
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s | %s -c " % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME),
            CFG_PATH_GZIP)
    exit_code, cmd_stdout, cmd_stderr = run_shell_command(
        cmd, None, dirname + os.sep + filename)
    if exit_code:
        msg = "ERROR: mysqldump exit code is %s." % repr(exit_code)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stdout:
        # BUGFIX: previously reported the exit code instead of stdout here.
        msg = "ERROR: mysqldump stdout is %s." % repr(cmd_stdout)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stderr:
        # BUGFIX: previously reported the exit code instead of stderr here.
        msg = "ERROR: mysqldump stderr is %s." % repr(cmd_stderr)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
Пример #40
0
def find_matching_files(basedir, filetypes):
    """
    This functions tries to find all files matching given filetypes by looking at
    all the files and filenames in the given directory, including subdirectories.

    @param basedir: full path to base directory to search in
    @type basedir: string

    @param filetypes: list of filetypes, extensions
    @type filetypes: list

    @return: exitcode and any error messages as: (exitcode, err_msg)
    @rtype: tuple
    """
    files_list = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            # Content-based classification via the 'file' utility; the
            # path is passed as an argument so it gets shell-escaped.
            dummy1, cmd_out, dummy2 = run_shell_command('file %s', (full_path,))
            # A file matches either by 'file' output or by its extension.
            for filetype in filetypes:
                if cmd_out.lower().find(filetype) > -1:
                    files_list.append(full_path)
                elif filename.split('.')[-1].lower() == filetype:
                    files_list.append(full_path)
    return files_list
 def test_insert_with_nonce(self):
     """batchuploader - robotupload insert with nonce"""
     # Submit the prepared robotupload request carrying a nonce.
     result = urllib2.urlopen(self.req_nonce).read()
     self.failUnless("[INFO]" in result, '"%s" did not contained "[INFO]"' % result)
     # Run the bibupload task that the request scheduled.
     current_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
     # The callback endpoint dumps its JSON payload to a file; verify
     # the nonce was echoed back and the record was created.
     results = json.loads(open(self.callback_result_path).read())
     self.failUnless("results" in results, '"%s" did not contained "results" key' % results)
     self.assertEqual(len(results["results"]), 1)
     self.assertEqual(results["nonce"], "1234")
     self.failUnless(results["results"][0]["success"])
     self.failUnless(results["results"][0]["recid"] > 0)
     self.failUnless(
         """<subfield code="a">Doe, John</subfield>""" in results["results"][0]["marcxml"],
         results["results"][0]["marcxml"],
     )
Пример #42
0
def convert_xml_using_saxon(source_file, template_file):
    """
    Tries to convert given source file (full path) using XSLT 2.0 Java libraries.

    Looks for given XSLT stylesheet/template file (relative path) in
    CFG_BIBCONVERT_XSL_PATH.

    Path to converted file is derived from DOI in the same directory as source
    as decided inside the template file.

    For example: /path/to/sourcedir/10.1103_PhysRevA.87.052320.xml

    @raise: APSHarvesterConversionError if Java saxon9he-xslt returns error.

    @return: True on success.
    """
    # Resolve a relative stylesheet path against the configured XSL directory.
    if not os.path.isabs(template_file):
        template_file = CFG_BIBCONVERT_XSL_PATH + os.sep + template_file
    source_directory = os.path.dirname(source_file)
    # Run saxon from the source directory so relative output paths land there.
    command = "cd %s && saxon9he-xslt -s:%s -xsl:%s -dtd:off" % \
              (source_directory, source_file, template_file)
    exit_code, stdout_buffer, stderr_buffer = run_shell_command(cmd=command)
    # saxon is expected to be silent on success, so any output at all
    # (or a non-zero exit code) is treated as a failure.
    if exit_code or stdout_buffer or stderr_buffer:
        # Error may have happened
        raise APSHarvesterConversionError(
            "%s: %s\nOut:%s" % (exit_code, stderr_buffer, stdout_buffer))
Пример #43
0
 def test_run_cmd_hello_quote(self):
     """shellutils - running simple command with an argument with quote"""
     # The embedded single quote must survive the argument escaping
     # performed by run_shell_command.
     self.assertEqual((0, "hel'lo world\n", ''),
                      run_shell_command("echo %s %s", (
                          "hel'lo",
                          "world",
                      )))
Пример #44
0
def extract_plots_from_latex_and_pdf(url_tarball, url_pdf):
    """
    Download the article source tarball and PDF and run both plot
    extractors over them.

    @param url_tarball: URL of the arXiv source tarball
    @param url_pdf: URL of the article PDF

    @return: tuple (plotextracted_xml_path, plotextracted_pdf_path) with
        the expected locations of the two extractors' outputs.
    """
    tarball = download_url(url_tarball)
    path, name = os.path.split(url_tarball)
    # Give the downloaded tarball an arXiv-style name before extraction.
    tarball_with_arxiv_extension = tarball + '_arxiv' + name
    os.rename(tarball, tarball_with_arxiv_extension)
    # Pass the paths as arguments so run_shell_command escapes them,
    # instead of concatenating unescaped paths into the command string.
    run_shell_command(CFG_BINDIR + '/plotextractor -t %s',
                      (tarball_with_arxiv_extension,))

    pdf = download_url(url_pdf)
    run_shell_command(CFG_PDFPLOTEXTRACTOR_PATH + ' %s', (pdf,))

    path, name = os.path.split(tarball_with_arxiv_extension)
    plotextracted_xml_path = tarball_with_arxiv_extension + '_plots/' + name + '.xml'
    plotextracted_pdf_path = pdf + '.extracted/extracted.json'

    return plotextracted_xml_path, plotextracted_pdf_path
Пример #45
0
    def ticket_submit(self, uid=None, subject="", recordid=-1, text="", queue="",
        priority="", owner="", requestor=""):
        """creates a ticket. return ticket num on success, otherwise None"""
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        # Resolve RT credentials: per-user preferences, or the default
        # RT account configured for the installation.
        if uid:
            username, passwd = get_bibcat_from_prefs(uid)
        else:
            username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        #set as env var (the RT CLI reads its connection info from these)
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
        # Build the individual "set" fragments; each value is shell-escaped.
        queueset = ""
        textset = ""
        priorityset = ""
        ownerset = ""
        subjectset = ""
        requestorset = ""
        if subject:
            subjectset = " subject=" + escape_shell_arg(subject)
        recidset = " CF-RecordID=" + escape_shell_arg(str(recordid))
        if text:
            textset = " text=" + escape_shell_arg(text)
        if priority:
            priorityset = " priority=" + escape_shell_arg(str(priority))
        if queue:
            queueset = " queue=" + escape_shell_arg(queue)
        if requestor:
            requestorset = " requestor=" + escape_shell_arg(requestor)
        if owner:
            #get the owner name from prefs (owner is an Invenio uid here;
            #it is mapped to the RT username stored in the preferences)
            ownerprefs = invenio.webuser.get_user_preferences(owner)
            if ownerprefs.has_key("bibcatalog_username"):
                owner = ownerprefs["bibcatalog_username"]
                ownerset = " owner=" + escape_shell_arg(owner)
        #make a command.. note that all set 'set' parts have been escaped

        command = CFG_BIBCATALOG_SYSTEM_RT_CLI + " create -t ticket set " + subjectset + recidset + \
                  queueset + textset + priorityset + ownerset + requestorset

        passwd = escape_shell_arg(passwd)
        #make a call.. passwd and command have been escaped (see above)
        dummy, myout, dummyerr = run_shell_command("echo "+passwd+" | " + command)
        # Parse the new ticket number out of the CLI output: the last
        # line containing spaces has the id as its third token.
        inum = -1
        for line in myout.split("\n"):
            if line.count(' ') > 0:
                stuff = line.split(' ')
                try:
                    inum = int(stuff[2])
                except:
                    pass
        if inum > 0:
            return inum
        return None
 def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl"""
     # Write the fixture MARCXML to a temp file so curl can upload it.
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     open(curl_input_file, "w").write(self.marcxml)
     try:
         # Upload via curl -T (PUT) with the batchuploader user agent.
         result = run_shell_command('/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"', [curl_input_file, self.nonce_url, make_user_agent_string('BatchUploader')])[1]
         self.failUnless("[INFO]" in result)
         # Run the bibupload task that the request scheduled.
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
         # Verify the JSON payload the callback endpoint dumped to disk.
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
    def ticket_submit(self,
                      uid,
                      subject,
                      recordid,
                      text="",
                      queue="",
                      priority="",
                      owner=""):
        """creates a ticket. return ticket num on success, otherwise None"""
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        # Resolve the submitting user's RT credentials from preferences.
        (username, passwd) = get_bibcat_from_prefs(uid)
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        BIBCATALOG_RT_SERVER = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        #set as env var (the RT CLI reads its connection info from these)
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = BIBCATALOG_RT_SERVER
        # Build the individual "set" fragments; each value is shell-escaped.
        queueset = ""
        textset = ""
        priorityset = ""
        ownerset = ""
        subjectset = ""
        if subject:
            subjectset = " subject=" + escape_shell_arg(subject)
        recidset = " CF-RecordID=" + escape_shell_arg(str(recordid))
        if text:
            textset = " text=" + escape_shell_arg(text)
        if priority:
            priorityset = " priority=" + escape_shell_arg(str(priority))
        if queue:
            queueset = " queue=" + escape_shell_arg(queue)
        if owner:
            #get the owner name from prefs (owner is an Invenio uid here;
            #it is mapped to the RT username stored in the preferences)
            ownerprefs = invenio.webuser.get_user_preferences(owner)
            if ownerprefs.has_key("bibcatalog_username"):
                owner = ownerprefs["bibcatalog_username"]
                ownerset = " owner=" + escape_shell_arg(owner)
        #make a command.. note that all set 'set' parts have been escaped

        command = CFG_BIBCATALOG_SYSTEM_RT_CLI + " create -t ticket set " + subjectset + recidset + \
                  queueset + textset + priorityset + ownerset

        passwd = escape_shell_arg(passwd)
        #make a call.. passwd and command have been escaped (see above)
        dummy, myout, dummyerr = run_shell_command("echo " + passwd + " | " +
                                                   command)
        # Parse the new ticket number out of the CLI output: the last
        # line containing spaces has the id as its third token.
        inum = -1
        for line in myout.split("\n"):
            if line.count(' ') > 0:
                stuff = line.split(' ')
                try:
                    inum = int(stuff[2])
                except:
                    pass
        if inum > 0:
            return inum
        return None
Пример #48
0
 def test_run_cmd_timeout_no_zombie(self):
     """shellutils - running simple command no zombie"""
     # The command sleeps for 15s but is killed after the 5s timeout;
     # afterwards no trace of it should remain in the process table.
     # (Removed an unused `t1 = time.time()` left over from an earlier
     # version of this test.)
     self.assertRaises(Timeout,
                       run_process_with_timeout,
                       (self.script_path, '15', "THISISATEST"),
                       timeout=5)
     ps_output = run_shell_command('ps aux')[1]
     self.failIf('THISISATEST' in ps_output)
     self.failIf('sleep' in ps_output)
Пример #49
0
def get_my_pid(process, args=''):
    """
    Return the pid of the first running process matching *process*
    (and optionally *args*), or 0 when no such process is found.
    """
    # FreeBSD's ps lacks the -C option, so fall back to plain 'ps -o'.
    if sys.platform.startswith('freebsd'):
        ps_cmd = "ps -o pid,args | grep '%s %s' | grep -v 'grep' | sed -n 1p" % (process, args)
    else:
        ps_cmd = "ps -C %s o '%%p%%a' | grep '%s %s' | grep -v 'grep' | sed -n 1p" % (process, process, args)
    first_line = run_shell_command(ps_cmd)[1].strip()
    if first_line == '':
        return 0
    # The pid is the first whitespace-delimited token of the ps line.
    return int(first_line[:first_line.find(' ')])
Пример #50
0
def load_submission(doctype, dump, method=None):
    "Insert submission into database. Return tuple(error code, msg)"
    # NOT TESTED
    messages = []

    def guess_dump_method(dump):
        """Guess which method was used to dump this file (i.e. if it contains all the submission rows or not)"""
        match_obj = re_method_pattern.search(dump)
        if match_obj:
            return match_obj.group('method')
        else:
            return None

    def guess_dump_has_delete_statements(dump):
        """Guess if given submission dump already contain delete statements"""
        return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower()

    # When no method is given, try to recover it from the dump header,
    # falling back to the configured default.
    if not method:
        method = guess_dump_method(dump)
        if method is None:
            method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
            messages.append(
                "WARNING: method could not be guessed. Using method %s" %
                method)
        else:
            messages.append("Used method %s to load data" % method)

    # Write the dump to a temp file so it can be piped into dbexec.
    # NOTE(review): the fd returned by mkstemp (dump_code) is never
    # closed — confirm this is acceptable.
    (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR)
    dump_fd = open(dump_path, 'w')
    dump_fd.write(dump)
    dump_fd.close()

    # We need to remove the submission. But let's create a backup first.
    submission_backup = dump_submission(doctype, method)
    submission_backup_path = "%s_db_dump%s.sql" % (
        doctype, time.strftime("%Y%m%d_%H%M%S"))
    fd = file(os.path.join(CFG_TMPDIR, submission_backup_path), "w")
    fd.write(submission_backup)
    fd.close()
    # Only remove the old rows ourselves if the dump does not already
    # carry its own DELETE statements.
    if not guess_dump_has_delete_statements(dump):
        remove_submission(doctype, method)

    # Load the dump
    (exit_code, out_msg,
     err_msg) = run_shell_command("%s/bin/dbexec < %s",
                                  (CFG_PREFIX, os.path.abspath(dump_path)))
    if exit_code:
        messages.append("ERROR: failed to load submission:" + err_msg)
        return (1, messages)

    messages.append("Submission loaded. Previous submission saved to %s" %
                    os.path.join(CFG_TMPDIR, submission_backup_path))
    return (0, messages)
Пример #51
0
def get_text_snippets(textfile_path, patterns, nb_chars, max_snippets):
    """
    Extract text snippets around 'patterns' from the file found at
    'textfile_path'. The snippets are meant to look similar to results of
    popular Internet search engines: using " ... " between snippets.
    For empty patterns it returns ""
    """
    """
    TODO: - distinguish the beginning of sentences and make the snippets
            start there
          - optimize finding patterns - first search for patterns apperaing next
            to each other, secondly look for each patten not for first
            occurances of any pattern
    """

    if len(patterns) == 0:
        return ""

    max_lines = nb_chars / 40 + 2  # rule of thumb in order to catch nb_chars
    # Produce the big snippets from which the real snippets will be cut out
    cmd = "grep -i -C%s -m%s"
    cmdargs = [str(max_lines), str(max_snippets)]
    for p in patterns:
        cmd += " -e %s"
        cmdargs.append(" " + p)
    cmd += " %s"
    cmdargs.append(textfile_path)
    (dummy1, output, dummy2) = run_shell_command(cmd, cmdargs)
    # a fact to keep in mind with this call to grep is that if patterns appear
    # in two contigious lines, they will not be separated by '--' and therefore
    # treated as one 'big snippet'
    result = []
    big_snippets = output.split("--")

    # cut the snippets to match the nb_words_around parameter precisely:
    for s in big_snippets:
        small_snippet = cut_out_snippet(s, patterns, nb_chars)
        result.append(small_snippet)

    # combine snippets
    out = ""
    count = 0
    for snippet in result:
        if snippet and count < max_snippets:
            if out:
                out += "..."
            out += highlight(snippet, patterns, whole_word_matches=True)
            # BUGFIX: count was never incremented, so the max_snippets
            # cap had no effect.
            count += 1

    return out
Пример #52
0
def get_text_snippets(textfile_path, patterns, nb_chars, max_snippets):
    """
    Extract text snippets around 'patterns' from the file found at
    'textfile_path'. The snippets are meant to look similar to results of
    popular Internet search engines: using " ... " between snippets.
    For empty patterns it returns ""
    """
    """
    TODO: - distinguish the beginning of sentences and make the snippets
            start there
          - optimize finding patterns - first search for patterns apperaing next
            to each other, secondly look for each patten not for first
            occurances of any pattern
    """

    if len(patterns) == 0:
        return ""

    max_lines = nb_chars / 40 + 2  # rule of thumb in order to catch nb_chars
    # Produce the big snippets from which the real snippets will be cut out
    cmd = "grep -i -C%s -m%s"
    cmdargs = [str(max_lines), str(max_snippets)]
    for p in patterns:
        cmd += " -e %s"
        cmdargs.append(" " + p)
    cmd += " %s"
    cmdargs.append(textfile_path)
    (dummy1, output, dummy2) = run_shell_command(cmd, cmdargs)
    # a fact to keep in mind with this call to grep is that if patterns appear
    # in two contigious lines, they will not be separated by '--' and therefore
    # treated as one 'big snippet'
    result = []
    big_snippets = output.split("--")

    # cut the snippets to match the nb_words_around parameter precisely:
    for s in big_snippets:
        small_snippet = cut_out_snippet(s, patterns, nb_chars)
        result.append(small_snippet)

    # combine snippets
    out = ""
    count = 0
    for snippet in result:
        if snippet and count < max_snippets:
            if out:
                out += "..."
            out += highlight(snippet, patterns, whole_word_matches=True)
            # BUGFIX: count was never incremented, so the max_snippets
            # cap had no effect.
            count += 1

    return out
Пример #53
0
def prettify_xml(filepath):
    """
    Prettify an XML file for better readability.

    Returns the path of the new, pretty, file — or None when xmllint
    fails (after flagging the task with CERROR).
    """
    pretty_path = "%s_pretty.xml" % (os.path.splitext(filepath)[0],)
    # xmllint writes its re-indented output straight into the new file.
    exit_code, dummy_out, err_msg = run_shell_command(
        cmd="xmllint --format %s" % (filepath,),
        filename_out=pretty_path)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg,))
        task_update_status("CERROR")
        return
    return pretty_path
Пример #54
0
def prettify_xml(filepath):
    """
    Will prettify an XML file for better readability.

    Returns the new, pretty, file.  Returns None when xmllint fails,
    after flagging the task with CERROR.
    """
    new_filename = "%s_pretty.xml" % (os.path.splitext(filepath)[0],)
    cmd = "xmllint --format %s" % (filepath,)
    # xmllint writes its re-indented output straight into new_filename.
    exit_code, std_out, err_msg = run_shell_command(cmd=cmd,
                                                    filename_out=new_filename)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg,))
        task_update_status("CERROR")
        return

    return new_filename
Пример #55
0
def version(separator='\n'):
    """
    Try to detect Apache version by localizing httpd or apache
    executables and grepping inside binaries.

    For a given executable, the reported format is
    'apache_version [apache_path]'.

    @param separator: string used to join the individual findings
    @return: all found Apache versions and paths joined by `separator`;
        empty string if no success
    """
    from invenio.inveniocfg import _grep_version_from_executable
    from invenio.shellutils import run_shell_command
    out = []
    dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache")
    for apache in cmd_out.split("\n"):
        if not apache:
            # `locate` output ends with a newline, so split() yields a
            # trailing empty entry; skip it instead of grepping "".
            continue
        # Raw string avoids the invalid escape sequence warning for '\/'.
        apache_version = _grep_version_from_executable(apache, r'^Apache\/')
        if apache_version:
            out.append("%s [%s]" % (apache_version, apache))
    return separator.join(out)
Пример #56
0
def task_run_core(recid):
    """
    Run plot extraction for one record: locate its fulltext PDF, copy it
    to a temporary file, run the plot extractor on it and create MARCXML
    from the extraction result.

    @param recid: id of the record to process
    @type recid: int
    """
    pdf = look_for_fulltext(recid)
    write_message('pdf: %s' % pdf)
    if pdf:
        tmpfd, tmppath = mkstemp(prefix="plotextractor-", suffix=".pdf")
        try:
            # Copy in binary mode: a PDF is binary data and text mode can
            # corrupt it.  The `with` statements guarantee both the source
            # handle and the fd returned by mkstemp are closed (copyfileobj
            # itself closes nothing).
            with open(pdf, 'rb') as src:
                with os.fdopen(tmpfd, 'wb') as dst:
                    copyfileobj(src, dst)
            # Pass tmppath via args so run_shell_command escapes it.
            (exit_code, output_buffer,
             stderr_output_buffer) = run_shell_command(
                 CFG_PDFPLOTEXTRACTOR_PATH + ' %s', (tmppath,))
            plotextracted_pdf_path = tmppath + ".extracted/extracted.json"
            code, output_vector, extracted = merging_articles(
                None, plotextracted_pdf_path)
        finally:
            os.remove(tmppath)
        try:
            id_fulltext = get_fieldvalues([recid], "037_a")[0]
        except IndexError:
            # Record has no 037__a field: fall back to an empty identifier.
            id_fulltext = ""
        create_MARCXML(output_vector, id_fulltext, code, extracted,
                       write_file=True)
def convert_images(image_list):
    """
    Figure out the type of each image extracted from the tarball and
    convert anything that is not already PNG into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    """
    png_marker = 'PNG image'
    converted = []
    for img in image_list:
        # Directories can appear in the extracted file list; skip them.
        if os.path.isdir(img):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.

        dummy1, type_out, dummy2 = run_shell_command('file %s', (img,))
        if png_marker in type_out:
            # Already a PNG: keep as-is.
            converted.append(img)
            continue

        # Assume ImageMagick can convert every remaining format we may
        # encounter (EPS->PNG, JPG->PNG, PS->PNG and PSTEX->PNG for sure).
        target = get_converted_image_name(img)
        try:
            dummy1, type_out, convert_err = run_process_with_timeout(
                'convert %s %s' % (img, target), shell=True)
            if convert_err == '':
                converted.append(target)
            else:
                write_message('convert failed on ' + img)
        except Timeout:
            write_message('convert timed out on ' + img)

    return converted
Пример #58
0
def process_pdf(pdf, id):
    """
    Run the plot extractor on a PDF and build MARCXML from the
    extracted figures.

    @param pdf: path to the PDF file to process
    @param id: fulltext identifier used in the generated MARCXML
    @return: (exit_code, stderr_output_buffer, plotextracted_pdf_path,
        marc_path) tuple
    """
    write_message("process pdf")
    exit_code, output_buffer, stderr_output_buffer = run_shell_command(
        CFG_PDFPLOTEXTRACTOR_PATH + ' ' + pdf)
    plotextracted_pdf_path = pdf + ".extracted/extracted.json"

    code, message, dummy, list_of_figures_from_pdf = getFigureVectors(
        '', plotextracted_pdf_path)
    extracted = pdf + ".extracted"
    # create_MARCXML writes the MARCXML into the `extracted` output file
    # (write_file=True) and returns its path.
    marc_path = create_MARCXML(list_of_figures_from_pdf, id, code, extracted,
                               True)
    write_message("end process pdf")
    # Prefix whatever the extractor wrote on stderr with a timestamp.
    timestamp = str(datetime.datetime.now())
    stderr_output_buffer = ("[" + timestamp
                            + "]: The Pdf extractor for the file " + pdf
                            + " has an error. The traceback:\n"
                            + stderr_output_buffer)
    return (exit_code, stderr_output_buffer, plotextracted_pdf_path, marc_path)
Пример #59
0
def parse_pdftk_dump_data(output_std):
    """
    Parse the stdout of `pdftk <file> dump_data` into a metadata dict.

    Consecutive "InfoKey:"/"InfoValue:" line pairs become one entry each;
    any other "Key: Value" line is stored verbatim; lines without a ':'
    separator are ignored.

    @param output_std: raw stdout produced by pdftk dump_data
    @type output_std: string
    @rtype: dict
    @return: dictionary with metadata
    """
    metadata_dict = {}
    key = None
    for metadata_line in output_std.splitlines():
        stripped = metadata_line.strip()
        if stripped.startswith("InfoKey"):
            key = metadata_line.split(':', 1)[1].strip()
        elif stripped.startswith("InfoValue"):
            value = metadata_line.split(':', 1)[1].strip()
            # FIXME: "ModDate"/"CreationDate" values could be parsed into
            # datetime objects (pdftk format "D:%Y%m%d%H%M%S%Z").
            if key:
                metadata_dict[key] = value
                key = None
        else:
            try:
                custom_key, custom_value = metadata_line.split(':', 1)
                metadata_dict[custom_key.strip()] = custom_value.strip()
            except ValueError:
                # No ':' in the line: most probably not a relevant line.
                pass

    return metadata_dict


def read_metadata_local(inputfile, verbose):
    """
    Metadata extraction from a local file via `pdftk ... dump_data`.

    @param inputfile: path to the file to extract metadata from
    @type inputfile: string
    @param verbose: verbosity (unused here; kept for interface
        compatibility with the other read_metadata_* functions)
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    cmd = CFG_PATH_PDFTK + ' %s dump_data'
    (dummy_exit_status, output_std, dummy_output_err) = \
        run_shell_command(cmd, args=(inputfile,))
    return parse_pdftk_dump_data(output_std)