def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl"""
     if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
         if CFG_HAS_CURL:
             curl_input_file = os.path.join(cfg['CFG_TMPDIR'],
                                            'curl_test.xml')
             open(curl_input_file, "w").write(self.marcxml)
             try:
                 result = run_shell_command(
                     '/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"',
                     [
                         curl_input_file, self.nonce_url,
                         make_user_agent_string('BatchUploader')
                     ])[1]
                 self.failUnless("[INFO]" in result)
                 current_task = get_last_taskid()
                 run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'],
                                   [str(current_task)])
                 results = json.loads(
                     open(self.callback_result_path).read())
                 self.failUnless('results' in results,
                                 '"%s" did not contained [INFO]' % result)
                 self.assertEqual(len(results['results']), 1)
                 self.assertEqual(results['nonce'], "1234")
                 self.failUnless(results['results'][0]['success'])
                 self.failUnless(results['results'][0]['recid'] > 0)
                 self.failUnless(
                     """<subfield code="a">Doe, John</subfield>"""
                     in results['results'][0]['marcxml'],
                     results['results'][0]['marcxml'])
             finally:
                 os.remove(curl_input_file)
Example #2
def get_plots(tarball):
    """Return a list of found and converted plots given a tarball."""
    sub_dir, dummy = get_defaults(tarball, cfg["CFG_TMPDIR"], "")

    tex_files = None
    image_list = None

    dummy, image_list, tex_files = untar(tarball, sub_dir)

    converted_image_list = convert_images(image_list)
    extracted_image_data = []
    if tex_files == [] or tex_files is None:
        # It's not a tarball
        run_shell_command("rm -r %s", (sub_dir,))
    else:
        for tex_file in tex_files:
            # Extract images, captions and labels
            partly_extracted_image_data = extract_captions(tex_file, sub_dir, converted_image_list)
            if partly_extracted_image_data:
                # Add proper filepaths and do various cleaning
                cleaned_image_data = prepare_image_data(partly_extracted_image_data, tex_file, converted_image_list)
                # Using prev. extracted info, get contexts for each
                # image found
                extracted_image_data.extend((extract_context(tex_file, cleaned_image_data)))
    return extracted_image_data
Example #3
def get_plots(tarball):
    """Return a list of found and converted plots given a tarball."""
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    tex_files = None
    image_list = None

    dummy, image_list, tex_files = untar(tarball, sub_dir)

    converted_image_list = convert_images(image_list)
    extracted_image_data = []
    if tex_files == [] or tex_files is None:
        # It's not a tarball
        run_shell_command('rm -r %s', (sub_dir, ))
    else:
        for tex_file in tex_files:
            # Extract images, captions and labels
            partly_extracted_image_data = extract_captions(
                tex_file, sub_dir, converted_image_list)
            if partly_extracted_image_data:
                # Add proper filepaths and do various cleaning
                cleaned_image_data = prepare_image_data(
                    partly_extracted_image_data, tex_file,
                    converted_image_list)
                # Using prev. extracted info, get contexts for each
                # image found
                extracted_image_data.extend(
                    (extract_context(tex_file, cleaned_image_data)))
    return extracted_image_data
Example #4
def generate_preview(f):
    """Generate PNG previews of PDF pages."""
    directory = os.path.join(current_app.instance_path, "previews")
    try:
        os.mkdir(directory)
    except OSError:  # directory already exists as per docs
        pass

    directory = os.path.join(directory, str(f.get_recid()))
    try:
        os.mkdir(directory)
    except OSError:  # directory already exists as per docs, preview exists
        return directory

    cmd_pdftk = "pdftk %s burst output %s/pg_%s.pdf"
    (exit_status, output_std, output_err) = \
        run_shell_command(cmd_pdftk, args=(get_pdf_path(f), directory, '%d'))
    # Second pass: ImageMagick convert turns each burst page into a PNG
    cmd_convert = '%s -flatten -density 300 %s %s/`basename %s .pdf`.png'
    for fl in os.listdir(directory):
        if fl.endswith(".pdf"):
            fn = safe_join(directory, fl)
            (exit_status, output_std, output_err) = \
                run_shell_command(cmd_convert, args=(
                    str(cfg["CFG_PATH_CONVERT"]), fn, directory,
                    fn))
Example #5
    def check_system(self, uid=None):
        """return an error string if there are problems"""
        if uid:
            rtuid, rtpw = get_bibcat_from_prefs(uid)
        else:
            # Assume default RT user
            rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD

        if not rtuid and not rtpw:
            return "No valid RT user login specified"

        if not CFG_BIBCATALOG_SYSTEM == "RT":
            return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
        if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
        if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"

        # Check that you can execute the binary.. this is a safe call unless someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
        dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI + " help")
        helpfound = False
        if myerr.count("help") > 0:
            helpfound = True
        if not helpfound:
            return (
                "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI "
                + CFG_BIBCATALOG_SYSTEM_RT_CLI
                + " help did not produce output 'help'"
            )

        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
        # Construct URL, split RT_URL at //
        if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith("http://") and not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith(
            "https://"
        ):
            return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        # Assemble by http://user:password@RT_URL
        bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath

        # set as env var
        os.environ["RTUSER"] = rtuid
        os.environ["RTSERVER"] = bibcatalog_rt_server

        # try to talk to RT server
        # this is a safe call since rtpw is the only variable in it, and it is escaped
        rtpw = escape_shell_arg(rtpw)
        dummy, myout, myerr = run_shell_command(
            "echo " + rtpw + " | " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\""
        )
        if len(myerr) > 0:
            return "could not connect to " + bibcatalog_rt_server + " " + myerr
        # finally, check that there is some sane output like tickets or 'No matching result'
        saneoutput = (myout.count("matching") > 0) or (myout.count("1") > 0)
        if not saneoutput:
            return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
        return ""
Example #6
    def check_system(self, uid=None):
        """return an error string if there are problems"""
        if uid:
            rtuid, rtpw = get_bibcat_from_prefs(uid)
        else:
            # Assume default RT user
            rtuid = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            rtpw = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD

        if not rtuid and not rtpw:
            return "No valid RT user login specified"

        if not CFG_BIBCATALOG_SYSTEM == 'RT':
            return "CFG_BIBCATALOG_SYSTEM is not RT though this is an RT module"
        if not CFG_BIBCATALOG_SYSTEM_RT_CLI:
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI not defined or empty"
        if not os.path.exists(CFG_BIBCATALOG_SYSTEM_RT_CLI):
            return "CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " file does not exists"

        # Check that you can execute the binary.. this is a safe call unless someone can fake CFG_BIBCATALOG_SYSTEM_RT_CLI (unlikely)
        dummy, myout, myerr = run_shell_command(CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                                " help")
        helpfound = False
        if myerr.count("help") > 0:
            helpfound = True
        if not helpfound:
            return "Execution of CFG_BIBCATALOG_SYSTEM_RT_CLI " + CFG_BIBCATALOG_SYSTEM_RT_CLI + " help did not produce output 'help'"

        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return "CFG_BIBCATALOG_SYSTEM_RT_URL not defined or empty"
        # Construct URL, split RT_URL at //
        if not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('http://') and \
           not CFG_BIBCATALOG_SYSTEM_RT_URL.startswith('https://'):
            return "CFG_BIBCATALOG__SYSTEM_RT_URL does not start with 'http://' or 'https://'"
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        # Assemble by http://user:password@RT_URL
        bibcatalog_rt_server = httppart + "//" + rtuid + ":" + rtpw + "@" + siteandpath

        #set as env var
        os.environ["RTUSER"] = rtuid
        os.environ["RTSERVER"] = bibcatalog_rt_server

        #try to talk to RT server
        #this is a safe call since rtpw is the only variable in it, and it is escaped
        rtpw = escape_shell_arg(rtpw)
        dummy, myout, myerr = run_shell_command("echo " + rtpw + " | " +
                                                CFG_BIBCATALOG_SYSTEM_RT_CLI +
                                                " ls \"Subject like 'F00'\"")
        if len(myerr) > 0:
            return "could not connect to " + bibcatalog_rt_server + " " + myerr
        #finally, check that there is some sane output like tickets or 'No matching result'
        saneoutput = (myout.count('matching') > 0) or (myout.count('1') > 0)
        if not saneoutput:
            return CFG_BIBCATALOG_SYSTEM_RT_CLI + " returned " + myout + " instead of 'matching' or '1'"
        return ""
Example #7
 def test_simple_insert(self):
     """batchuploader - robotupload simple insert"""
     from invenio.legacy.search_engine import get_record
     result = urllib2.urlopen(self.req).read()
     self.failUnless("[INFO]" in result)
     current_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
     current_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
     self.failIfEqual(self.last_recid, current_recid)
     record = get_record(current_recid)
     self.assertEqual(record['245'][0][0], [('a', 'The title')])
Example #8
 def test_insert_with_callback(self):
     """batchuploader - robotupload insert with callback"""
     result = urllib2.urlopen(self.req_callback).read()
     self.failUnless("[INFO]" in result, '"%s" did not contained [INFO]' % result)
     current_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
     results = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in results)
     self.assertEqual(len(results['results']), 1)
     self.failUnless(results['results'][0]['success'])
     self.failUnless(results['results'][0]['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
Example #9
 def test_simple_insert(self):
     """batchuploader - robotupload simple insert"""
     if CFG_LOCALHOST_OK:
         from invenio.legacy.search_engine import get_record
         result = urllib2.urlopen(self.req).read()
         self.failUnless("[INFO]" in result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         current_recid = run_sql("SELECT MAX(id) FROM bibrec")[0][0]
         self.failIfEqual(self.last_recid, current_recid)
         record = get_record(current_recid)
         self.assertEqual(record['245'][0][0], [('a', 'The title')])
Example #10
def clean_up(extracted_files_list, image_list):
    """
    Removes all the intermediate stuff.

    @param: extracted_files_list ([string, string, ...]): list of all extracted files
    @param: image_list ([string, string, ...]): list of the images to keep

    """
    for extracted_file in extracted_files_list:
        # Remove everything that is not in image_list and is not a directory
        if extracted_file not in image_list and extracted_file[-1] != os.sep:
            run_shell_command('rm %s', (extracted_file,))
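Spawning one `rm` process per file is costly here; the same semantics can stay entirely in Python. A minimal sketch (clean_up_in_python is a hypothetical name, the directory check mirrors the os.sep test above):

import os

def clean_up_in_python(extracted_files_list, image_list):
    """Remove every extracted file that is neither a kept image nor a directory."""
    for extracted_file in extracted_files_list:
        if extracted_file not in image_list and not extracted_file.endswith(os.sep):
            os.remove(extracted_file)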
Example #11
 def test_insert_with_callback(self):
     """batchuploader - robotupload insert with callback"""
     if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
         result = urllib2.urlopen(self.req_callback).read()
         self.failUnless("[INFO]" in result, '"%s" did not contained [INFO]' % result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results)
         self.assertEqual(len(results['results']), 1)
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
Example #12
 def test_get_defaults(self):
     """plotextractor - get defaults"""
     from invenio.utils.shell import run_shell_command
     from invenio.utils.plotextractor.cli import get_defaults
     sdir_should_be = os.path.join(cfg['CFG_TMPDIR'], self.arXiv_id + '_plots')
     refno_should_be = "15" # Note: For ATLANTIS DEMO site
     sdir, refno = get_defaults(tarball=self.tarball, sdir=None, refno_url=cfg['CFG_SITE_URL'])
     if sdir is not None:
         run_shell_command("rm -rf %s", (sdir,))
     self.assertTrue(sdir == sdir_should_be,
                     "didn't get correct default scratch dir")
     self.assertTrue(refno == refno_should_be,
                     "didn't get correct default reference number")
Example #13
 def test_insert_with_nonce(self):
     """batchuploader - robotupload insert with nonce"""
     if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
         result = urllib2.urlopen(self.req_nonce).read()
         self.failUnless("[INFO]" in result, '"%s" did not contained "[INFO]"' % result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results, '"%s" did not contained "results" key' % results)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
Example #14
 def test_insert_with_oracle(self):
     """batchuploader - robotupload insert with oracle special treatment"""
     import os
     if os.path.exists('/opt/invenio/var/log/invenio.err'):
         os.remove('/opt/invenio/var/log/invenio.err')
     result = urllib2.urlopen(self.req_oracle).read()
     self.failUnless("[INFO]" in result, '"%s" did not contained "[INFO]"' % result)
     current_task = get_last_taskid()
     run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
     results = json.loads(open(self.callback_result_path).read())
     self.failUnless('results' in results, '"%s" did not contained "results" key' % results)
     self.assertEqual(len(results['results']), 1)
     self.failUnless(results['results'][0]['success'])
     self.failUnless(results['results'][0]['recid'] > 0)
     self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
Example #15
def filter_step(obj, eng):
    """Run an external python script."""
    from invenio_records.api import Record
    from invenio.utils.shell import run_shell_command

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})
    script_name = arguments.get("f_filter-file")
    if script_name:
        marcxml_value = Record(obj.data.dumps()).legacy_export_as_marc()
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)

        # Now we launch BibUpload tasks for the final MARCXML files
        marcxmlfile = extract_path + os.sep + str(obj.id)
        file_fd = open(marcxmlfile, 'w')
        file_fd.write(marcxml_value)
        file_fd.close()

        exitcode, cmd_stdout, cmd_stderr = run_shell_command(
            cmd="%s '%s'", args=(str(script_name), str(marcxmlfile)))
        if exitcode != 0 or cmd_stderr != "":
            obj.log.error(
                "Error while running filtering script on %s\nError:%s" %
                (marcxmlfile, cmd_stderr))
        else:
            obj.log.info(cmd_stdout)
    else:
        obj.log.error("No script file found!")
Example #16
 def test_run_cmd_hello_quote(self):
     """shellutils - running simple command with an argument with quote"""
     self.assertEqual((0, "hel'lo world\n", ''),
                      run_shell_command("echo %s %s", (
                          "hel'lo",
                          "world",
                      )))
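The test above pins down run_shell_command's contract: each %s placeholder receives one shell-escaped argument, and the call returns an (exitcode, stdout, stderr) tuple. A minimal usage sketch (the grep pattern and file path are hypothetical):

from invenio.utils.shell import run_shell_command

# Each argument is escaped individually, so embedded quotes and spaces are safe.
exitcode, out, err = run_shell_command("grep -c %s %s",
                                       ("it's", "/tmp/example.txt"))
if exitcode == 0:
    print("matches: " + out.strip())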
Example #17
 def test_insert_with_oracle(self):
     """batchuploader - robotupload insert with oracle special treatment"""
     if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
         import os
         if os.path.exists('/opt/invenio/var/log/invenio.err'):
             os.remove('/opt/invenio/var/log/invenio.err')
         result = urllib2.urlopen(self.req_oracle).read()
         self.failUnless("[INFO]" in result, '"%s" did not contained "[INFO]"' % result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results, '"%s" did not contained "results" key' % results)
         self.assertEqual(len(results['results']), 1)
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
Example #18
    def _run_rt_command(self, command, uid=None):
        """
        This function will run an RT CLI command as the given user. If no user is
        specified, the default RT user will be used, if configured.

        Should any of the configuration parameters be missing this function will return
        None. Otherwise it will return the standard output from the CLI command.

        @param command: RT CLI command to execute
        @type command: string

        @param uid: the Invenio user id to submit on behalf of. Optional.
        @type uid: int

        @return: standard output from the command given. None, if any errors.
        @rtype: string
        """
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        if uid:
            username, passwd = get_bibcat_from_prefs(uid)
        else:
            username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        bibcatalog_rt_server = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        # set as env var
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = bibcatalog_rt_server
        passwd = escape_shell_arg(passwd)
        error_code, myout, error_output = run_shell_command("echo " + passwd + " | " + command)
        if error_code > 0:
            raise ValueError('Problem running "%s": %d - %s' % (command, error_code, error_output))
        return myout
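A hypothetical call, mirroring how check_system elsewhere in this module pipes the password into the RT command line (the query string is invented for illustration):

output = self._run_rt_command(
    CFG_BIBCATALOG_SYSTEM_RT_CLI + " ls \"Subject like 'F00'\"")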
Example #19
 def test_get_defaults(self):
     """plotextractor - get defaults"""
     from invenio.utils.shell import run_shell_command
     from invenio.utils.plotextractor.cli import get_defaults
     sdir_should_be = os.path.join(cfg['CFG_TMPDIR'],
                                   self.arXiv_id + '_plots')
     refno_should_be = "15"  # Note: For ATLANTIS DEMO site
     sdir, refno = get_defaults(tarball=self.tarball,
                                sdir=None,
                                refno_url=cfg['CFG_SITE_URL'])
     if sdir is not None:
         run_shell_command("rm -rf %s", (sdir,))
     self.assertTrue(sdir == sdir_should_be,
                     "didn't get correct default scratch dir")
     self.assertTrue(refno == refno_should_be,
                     "didn't get correct default reference number")
Example #20
def get_list_of_all_matching_files(basedir, filetypes):
    """
    This function uses the os module in order to crawl
    through the directory tree rooted at basedir and find all the files
    therein that include filetype in their 'file' output.  Returns a list
    of absolute paths to all files.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained in
        the output of running 'file' on the types of files we're looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """

    file_paths = []

    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path, ))
            for filetype in filetypes:
                if cmd_out.find(filetype) > -1:
                    file_paths.append(full_path)

    return file_paths
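A usage sketch; the directory is hypothetical, and the type strings are substrings of `file`'s human-readable output rather than filename extensions:

plot_candidates = get_list_of_all_matching_files(
    '/tmp/extracted_tarball', ['PNG image', 'PDF document'])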
Example #21
def find_matching_files(basedir, filetypes):
    """Try to find all files matching given filetypes.

    By looking at all the files and filenames in the given directory,
    including subdirectories.

    :param basedir: full path to base directory to search in
    :type basedir: string

    :param filetypes: list of filetypes, extensions
    :type filetypes: list

    :return: list of full paths to all matching files
    :rtype: list
    """
    files_list = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path, ))
            for filetype in filetypes:
                if cmd_out.lower().find(filetype) > -1:
                    files_list.append(full_path)
                elif filename.split('.')[-1].lower() == filetype:
                    files_list.append(full_path)
    return files_list
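Unlike get_list_of_all_matching_files above, this variant lowercases `file`'s output and also matches on the file extension, so plain extensions work as filetypes. A hypothetical call:

pdf_files = find_matching_files('/tmp/harvested', ['pdf'])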
Example #22
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        msg = "ERROR: cannot find %s." % cmd
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)

    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s | %s -c " % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME),
            CFG_PATH_GZIP)
    exit_code, cmd_stdout, cmd_stderr = run_shell_command(cmd, None, dirname + os.sep + filename)
    if exit_code:
        msg = "ERROR: mysqldump exit code is %s." % repr(exit_code)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stdout:
        msg = "ERROR: mysqldump stdout is %s." % repr(cmd_stdout)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
    if cmd_stderr:
        msg = "ERROR: mysqldump stderr is %s." % repr(cmd_stderr)
        write_message(msg, stream=sys.stderr)
        raise StandardError(msg)
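A hypothetical invocation (directory and file name invented). Note that the dump is piped through gzip, so the written file is compressed despite living under a .sql-style name:

_dump_database('/opt/invenio/var/tmp', 'invenio-dump.sql.gz')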
Example #23
def find_matching_files(basedir, filetypes):
    """Try to find all files matching given filetypes.

    By looking at all the files and filenames in the given directory,
    including subdirectories.

    :param basedir: full path to base directory to search in
    :type basedir: string

    :param filetypes: list of filetypes, extensions
    :type filetypes: list

    :return: list of full paths to all matching files
    :rtype: list
    """
    files_list = []
    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command(
                'file %s', (full_path,)
            )
            for filetype in filetypes:
                if cmd_out.lower().find(filetype) > -1:
                    files_list.append(full_path)
                elif filename.split('.')[-1].lower() == filetype:
                    files_list.append(full_path)
    return files_list
Example #24
def get_list_of_all_matching_files(basedir, filetypes):
    """
    This function uses the os module in order to crawl
    through the directory tree rooted at basedir and find all the files
    therein that include filetype in their 'file' output.  Returns a list
    of absolute paths to all files.

    @param: basedir (string): the directory where we want to start crawling
    @param: filetypes ([string, string]): something that will be contained in
        the output of running 'file' on the types of files we're looking for

    @return: file_paths ([string, string, ...]): a list of full paths to
        the files that we discovered
    """

    file_paths = []

    for dirpath, dummy0, filenames in os.walk(basedir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            dummy1, cmd_out, dummy2 = run_shell_command('file %s', (full_path,))
            for filetype in filetypes:
                if cmd_out.find(filetype) > -1:
                    file_paths.append(full_path)

    return file_paths
Example #25
 def test_insert_via_curl(self):
     """batchuploader - robotupload insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     open(curl_input_file, "w").write(self.marcxml)
     try:
         result = run_shell_command('/usr/bin/curl -T %s %s -A %s -H "Content-Type: application/marcxml+xml"', [curl_input_file, self.nonce_url, make_user_agent_string('BatchUploader')])[1]
         self.failUnless("[INFO]" in result)
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
Example #26
    def _list_quota(cls, directory):
        if not os.path.isdir(directory):
            return None

        (ret, output, err) = run_shell_command(
            "fs listquota {}".format(directory))

        if ret == 0:
            return cls._parse_output(output)
        return None
Example #27
    def _list_quota(cls, directory):
        if not os.path.isdir(directory):
            return None

        (ret, output,
         err) = run_shell_command("fs listquota {}".format(directory))

        if ret == 0:
            return cls._parse_output(output)
        return None
Example #28
def maxpage(f):
    """Returns the number of pages for PDF records via AJAX"""
    cmd_pdftk = '%s %s dump_data output | grep NumberOfPages'
    pdf = get_pdf_path(f)
    if pdf is not None:
        (exit_status, output_std, output_err) = \
            run_shell_command(cmd_pdftk, args=(str(cfg['CFG_PATH_PDFTK']), pdf))
        if int(exit_status) == 0 and len(output_err) == 0:
            return jsonify(maxpage=int(output_std.strip().split(' ')[1]))
    return jsonify(maxpage=-1)
Example #29
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl"""
     curl_input_file = os.path.join(CFG_TMPDIR, 'curl_test.xml')
     open(curl_input_file, "w").write(self.marcxml)
     try:
          ## curl -F 'file=@file.xml' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
         code, result, err = run_shell_command("/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s", [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string('BatchUploader')])
         self.failUnless("[INFO]" in result, '[INFO] not find in results: %s, %s' % (result, err))
         current_task = get_last_taskid()
         run_shell_command("%s/bibupload %%s" % CFG_BINDIR, [str(current_task)])
         results = json.loads(open(self.callback_result_path).read())
         self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
         self.assertEqual(len(results['results']), 1)
         self.assertEqual(results['nonce'], "1234")
         self.failUnless(results['results'][0]['success'])
         self.failUnless(results['results'][0]['recid'] > 0)
         self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
     finally:
         os.remove(curl_input_file)
Example #30
def maxpage(f):
    """Return number of pages for PDF records via AJAX."""
    cmd_pdftk = "%s %s dump_data output | grep NumberOfPages"
    pdf = get_pdf_path(f)
    if pdf is not None:
        (exit_status, output_std, output_err) = \
            run_shell_command(cmd_pdftk, args=(str(cfg["CFG_PATH_PDFTK"]), pdf))
        if int(exit_status) == 0 and len(output_err) == 0:
            return jsonify(maxpage=int(output_std.strip().split(" ")[1]))
    return jsonify(maxpage=-1)
Example #31
 def test_run_cmd_timeout_no_zombie(self):
     """shellutils - running simple command no zombie"""
     self.assertRaises(Timeout,
                       run_process_with_timeout,
                       (self.script_path, '15', "THISISATEST"),
                       timeout=5)
     ps_output = run_shell_command('ps aux')[1]
     self.failIf('THISISATEST' in ps_output,
                 '"THISISATEST" was found in %s' % ps_output)
     self.failIf('sleep 15' in ps_output,
                 '"sleep 15" was found in %s' % ps_output)
Example #32
 def test_legacy_insert_via_curl(self):
     """batchuploader - robotupload legacy insert via CLI curl"""
     if cfg['CFG_DEVEL_SITE'] and CFG_LOCALHOST_OK:
         if CFG_HAS_CURL:
             curl_input_file = os.path.join(cfg['CFG_TMPDIR'], 'curl_test.xml')
             open(curl_input_file, "w").write(self.marcxml)
             try:
                  ## curl -F 'file=@file.xml' -F 'mode=-i' [-F 'callback_url=http://...'] [-F 'nonce=1234'] http://cds.cern.ch/batchuploader/robotupload -A invenio_webupload
                 code, result, err = run_shell_command("/usr/bin/curl -v -F file=@%s -F 'mode=-i' -F callback_url=%s -F nonce=1234 %s -A %s", [curl_input_file, self.callback_url, self.legacy_url, make_user_agent_string('BatchUploader')])
                 self.failUnless("[INFO]" in result, '[INFO] not find in results: %s, %s' % (result, err))
                 current_task = get_last_taskid()
                 run_shell_command("%s/bibupload %%s" % cfg['CFG_BINDIR'], [str(current_task)])
                 results = json.loads(open(self.callback_result_path).read())
                 self.failUnless('results' in results, '"%s" did not contained [INFO]' % result)
                 self.assertEqual(len(results['results']), 1)
                 self.assertEqual(results['nonce'], "1234")
                 self.failUnless(results['results'][0]['success'])
                 self.failUnless(results['results'][0]['recid'] > 0)
                 self.failUnless("""<subfield code="a">Doe, John</subfield>""" in results['results'][0]['marcxml'], results['results'][0]['marcxml'])
             finally:
                 os.remove(curl_input_file)
Example #33
def load_submission(doctype, dump, method=None):
    "Insert submission into database. Return tuple(error code, msg)"
    # NOT TESTED
    messages = []

    def guess_dump_method(dump):
        """Guess which method was used to dump this file (i.e. if it contains all the submission rows or not)"""
        match_obj = re_method_pattern.search(dump)
        if match_obj:
            return match_obj.group('method')
        else:
            return None

    def guess_dump_has_delete_statements(dump):
        """Guess if given submission dump already contain delete statements"""
        return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower()

    if not method:
        method = guess_dump_method(dump)
        if method is None:
            method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
            messages.append(
                "WARNING: method could not be guessed. Using method %s" %
                method)
        else:
            messages.append("Used method %s to load data" % method)

    (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR)
    dump_fd = open(dump_path, 'w')
    dump_fd.write(dump)
    dump_fd.close()

    # We need to remove the submission. But let's create a backup first.
    submission_backup = dump_submission(doctype, method)
    submission_backup_path = "%s_db_dump%s.sql" % (
        doctype, time.strftime("%Y%m%d_%H%M%S"))
    fd = file(os.path.join(CFG_TMPDIR, submission_backup_path), "w")
    fd.write(submission_backup)
    fd.close()
    if not guess_dump_has_delete_statements(dump):
        remove_submission(doctype, method)

    # Load the dump
    (exit_code, out_msg,
     err_msg) = run_shell_command("%s/bin/dbexec < %s",
                                  (CFG_PREFIX, os.path.abspath(dump_path)))
    if exit_code:
        messages.append("ERROR: failed to load submission:" + err_msg)
        return (1, messages)

    messages.append("Submission loaded. Previous submission saved to %s" %
                    os.path.join(CFG_TMPDIR, submission_backup_path))
    return (0, messages)
Example #34
def get_text_snippets(textfile_path, patterns, nb_chars, max_snippets):
    """
    Extract text snippets around 'patterns' from the file found at
    'textfile_path'. The snippets are meant to look similar to results of
    popular Internet search engines: using " ... " between snippets.
    For empty patterns it returns ""
    """
    # TODO: - distinguish the beginning of sentences and make the snippets
    #         start there
    #       - optimize finding patterns - first search for patterns appearing
    #         next to each other, then look for each pattern rather than only
    #         the first occurrence of any pattern

    if len(patterns) == 0:
        return ""

    max_lines = nb_chars / 40 + 2  # rule of thumb in order to catch nb_chars
    # Produce the big snippets from which the real snippets will be cut out
    cmd = "grep -i -C%s -m%s"
    cmdargs = [str(max_lines), str(max_snippets)]
    for p in patterns:
        cmd += " -e %s"
        cmdargs.append(" " + p)
    cmd += " %s"
    cmdargs.append(textfile_path)
    (dummy1, output, dummy2) = run_shell_command(cmd, cmdargs)
    # a fact to keep in mind with this call to grep is that if patterns appear
    # in two contiguous lines, they will not be separated by '--' and therefore
    # treated as one 'big snippet'
    result = []
    big_snippets = output.split("--")

    # cut the snippets to match the nb_words_around parameter precisely:
    for s in big_snippets:
        small_snippet = cut_out_snippet(s, patterns, nb_chars)
        result.append(small_snippet)

    # combine snippets
    out = ""
    count = 0
    for snippet in result:
        if snippet and count < max_snippets:
            if out:
                out += "..."
            out += highlight(snippet, patterns, whole_word_matches=True)

    return out
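A hypothetical call (path and patterns invented): for a full-text file and two search terms, this returns up to three highlighted snippets joined by "...":

snippets = get_text_snippets('/tmp/record_fulltext.txt',
                             ['higgs', 'boson'],
                             nb_chars=200, max_snippets=3)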
Example #36
def _grep_version_from_executable(path_to_exec, version_regexp):
    """Try to detect a program version.

    Grep in its binary PATH_TO_EXEC and looking for VERSION_REGEXP.  Return
    program version as a string.  Return empty string if not succeeded.
    """
    from invenio.utils.shell import run_shell_command
    exec_version = ""
    if os.path.exists(path_to_exec):
        dummy1, cmd2_out, dummy2 = run_shell_command(
            "strings %s | grep %s", (path_to_exec, version_regexp))
        if cmd2_out:
            for cmd2_out_line in cmd2_out.split("\n"):
                if len(cmd2_out_line) > len(exec_version):
                    # the longer, the better
                    exec_version = cmd2_out_line
    return exec_version
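A hypothetical call (binary path invented), mirroring how the version() helper elsewhere in this listing uses this function for Apache:

apache_version = _grep_version_from_executable('/usr/sbin/httpd', r'^Apache\/')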
Example #38
def load_submission(doctype, dump, method=None):
    "Insert submission into database. Return tuple(error code, msg)"
    # NOT TESTED
    messages = []
    def guess_dump_method(dump):
        """Guess which method was used to dump this file (i.e. if it contains all the submission rows or not)"""
        match_obj = re_method_pattern.search(dump)
        if match_obj:
            return match_obj.group('method')
        else:
            return None

    def guess_dump_has_delete_statements(dump):
        """Guess if given submission dump already contain delete statements"""
        return "DELETE FROM sbmDOCTYPE WHERE sdocname".lower() in dump.lower()

    if not method:
        method = guess_dump_method(dump)
        if method is None:
            method = CFG_WEBSUBMIT_DUMPER_DEFAULT_METHOD
            messages.append("WARNING: method could not be guessed. Using method %s" % method)
        else:
            messages.append("Used method %s to load data" % method)

    (dump_code, dump_path) = tempfile.mkstemp(prefix=doctype, dir=CFG_TMPDIR)
    dump_fd = open(dump_path, 'w')
    dump_fd.write(dump)
    dump_fd.close()

    # We need to remove the submission. But let's create a backup first.
    submission_backup = dump_submission(doctype, method)
    submission_backup_path = "%s_db_dump%s.sql" % (doctype, time.strftime("%Y%m%d_%H%M%S"))
    fd = file(os.path.join(CFG_TMPDIR, submission_backup_path), "w")
    fd.write(submission_backup)
    fd.close()
    if not guess_dump_has_delete_statements(dump):
        remove_submission(doctype, method)

    # Load the dump
    (exit_code, out_msg, err_msg) = run_shell_command("%s/bin/dbexec < %s", (CFG_PREFIX, os.path.abspath(dump_path)))
    if exit_code:
        messages.append("ERROR: failed to load submission:" + err_msg)
        return (1, messages)

    messages.append("Submission loaded. Previous submission saved to %s" % os.path.join(CFG_TMPDIR, submission_backup_path))
    return (0, messages)
Example #39
def version(separator='\n', formatting='{version} [{executable}]'):
    """
    Try to detect the Apache version by locating httpd or apache
    executables and grepping inside the binaries.  Return a list of all
    found Apache versions and paths.  (For a given executable, the
    returned format is 'apache_version [apache_path]'.)  Return empty
    list if no success.
    """
    from invenio.utils.shell import run_shell_command
    out = []
    dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache")
    for apache in cmd_out.split("\n"):
        apache_version = _grep_version_from_executable(apache, r'^Apache\/')
        if apache_version:
            out.append(
                formatting.format(version=apache_version, executable=apache))
    if separator is None:
        return out
    return separator.join(out)
Example #40
def version(separator='\n', formatting='{version} [{executable}]'):
    """
    Try to detect the Apache version by locating httpd or apache
    executables and grepping inside the binaries.  Return a list of all
    found Apache versions and paths.  (For a given executable, the
    returned format is 'apache_version [apache_path]'.)  Return empty
    list if no success.
    """
    from invenio.utils.shell import run_shell_command
    out = []
    dummy1, cmd_out, dummy2 = run_shell_command("locate bin/httpd bin/apache")
    for apache in cmd_out.split("\n"):
        apache_version = _grep_version_from_executable(apache, r'^Apache\/')
        if apache_version:
            out.append(formatting.format(version=apache_version,
                                         executable=apache))
    if separator is None:
        return out
    return separator.join(out)
Example #41
def convert_images(image_list):
    """
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    """
    png_output_contains = 'PNG image'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.

        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if cmd_out.find(png_output_contains) > -1:
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)
            cmd_list = ['convert', image_file, converted_image_file]
            try:
                dummy1, cmd_out, cmd_err = run_process_with_timeout(cmd_list)
                if cmd_err == '' or os.path.exists(converted_image_file):
                    ret_list.append(converted_image_file)
                else:
                    write_message('convert failed on ' + image_file)
            except Timeout:
                write_message('convert timed out on ' + image_file)

    return ret_list
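The FIXME above asks for timeouts on these probes; the run_process_with_timeout helper already used here for convert takes a list of arguments (no shell, so nothing to escape) and returns the same (exitcode, stdout, stderr) shape. A sketch of the safer 'file' probe under that interface:

try:
    dummy1, cmd_out, dummy2 = run_process_with_timeout(['file', image_file])
except Timeout:
    cmd_out = ''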
Example #42
def convert_images(image_list):
    """
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    """
    png_output_contains = 'PNG image'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.

        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file, ))
        if cmd_out.find(png_output_contains) > -1:
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)
            cmd_list = ['convert', image_file, converted_image_file]
            try:
                dummy1, cmd_out, cmd_err = run_process_with_timeout(cmd_list)
                if cmd_err == '':
                    ret_list.append(converted_image_file)
                else:
                    write_message('convert failed on ' + image_file)
            except Timeout:
                write_message('convert timed out on ' + image_file)

    return ret_list
Example #43
def read_metadata_local(inputfile, verbose):
    """
    Metadata extraction from many kinds of files

    @param inputfile: path to the image
    @type inputfile: string
    @param verbose: verbosity
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    cmd = CFG_PATH_PDFTK + ' %s dump_data'
    (exit_status, output_std, output_err) = \
                      run_shell_command(cmd, args=(inputfile,))
    metadata_dict = {}
    key = None
    value = None
    for metadata_line in output_std.splitlines():
        if metadata_line.strip().startswith("InfoKey"):
            key = metadata_line.split(':', 1)[1].strip()
        elif metadata_line.strip().startswith("InfoValue"):
            value = metadata_line.split(':', 1)[1].strip()
            if key in ["ModDate", "CreationDate"]:
                # FIXME: Interpret these dates?
                try:
                    pass
                    #value = datetime.strptime(value, "D:%Y%m%d%H%M%S%Z")
                except:
                    pass
            if key:
                metadata_dict[key] = value
                key = None
        else:
            try:
                custom_key, custom_value = metadata_line.split(':', 1)
                metadata_dict[custom_key.strip()] = custom_value.strip()
            except:
                # Most probably not relevant line
                pass

    return metadata_dict
Example #44
def read_metadata_local(inputfile, verbose):
    """
    Metadata extraction from many kinds of files

    @param inputfile: path to the image
    @type inputfile: string
    @param verbose: verbosity
    @type verbose: int
    @rtype: dict
    @return: dictionary with metadata
    """
    cmd = CFG_PATH_PDFTK + ' %s dump_data'
    (exit_status, output_std, output_err) = \
                      run_shell_command(cmd, args=(inputfile,))
    metadata_dict = {}
    key = None
    value = None
    for metadata_line in output_std.splitlines():
        if metadata_line.strip().startswith("InfoKey"):
            key = metadata_line.split(':', 1)[1].strip()
        elif metadata_line.strip().startswith("InfoValue"):
            value = metadata_line.split(':', 1)[1].strip()
            if key in ["ModDate", "CreationDate"]:
                # FIXME: Interpret these dates?
                try:
                    pass
                    #value = datetime.strptime(value, "D:%Y%m%d%H%M%S%Z")
                except:
                    pass
            if key:
                metadata_dict[key] = value
                key = None
        else:
            try:
                custom_key, custom_value = metadata_line.split(':', 1)
                metadata_dict[custom_key.strip()] = custom_value.strip()
            except:
                # Most probably not relevant line
                pass

    return metadata_dict
Example #45
def check_for_gzip(tfile):
    """
    Was that tarball also gzipped?  Let's find out!

    @param: tfile (string): the name of the object (so we can gunzip, if
        that's necessary)

    @output: a gunzipped file in the directory of choice, if that's necessary

    @return new_file (string): The name of the file after gunzipping or the
        original name of the file if that wasn't necessary
    """

    gzip_contains = 'gzip compressed data'
    dummy1, cmd_out, dummy2 = run_shell_command('file %s', (tfile, ))

    if cmd_out.find(gzip_contains) > -1:
        # we have a gzip!
        # gzip won't accept a file whose name doesn't end with .gz,
        # so work on a renamed copy first
        run_shell_command('cp %s %s', (tfile, tfile + '.tar.gz'))
        new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
        run_shell_command('touch %s', (new_dest, ))
        dummy1, cmd_out, cmd_err = run_shell_command('gunzip -c %s',
                                                     (tfile + '.tar.gz', ))
        if cmd_err != '':
            write_message('Error while gunzipping ' + tfile)
            return tfile

        tarfile = open(new_dest, 'w')
        tarfile.write(cmd_out)
        tarfile.close()

        run_shell_command('rm %s', (tfile + '.tar.gz', ))
        return new_dest

    return tfile
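The cp/gunzip/rm shell round-trip can be avoided with Python's gzip module, which does not care about the .gz suffix. A sketch of an equivalent (gunzip_if_needed is a hypothetical name; it keeps the same "write tmp.tar next to the input" behaviour):

import gzip
import os
import shutil

def gunzip_if_needed(tfile):
    """Return the path of an uncompressed copy of tfile, or tfile itself."""
    with open(tfile, 'rb') as probe:
        if probe.read(2) != b'\x1f\x8b':  # gzip magic number
            return tfile
    new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
    with gzip.open(tfile, 'rb') as gz_in, open(new_dest, 'wb') as tar_out:
        shutil.copyfileobj(gz_in, tar_out)
    return new_dest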
Example #46
def check_for_gzip(tfile):
    """
    Was that tarball also gzipped?  Let's find out!

    @param: tfile (string): the name of the object (so we can gunzip, if
        that's necessary)

    @output: a gunzipped file in the directory of choice, if that's necessary

    @return new_file (string): The name of the file after gunzipping or the
        original name of the file if that wasn't necessary
    """

    gzip_contains = 'gzip compressed data'
    dummy1, cmd_out, dummy2 = run_shell_command('file %s', (tfile,))

    if cmd_out.find(gzip_contains) > -1:
        # we have a gzip!
        # gzip won't accept a file whose name doesn't end with .gz,
        # so work on a renamed copy first
        run_shell_command('cp %s %s', (tfile, tfile + '.tar.gz'))
        new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
        run_shell_command('touch %s', (new_dest,))
        dummy1, cmd_out, cmd_err = run_shell_command('gunzip -c %s',
                                                            (tfile + '.tar.gz',))
        if cmd_err != '':
            write_message('Error while gunzipping ' + tfile)
            return tfile

        tarfile = open(new_dest, 'w')
        tarfile.write(cmd_out)
        tarfile.close()

        run_shell_command('rm %s', (tfile + '.tar.gz',))
        return new_dest

    return tfile
Example #47
    def _find_and_run_js_test_files():
        """Find and run all the JavaScript files.

        Find all JS files installed in Invenio lib directory and run
        them on the JsTestDriver server
        """
        from invenio.utils.shell import run_shell_command
        errors_found = 0
        for candidate in os.listdir(CFG_WEBDIR + "/js"):
            base, ext = os.path.splitext(candidate)

            if ext != '.js' or not base.endswith('_tests'):
                continue

            print("Found test file %s. Running tests... " % (base + ext))
            exitcode_, stdout, stderr_ = run_shell_command(
                cmd="java -jar %s/JsTestDriver.jar --config %s --tests all" %
                (CFG_PREFIX + "/lib/java/js-test-driver",
                 CFG_WEBDIR + "/js/" + base + '.conf'))
            print(stdout)
            if "Fails: 0" not in stdout:
                errors_found += 1
        print(errors_found)
        return errors_found
Example #49
    def _run_rt_command(self, command, uid=None):
        """
        This function will run an RT CLI command as the given user. If no user is
        specified, the default RT user will be used, if configured.

        Should any of the configuration parameters be missing this function will return
        None. Otherwise it will return the standard output from the CLI command.

        @param command: RT CLI command to execute
        @type command: string

        @param uid: the Invenio user id to submit on behalf of. Optional.
        @type uid: int

        @return: standard output from the command given. None, if any errors.
        @rtype: string
        """
        if not CFG_BIBCATALOG_SYSTEM_RT_URL:
            return None
        if uid:
            username, passwd = get_bibcat_from_prefs(uid)
        else:
            username = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_USER
            passwd = CFG_BIBCATALOG_SYSTEM_RT_DEFAULT_PWD
        httppart, siteandpath = CFG_BIBCATALOG_SYSTEM_RT_URL.split("//")
        bibcatalog_rt_server = httppart + "//" + username + ":" + passwd + "@" + siteandpath
        # set credentials as environment variables for the RT CLI
        os.environ["RTUSER"] = username
        os.environ["RTSERVER"] = bibcatalog_rt_server
        passwd = escape_shell_arg(passwd)
        error_code, myout, error_output = run_shell_command("echo " + passwd +
                                                            " | " + command)
        if error_code > 0:
            raise ValueError('Problem running "%s": %d - %s' %
                             (command, error_code, error_output))
        return myout
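
# A hedged usage sketch for _run_rt_command: 'rt edit' is standard RT CLI
# syntax, but the ticket number, field and uid below are illustrative.
#
#     out = self._run_rt_command("rt edit ticket/123 set status=resolved",
#                                uid=1)
#     if out is None:
#         ...  # RT is not configured, so the command was never run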
Example #50
def filter_step(obj, eng):
    """Run an external python script."""
    from invenio.modules.records.api import Record
    from invenio.utils.shell import run_shell_command

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})
    script_name = arguments.get("f_filter-file")
    if script_name:
        marcxml_value = Record(obj.data.dumps()).legacy_export_as_marc()
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)

        # Write the record's MARCXML to a file for the filter script
        marcxmlfile = os.path.join(extract_path, str(obj.id))
        file_fd = open(marcxmlfile, 'w')
        file_fd.write(marcxml_value)
        file_fd.close()

        exitcode, cmd_stdout, cmd_stderr = run_shell_command(
            cmd="%s '%s'",
            args=(str(script_name),
                  str(marcxmlfile)))
        if exitcode != 0 or cmd_stderr != "":
            obj.log.error(
                "Error while running filtering script on %s\nError:%s"
                % (marcxmlfile, cmd_stderr)
            )
        else:
            obj.log.info(cmd_stdout)
    else:
        obj.log.error("No script file found!")
Example #51
    def _plot_extract(obj, eng):
        """
        Performs the plotextraction step.
        """
        # Download the tarball for each harvested/converted record, then run
        # the plotextractor. Update converted XML files with the generated
        # XML, or add it for upload.
        bibtask.task_sleep_now_if_required()
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        if 'p_extraction-source' not in obj.extra_data["_repository"]["arguments"]:
            p_extraction_source = plotextractor_types
        else:
            p_extraction_source = obj.extra_data["_repository"]["arguments"]['p_extraction-source']

        if not isinstance(p_extraction_source, list):
            p_extraction_source = [p_extraction_source]

        if 'latex' in p_extraction_source:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                # turn oaiharvest_23_1_20110214161632_converted -> oaiharvest_23_1_material
                # so that harvested material stays in the same folder structure
                extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid)
                tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"])
                if tarball is None:
                    raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" %
                                                        (obj.data["system_number_external"]["value"], extract_path)),
                                                        eng.uuid,
                                                        id_object=obj.id)
                tarball = str(tarball)

                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            sub_dir, refno = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

            tex_files = None
            image_list = []
            try:
                extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
            except Timeout:
                eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

            converted_image_list = convert_images(image_list)
            eng.log.info('converted %d of %d images found for %s' % (len(converted_image_list),
                                                                     len(image_list),
                                                                     os.path.basename(tarball)))
            extracted_image_data = []
            if tex_files == [] or tex_files is None:
                eng.log.error('%s is not a tarball' % (os.path.split(tarball)[-1],))
                run_shell_command('rm -r %s', (sub_dir,))
            else:
                for tex_file in tex_files:
                    # Extract images, captions and labels
                    partly_extracted_image_data = extract_captions(tex_file, sub_dir,
                                                                   converted_image_list)
                    if partly_extracted_image_data:
                        # Add proper filepaths and do various cleaning
                        cleaned_image_data = prepare_image_data(partly_extracted_image_data,
                                                                tex_file, converted_image_list)
                        # Using prev. extracted info, get contexts for each image found
                        extracted_image_data.extend(extract_context(tex_file, cleaned_image_data))

            if extracted_image_data:
                extracted_image_data = remove_dups(extracted_image_data)
                create_contextfiles(extracted_image_data)
                marc_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
                marc_xml += create_MARC(extracted_image_data, tarball, None)
                marc_xml += "\n</collection>"

                if marc_xml:
                    # Grab the MARCXML generated by the plotextractor run
                    new_dict_representation = records_api.create_record(marc_xml,
                                                                        master_format="marc").dumps()
                    try:
                        obj.data['fft'].append(new_dict_representation["fft"])
                    except KeyError:
                        obj.data['fft'] = [new_dict_representation['fft']]
                    obj.add_task_result("filesfft", new_dict_representation["fft"])
                    obj.add_task_result("number_picture_converted", len(converted_image_list))
                    obj.add_task_result("number_of_picture_total", len(image_list))
Example #52
def write_metadata_local(inputfile, outputfile, metadata_dictionary, verbose):
    """
    Metadata write method, takes the .pdf as input and creates a new
    one with the new info.

    @param inputfile: path to the pdf
    @type inputfile: string
    @param outputfile: path to the resulting pdf
    @type outputfile: string
    @param verbose: verbosity
    @type verbose: int
    @param metadata_dictionary: metadata information to update inputfile
    @type metadata_dictionary: dict
    """
    # Take the file name (0 base, 1 name, 2 ext)
    filename = decompose_file(inputfile)[1]

    # Print pdf metadata
    if verbose > 1:
        print('Metadata information in the PDF file ' + filename + ': \n')
        exitcode, dump_out, dummy = run_shell_command('%s %s dump_data',
                                                      (CFG_PATH_PDFTK,
                                                       inputfile))
        if exitcode != 0:
            print('Problem with inputfile to PDFTK')
        else:
            print(dump_out)

    # Info file for pdftk
    (fd, path_to_info) = tempfile.mkstemp(prefix="wsm_pdf_plugin_info_",
                                          dir=CFG_TMPDIR)
    os.close(fd)
    file_in = open(path_to_info, 'w')
    if verbose > 5:
        print("Saving PDFTK info file to %s" % path_to_info)

    # User interaction to form the info file
    # Main case: dictionary received through option -d
    if metadata_dictionary:
        for tag in metadata_dictionary:
            line = 'InfoKey: ' + tag + '\nInfoValue: ' + \
                   metadata_dictionary[tag] + '\n'
            if verbose > 0:
                print(line)
            file_in.writelines(line)
    else:
        data_modified = False
        user_input = 'user_input'
        print("Entering interactive mode. Choose what you want to do:")
        while user_input:
            if not data_modified:
                try:
                    user_input = raw_input('[w]rite / [q]uit\n')
                except (EOFError, KeyboardInterrupt):
                    print("Aborting")
                    return
            else:
                try:
                    user_input = raw_input(
                        '[w]rite / [q]uit and apply / [a]bort \n')
                except (EOFError, KeyboardInterrupt):
                    print("Aborting")
                    return
            if user_input == 'q':
                if not data_modified:
                    return
                break
            elif user_input == 'w':
                try:
                    tag = raw_input('Tag to update:\n')
                    value = raw_input('With value:\n')
                except (EOFError, KeyboardInterrupt):
                    print("Aborting")
                    return
                # Write to info file
                line = 'InfoKey: ' + tag + '\nInfoValue: ' + value + '\n'
                data_modified = True
                file_in.writelines(line)
            elif user_input == 'a':
                return
            else:
                print("Invalid option: ")
    file_in.close()

    (fd, pdf_temp_path) = tempfile.mkstemp(prefix="wsm_pdf_plugin_pdf_",
                                           dir=CFG_TMPDIR)
    os.close(fd)

    # Now we call the pdftk tool to update the info on the pdf
    cmd_pdftk = '%s %s update_info %s output %s'
    (exit_status, output_std, output_err) = \
                  run_shell_command(cmd_pdftk,
                                    args=(CFG_PATH_PDFTK, inputfile,
                                          path_to_info, pdf_temp_path))
    if verbose > 5:
        print(output_std, output_err)

    if os.path.exists(pdf_temp_path):
        # Move to final destination if exist
        try:
            shutil.move(pdf_temp_path, outputfile)
        except Exception as err:
            raise InvenioWebSubmitFileMetadataRuntimeError(
                "Could not move %s to %s: %s" %
                (pdf_temp_path, outputfile, err))
    else:
        # Something bad happened
        raise InvenioWebSubmitFileMetadataRuntimeError(
            "Could not update metadata " + output_err)
Example #53
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs.  If they
    'contain' only the TeX file, then they are just that file.), then
    we untar it if so and decide which of its constituents are the
    TeX file and which are the images.

    @param: tarball (string): the name of the tar file from arXiv
    @param: dir (string): the directory where we would like it untarred to

    @return: (image_list, tex_file) (([string, string, ...], string)):
        list of images in the tarball and the name of the TeX file in the
        tarball.
    """

    tarball = check_for_gzip(original_tarball)
    dummy1, cmd_out, cmd_err = run_shell_command('file %s', (tarball, ))
    tarball_output = 'tar archive'
    if re.search(tarball_output, cmd_out) is None:
        run_shell_command('rm %s', (tarball, ))
        return ([], [], None)
    cmd_list = ['tar', 'xvf', tarball, '-C', sdir]
    dummy1, cmd_out, cmd_err = run_process_with_timeout(cmd_list)

    if cmd_err != '':
        return ([], [], None)
    if original_tarball != tarball:
        run_shell_command('rm %s', (tarball, ))
    extracted_names = cmd_out.split('\n')

    tex_output_contains = 'TeX'

    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'

    file_list = []
    image_list = []
    might_be_tex = []

    for extracted_file in extracted_names:
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, cmd_out, dummy2 = run_shell_command('file %s',
                                                    (extracted_file, ))

        # is it TeX?
        if cmd_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif cmd_out.lower().find(image_output_contains) > cmd_out.find(':') \
                or \
                cmd_out.lower().find(eps_output_contains) > cmd_out.find(':')\
                or \
                cmd_out.find(ps_output_contains) > cmd_out.find(':'):
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        else:
            if extracted_file.split('.')[-1].lower() == tex_file_extension:
                # we might have tex source!
                might_be_tex.append(extracted_file)
            elif extracted_file.split('.')[-1] in ['eps', 'png', \
                    'ps', 'jpg', 'pdf']:
                # we might have an image!
                image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])

    return (file_list, image_list, might_be_tex)
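
# A hedged usage sketch for untar, mirroring how the surrounding examples
# call it (the tarball path is hypothetical):
#
#     sub_dir, dummy = get_defaults('/tmp/arXiv-1234.tar',
#                                   cfg['CFG_TMPDIR'], "")
#     file_list, image_list, tex_files = untar('/tmp/arXiv-1234.tar', sub_dir)
#     if not tex_files:
#         ...  # not a tarball, or no TeX source found inside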
Example #54
def cli_upload(req, file_content=None, mode=None, callback_url=None, nonce=None, special_treatment=None):
    """ Robot interface for uploading MARC files
    """
    req.content_type = "text/plain"

    # check IP and useragent:
    if not _get_client_authorized_collections(_get_client_ip(req)):
        msg = "[ERROR] Sorry, client IP %s cannot use the service." % _get_client_ip(req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)
    if not _check_client_useragent(req):
        msg = "[ERROR] Sorry, the %s useragent cannot use the service." % _get_useragent(req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)

    arg_mode = mode
    if not arg_mode:
        msg = "[ERROR] Please specify upload mode to use."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)
    if arg_mode == '--insertorreplace':
        arg_mode = '-ir'
    if arg_mode not in PERMITTED_MODES:
        msg = "[ERROR] Invalid upload mode."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)

    arg_file = file_content
    if hasattr(arg_file, 'read'):
        ## We've been passed a readable file, e.g. req
        arg_file = arg_file.read()
        if not arg_file:
            msg = "[ERROR] Please provide a body to your request."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
    else:
        if not arg_file:
            msg = "[ERROR] Please specify file body to input."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
        if hasattr(arg_file, "filename"):
            arg_file = arg_file.value
        else:
            msg = "[ERROR] 'file' parameter must be a (single) file"
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)

    # write temporary file:
    (fd, filename) = tempfile.mkstemp(prefix="batchupload_" + \
               time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
               dir=CFG_TMPSHAREDDIR)

    filedesc = os.fdopen(fd, 'w')
    filedesc.write(arg_file)
    filedesc.close()

    # check if this client can run this file:
    client_ip = _get_client_ip(req)
    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if '*' not in permitted_dbcollids: # wildcard
        allow = _check_client_can_submit_file(client_ip, filename, req, 0)
        if not allow:
            msg = "[ERROR] Cannot submit such a file from this IP. (Wrong collection.)"
            _log(msg)
            req.status = HTTP_FORBIDDEN
            return _write(req, msg)

    # check validity of marcxml
    xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
    xmlmarclint_exit_code, dummy1, dummy2 = run_shell_command(
        '%s %s', (xmlmarclint_path, filename))
    if xmlmarclint_exit_code != 0:
        msg = "[ERROR] MARCXML is not valid."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)
    args = ['bibupload', "batchupload", arg_mode, filename]
    # run upload command
    if callback_url:
        args += ["--callback-url", callback_url]
        if nonce:
            args += ["--nonce", nonce]
        if special_treatment:
            args += ["--special-treatment", special_treatment]
    task_low_level_submission(*args)
    msg = "[INFO] %s" % ' '.join(args)
    _log(msg)
    return _write(req, msg)
Example #55
def dump_database(dump_path, host=CFG_DATABASE_HOST, port=CFG_DATABASE_PORT, \
                  user=CFG_DATABASE_USER, passw=CFG_DATABASE_PASS, \
                  name=CFG_DATABASE_NAME, params=None, compress=False, \
                  ignore_tables=None):
    """
    Dump Invenio database into SQL file located at DUMP_PATH.

    Will perform the command to mysqldump with the given host configuration
    and user credentials.

    Optional mysqldump parameters can also be passed. Otherwise, a default
    set of parameters will be used.

    @param dump_path: path on the filesystem to save the dump to.
    @type dump_path: string

    @param host: hostname of mysql database node to connect to.
    @type host: string

    @param port: port of mysql database node to connect to
    @type port: string

    @param user: username to connect with
    @type user: string

    @param passw: password to connect to with
    @type passw: string

    @param name: name of mysql database node to dump
    @type name: string

    @param params: command line parameters to pass to mysqldump. Optional.
    @type params: string

    @param compress: should the dump be compressed through gzip?
    @type compress: bool

    @param ignore_tables: list of tables to ignore in the dump
    @type ignore_tables: list of string
    """
    write_message("... writing %s" % (dump_path,))

    partial_dump_path = dump_path + ".part"

    # Is mysqldump installed or in the right path?
    cmd_prefix = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd_prefix):
        raise StandardError("%s is not installed." % (cmd_prefix))

    if not params:
        # No parameters set, lets use the default ones.
        params = " --skip-opt --add-drop-table --add-locks --create-options" \
                 " --quick --extended-insert --set-charset --disable-keys" \
                 " --lock-tables=false --max_allowed_packet=2G "

    if ignore_tables:
        params += " " + " ".join(
            [escape_shell_arg("--ignore-table=%s.%s" % (name, table))
             for table in ignore_tables])

    dump_cmd = "%s %s " \
               " --host=%s --port=%s --user=%s --password=%s %s" % \
               (cmd_prefix, \
                params, \
                escape_shell_arg(host), \
                escape_shell_arg(str(port)), \
                escape_shell_arg(user), \
                escape_shell_arg(passw), \
                escape_shell_arg(name))

    if compress:
        dump_cmd = "%s | %s -cf; exit ${PIPESTATUS[0]}" % \
                   (dump_cmd, \
                    CFG_PATH_GZIP)
        dump_cmd = "bash -c %s" % (escape_shell_arg(dump_cmd),)

    write_message(dump_cmd, verbose=2)

    exit_code, stdout, stderr = run_shell_command(dump_cmd, None, partial_dump_path)

    if exit_code:
        raise StandardError("ERROR: mysqldump exit code is %s. stderr: %s stdout: %s" % \
                            (repr(exit_code), \
                             repr(stderr), \
                             repr(stdout)))
    else:
        os.rename(partial_dump_path, dump_path)
        write_message("... completed writing %s" % (dump_path,))
Example #56
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs.  If they
    'contain' only the TeX file, then they are just that file.), then
    we untar it if so and decide which of its constituents are the
    TeX file and which are the images.

    @param: tarball (string): the name of the tar file from arXiv
    @param: dir (string): the directory where we would like it untarred to

    @return: (image_list, tex_file) (([string, string, ...], string)):
        list of images in the tarball and the name of the TeX file in the
        tarball.
    """

    if not tarfile.is_tarfile(original_tarball):
        return ([], [], None)

    tarball = tarfile.open(original_tarball)
    tarball.extractall(sdir)

    tex_output_contains = 'TeX'

    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'

    file_list = []
    image_list = []
    might_be_tex = []

    for extracted_file in tarball.getnames():
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (extracted_file,))

        # is it TeX?
        if cmd_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif cmd_out.lower().find(image_output_contains) > cmd_out.find(':') \
                or \
                cmd_out.lower().find(eps_output_contains) > cmd_out.find(':')\
                or \
                cmd_out.find(ps_output_contains) > cmd_out.find(':'):
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        else:
            if extracted_file.split('.')[-1].lower() == tex_file_extension:
                # we might have tex source!
                might_be_tex.append(extracted_file)
            elif extracted_file.split('.')[-1] in ['eps', 'png', \
                    'ps', 'jpg', 'pdf']:
                # we might have an image!
                image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])
    return (file_list, image_list, might_be_tex)
Example #57
def Move_Files_to_Storage(parameters, curdir, form, user_info=None):
    """
    The function moves files received from the standard submission's
    form through file input element(s). The document are assigned a
    'doctype' (or category) corresponding to the file input element
    (eg. a file uploaded throught 'DEMOPIC_FILE' will go to
    'DEMOPIC_FILE' doctype/category).

    Websubmit engine builds the following file organization in the
    directory curdir/files:

                  curdir/files
                        |
      _____________________________________________________________________
            |                                   |                          |
      ./file input 1 element's name      ./file input 2 element's name    ....
         (for eg. 'DEMOART_MAILFILE')       (for eg. 'DEMOART_APPENDIX')
         |                                     |
      test1.pdf                             test2.pdf


    There is only one instance of each possible extension (pdf, gz, ...)
    in each part, otherwise we may encounter problems when renaming files.

    + parameters['rename']: if given, all the files in curdir/files
      are renamed.  parameters['rename'] is of the form:
      <PA>elemfilename[re]</PA>* where re is a regexp to select (using
      re.sub) what part of the elem file has to be selected.
      e.g: <PA>file:TEST_FILE_RN</PA>

    + parameters['documenttype']: if given, other formats are created.
      It has 2 possible values:
        - "picture": an icon in GIF format is created
        - "fulltext": ps, gz, ... formats are created

    + parameters['paths_and_suffixes']: directories to look into and
      corresponding suffix to add to every file inside. It must have
      the same structure as a Python dictionary of the following form:
      {'FrenchAbstract':'french', 'EnglishAbstract':''}

      The keys are the file input element name from the form <=>
      directories in curdir/files The values associated are the
      suffixes which will be added to all the files in
      e.g. curdir/files/FrenchAbstract

    + parameters['iconsize']: needed only if 'picture' is selected in
      parameters['documenttype']

    + parameters['paths_and_restrictions']: the restrictions to apply
      to each uploaded file. The parameter must have the same
      structure as a Python dictionary of the following form:
      {'DEMOART_APPENDIX':'restricted'}
      Files not specified in this parameter are not restricted.
      The specified restrictions can include a variable that can be
      replaced at runtime, for eg:
      {'DEMOART_APPENDIX':'restricted to <PA>file:SuE</PA>'}

    + parameters['paths_and_doctypes']: if a doctype is specified,
      the file will be saved under the 'doctype/collection' instead
      of under the default doctype/collection given by the name
      of the upload element that was used on the websubmit interface.
      To configure the doctype in websubmit, enter the value as in a
      dictionary, e.g.:
      {'PATHS_SWORD_UPL' : 'PUSHED_TO_ARXIV'} -> from
      Demo_Export_Via_Sword [DEMOSWR] Document Types
    """

    global sysno
    paths_and_suffixes = parameters['paths_and_suffixes']
    paths_and_restrictions = parameters['paths_and_restrictions']
    rename = parameters['rename']
    documenttype = parameters['documenttype']
    iconsizes = parameters['iconsize'].split(',')
    paths_and_doctypes = parameters['paths_and_doctypes']

    ## Create an instance of BibRecDocs for the current recid(sysno)
    bibrecdocs = BibRecDocs(sysno)

    paths_and_suffixes = get_dictionary_from_string(paths_and_suffixes)

    paths_and_restrictions = get_dictionary_from_string(paths_and_restrictions)

    paths_and_doctypes = get_dictionary_from_string(paths_and_doctypes)

    ## Go through all the directories specified in the keys
    ## of parameters['paths_and_suffixes']
    for path in paths_and_suffixes.keys():
        ## Check if there is a directory for the current path
        if os.path.exists("%s/files/%s" % (curdir, path)):
            ## Retrieve the restriction to apply to files in this
            ## directory
            restriction = paths_and_restrictions.get(path, '')
            restriction = re.sub('<PA>(?P<content>[^<]*)</PA>',
                                 get_pa_tag_content, restriction)

            ## Go through all the files in curdir/files/path
            for current_file in os.listdir("%s/files/%s" % (curdir, path)):
                ## retrieve filename and extension
                dummy, filename, extension = decompose_file(current_file)
                if extension and extension[0] != ".":
                    extension = '.' + extension
                if len(paths_and_suffixes[path]) != 0:
                    extension = "_%s%s" % (paths_and_suffixes[path], extension)
                ## Build the new file name if rename parameter has been given
                if rename:
                    filename = re.sub('<PA>(?P<content>[^<]*)</PA>', \
                                      get_pa_tag_content, \
                                      parameters['rename'])

                if rename or len(paths_and_suffixes[path]) != 0:
                    ## Rename the file
                    try:
                        # Write the log rename_cmd
                        fd = open("%s/rename_cmd" % curdir, "a+")
                        fd.write("%s/files/%s/%s" % (curdir, path, current_file) + " to " +\
                                  "%s/files/%s/%s%s" % (curdir, path, filename, extension) + "\n\n")
                        ## Rename
                        os.rename("%s/files/%s/%s" % (curdir, path, current_file), \
                                  "%s/files/%s/%s%s" % (curdir, path, filename, extension))

                        fd.close()
                        ## Save the new name in a text file in curdir so that
                        ## the new filename can be used by templates to create the recmysl
                        fd = open("%s/%s_RENAMED" % (curdir, path), "w")
                        fd.write("%s%s" % (filename, extension))
                        fd.close()
                    except OSError as err:
                        msg = "Cannot rename the file.[%s]"
                        msg %= str(err)
                        raise InvenioWebSubmitFunctionWarning(msg)
                fullpath = "%s/files/%s/%s%s" % (curdir, path, filename,
                                                 extension)
                ## Check if there is any existing similar file
                if not bibrecdocs.check_file_exists(fullpath, extension):
                    bibdoc = bibrecdocs.add_new_file(
                        fullpath,
                        doctype=paths_and_doctypes.get(path, path),
                        never_fail=True)
                    bibdoc.set_status(restriction)
                    ## Fulltext
                    if documenttype == "fulltext":
                        additionalformats = createRelatedFormats(fullpath)
                        if len(additionalformats) > 0:
                            for additionalformat in additionalformats:
                                try:
                                    bibrecdocs.add_new_format(additionalformat)
                                except InvenioBibDocFileError:
                                    pass
                    ## Icon
                    elif documenttype == "picture":
                        has_added_default_icon_subformat_p = False
                        for iconsize in iconsizes:
                            try:
                                iconpath, iconname = create_icon({
                                    'input-file': fullpath,
                                    'icon-scale': iconsize,
                                    'icon-name': None,
                                    'icon-file-format': None,
                                    'multipage-icon': False,
                                    'multipage-icon-delay': 100,
                                    'verbosity': 0,
                                })
                            except Exception:
                                register_exception(
                                    prefix='Impossible to create icon for %s (record %s)'
                                    % (fullpath, sysno),
                                    alert_admin=True)
                                continue
                            iconpath = os.path.join(iconpath, iconname)
                            docname = decompose_file(fullpath)[1]
                            try:
                                mybibdoc = bibrecdocs.get_bibdoc(docname)
                            except InvenioBibDocFileError:
                                mybibdoc = None
                            if iconpath is not None and mybibdoc is not None:
                                try:
                                    icon_suffix = iconsize.replace(
                                        '>', '').replace('<', '').replace(
                                            '^', '').replace('!', '')
                                    if not has_added_default_icon_subformat_p:
                                        mybibdoc.add_icon(iconpath)
                                        has_added_default_icon_subformat_p = True
                                    else:
                                        mybibdoc.add_icon(
                                            iconpath,
                                            subformat=CFG_BIBDOCFILE_DEFAULT_ICON_SUBFORMAT
                                            + "-" + icon_suffix)
                                    ## Save the new icon filename in a text file in curdir so that
                                    ## it can be used by templates to create the recmysl
                                    try:
                                        if not has_added_default_icon_subformat_p:
                                            fd = open(
                                                "%s/%s_ICON" % (curdir, path),
                                                "w")
                                        else:
                                            fd = open(
                                                "%s/%s_ICON_%s" %
                                                (curdir, path,
                                                 iconsize + '_' + icon_suffix),
                                                "w")
                                        fd.write(os.path.basename(iconpath))
                                        fd.close()
                                    except OSError as err:
                                        msg = "Cannot store icon filename.[%s]"
                                        msg %= str(err)
                                        raise InvenioWebSubmitFunctionWarning(
                                            msg)
                                except InvenioBibDocFileError:
                                    # Most probably the icon already existed.
                                    pass
                            elif mybibdoc is not None:
                                mybibdoc.delete_icon()

    # Update the MARC
    bibdocfile_bin = os.path.join(CFG_BINDIR, 'bibdocfile') + ' --yes-i-know'
    run_shell_command(bibdocfile_bin + " --fix-marc --recid=%s",
                      (str(sysno), ))

    # Delete the HB BibFormat cache in the DB, so that the fulltext
    # links do not point to possible dead files
    run_sql(
        "DELETE LOW_PRIORITY from bibfmt WHERE format='HB' AND id_bibrec=%s",
        (sysno, ))

    return ""