def rotate_image(filename, line, sdir, image_list):
    """
    Rotate an image file in place as requested by a TeX rotation option.

    The first ``angle=N`` or ``rotate=N`` option found in the line is read,
    its sign is flipped, and the result is handed to ``mogrify -rotate``
    together with the file returned by get_image_location().

    @param: filename (string): the name of the file as specified in the TeX
    @param: line (string): the line where the rotate command was found
    @param: sdir: passed through to get_image_location()
    @param: image_list: passed through to get_image_location()

    @output: the image file rotated in accordance with the rotate command
    @return: True if something was rotated, False otherwise (no rotation
        option in the line, image not located, malformed angle, or mogrify
        exiting with a non-zero status)
    """
    file_loc = get_image_location(filename, sdir, image_list)
    degrees = re.findall('(angle=[-\\d]+|rotate=[-\\d]+)', line)

    if len(degrees) < 1:
        return False

    degrees = degrees[0].split('=')[-1].strip()

    if file_loc is None or file_loc == 'ERROR':
        return False

    # The regular expression above also accepts strings such as '90-' or
    # '--5'; int() rejects them, so treat those as "nothing to rotate"
    # instead of letting the ValueError escape.
    try:
        degrees = str(0 - int(degrees))
    except ValueError:
        return False

    cmd_list = ['mogrify', '-rotate', degrees, file_loc]
    exit_status, dummy, dummy = run_process_with_timeout(cmd_list)

    # Report what actually happened: the image was rotated only if mogrify
    # exited successfully.
    return exit_status == 0
def test_run_cmd_timeout_big_stdout(self):
    """shellutils - running simple command with a big standard output"""
    from invenio.config import CFG_PYLIBDIR
    test_file = os.path.join(CFG_PYLIBDIR, 'invenio',
                             'bibcirculation_templates.py')
    exitstatus, stdout, stderr = run_process_with_timeout(
        ['cat', test_file], timeout=10)
    # Read the reference content through an explicitly closed handle so the
    # test does not leak a file descriptor.
    reference = open(test_file)
    try:
        expected = reference.read()
    finally:
        reference.close()
    self.assertEqual(expected, stdout)
    self.assertEqual(exitstatus, 0)
def test_run_cmd_timeout_no_timeout(self):
    """shellutils - running simple command without expiring timeout"""
    exitstatus, stdout, stderr = run_process_with_timeout(
        [self.script_path, '5'], timeout=10)
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue('foo' in stdout)
    self.assertTrue('bar' in stderr)
    self.assertEqual(exitstatus, 0)
def execute_command_with_stderr(*args, **argd):
    """
    Wrapper to run_process_with_timeout.

    The positional arguments form the command to run; the optional
    keyword arguments 'cwd' and 'filename_out' are forwarded to
    run_process_with_timeout (None when absent).

    @return: (stdout, stderr) of the command
    @raise InvenioWebSubmitFileConverterError: if the command returns a
        non-zero status
    """
    debug("Executing: %s" % (args, ))
    working_dir = argd.get('cwd')
    output_file = argd.get('filename_out')
    res, stdout, stderr = run_process_with_timeout(
        args, cwd=working_dir, filename_out=output_file)
    if res == 0:
        return stdout, stderr
    error("Error when executing %s" % (args, ))
    raise InvenioWebSubmitFileConverterError(
        "Error in running %s\n stdout:\n%s\nstderr:\n%s\n"
        % (args, stdout, stderr))
def test_run_cmd_timeout_pgid(self):
    """shellutils - running simple command should have PID == PGID"""
    exitstatus, stdout, stderr = run_process_with_timeout(
        [self.python_script_path, '5'])
    # assertFalse/assertTrue replace the deprecated failIf/failUnless
    # aliases.
    self.assertFalse(
        'PID != PGID' in stdout,
        'PID != PGID was found in current output: %s (%s)'
        % (stdout, stderr))
    self.assertTrue(
        'PID == PGID' in stdout,
        'PID == PGID wasn\'t found in current output: %s (%s)'
        % (stdout, stderr))
def extract_text(tarball):
    """
    We check to see if there's a file called tarball.pdf, and, if there is,
    we run pdftotext on it.  Simple as that.

    @param: tarball (string): the raw name of the tarball

    @return: -1 if pdftotext wrote anything to stderr, None otherwise
    """
    pdf_file = tarball + '.pdf'
    txt_file = tarball + '.txt'
    try:
        # os.stat raises if the PDF does not exist, which sends us to the
        # 'no text' branch below.
        os.stat(pdf_file)
        # Each list element is passed to pdftotext verbatim, so the file
        # name must not carry any padding whitespace.
        cmd_list = ['pdftotext', pdf_file, txt_file]
        dummy1, dummy2, cmd_err = run_process_with_timeout(cmd_list)
        if cmd_err != '':
            return -1
        write_message('generated ' + txt_file + ' from ' + pdf_file)
    except Exception:
        # Best effort: any failure simply means no text was produced.
        write_message('no text from ' + pdf_file)
def extract_text(tarball):
    """
    We check to see if there's a file called tarball.pdf, and, if there is,
    we run pdftotext on it.  Simple as that.

    @param: tarball (string): the raw name of the tarball

    @return: -1 if pdftotext wrote anything to stderr, None otherwise
    """
    pdf_file = tarball + '.pdf'
    txt_file = tarball + '.txt'
    try:
        # os.stat raises if the PDF does not exist, which sends us to the
        # 'no text' branch below.
        os.stat(pdf_file)
        # Pass the arguments as a list rather than interpolating them into
        # a shell command line: file names containing spaces or shell
        # metacharacters are then handed to pdftotext untouched.
        dummy1, dummy2, cmd_err = run_process_with_timeout(
            ['pdftotext', pdf_file, txt_file])
        if cmd_err != '':
            return -1
        write_message('generated ' + txt_file + ' from ' + pdf_file)
    except Exception:
        # Best effort: any failure simply means no text was produced.
        write_message('no text from ' + pdf_file)
def convert_images(image_list):
    """
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    Directories are skipped.  Files that ``file`` reports as PNG images are
    kept as they are; everything else is run through ImageMagick's
    ``convert`` and the converted file is kept only if convert wrote
    nothing to stderr.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    """
    png_output_contains = 'PNG image'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.
        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if png_output_contains in cmd_out:
            ret_list.append(image_file)
            continue

        # we're just going to assume that ImageMagick can convert all
        # the image types that we may be faced with
        # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
        # and PSTEX->PNG
        converted_image_file = get_converted_image_name(image_file)
        try:
            # The file names come out of an extracted tarball, so they are
            # passed as an argument list instead of being interpolated
            # into a shell command line.
            dummy1, cmd_out, cmd_err = run_process_with_timeout(
                ['convert', image_file, converted_image_file])
        except Timeout:
            write_message('convert timed out on ' + image_file)
            continue

        if cmd_err == '':
            ret_list.append(converted_image_file)
        else:
            write_message('convert failed on ' + image_file)

    return ret_list
def test_run_cmd_viasudo_no_password(self):
    """shellutils - running simple command via sudo should not wait for password"""
    # The command is run as user 'foo' through sudo and is expected to
    # come back with a non-zero exit status.
    command = [self.script_path, '5']
    status, dummy_out, dummy_err = run_process_with_timeout(
        command, timeout=10, sudo='foo')
    self.assertNotEqual(status, 0)
def test_run_cmd_timeout_pgid(self):
    """shellutils - running simple command should have PID == PGID"""
    exitstatus, stdout, stderr = run_process_with_timeout(
        [self.python_script_path, '5'])
    # assertFalse/assertTrue replace the deprecated failIf/failUnless
    # aliases.
    self.assertFalse(
        'PID != PGID' in stdout,
        'PID != PGID was found in current output: %s (%s)'
        % (stdout, stderr))
    self.assertTrue(
        'PID == PGID' in stdout,
        'PID == PGID wasn\'t found in current output: %s (%s)'
        % (stdout, stderr))
def test_run_cmd_timeout_no_timeout(self):
    """shellutils - running simple command without expiring timeout"""
    exitstatus, stdout, stderr = run_process_with_timeout(
        [self.script_path, '5'], timeout=10)
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue('foo' in stdout)
    self.assertTrue('bar' in stderr)
    self.assertEqual(exitstatus, 0)
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs.  If they
    'contain' only the TeX file, then they are just that file.), then
    we untar it if so and decide which of its constituents are the
    TeX file and which are the images.

    @param: original_tarball (string): the name of the tar file from arXiv
    @param: sdir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex)
        (([string, ...], [string, ...], [string, ...])): every extracted
        file, the ones that look like images and the ones that look like
        TeX source.  ([], [], None) is returned when the file is not a tar
        archive or when tar wrote to stderr; ([], [], []) is returned when
        no TeX candidate was found.
    """
    # NOTE(review): check_for_gzip presumably returns the name of the
    # uncompressed file, which may differ from original_tarball - confirm.
    tarball = check_for_gzip(original_tarball)
    dummy1, cmd_out, cmd_err = run_shell_command('file %s', (tarball, ))
    tarball_output = 'tar archive'
    if re.search(tarball_output, cmd_out) is None:
        run_shell_command('rm %s', (tarball, ))
        return ([], [], None)

    cmd_list = ['tar', 'xvf', tarball, '-C', sdir]
    dummy1, cmd_out, cmd_err = run_process_with_timeout(cmd_list)
    if cmd_err != '':
        return ([], [], None)

    if original_tarball != tarball:
        run_shell_command('rm %s', (tarball, ))

    # 'tar xvf' lists one extracted member per line
    extracted_names = cmd_out.split('\n')

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'
    image_file_extensions = ['eps', 'png', 'ps', 'jpg', 'pdf']

    file_list = []
    image_list = []
    might_be_tex = []

    for extracted_file in extracted_names:
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, file_out, dummy2 = run_shell_command('file %s',
                                                     (extracted_file, ))
        file_out_lower = file_out.lower()
        # 'file' prints "filename.ext: description"; only a match located
        # after the colon is in the description and not in the filename
        colon_pos = file_out.find(':')
        # compare extensions case-insensitively for TeX and images alike
        extension = extracted_file.split('.')[-1].lower()

        # is it TeX?
        if file_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_out_lower.find(image_output_contains) > colon_pos \
                or file_out_lower.find(eps_output_contains) > colon_pos \
                or file_out.find(ps_output_contains) > colon_pos:
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        elif extension == tex_file_extension:
            # we might have tex source!
            might_be_tex.append(extracted_file)
        elif extension in image_file_extensions:
            # we might have an image!
            image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])
    return (file_list, image_list, might_be_tex)
def test_run_cmd_no_timeout(self):
    """shellutils - running simple command with non expiring timeout"""
    t1 = time.time()
    stdout = run_process_with_timeout((self.script_path, '5'),
                                      timeout=15)[1]
    self.assertEqual(3, len(stdout.split('\n')))
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue(time.time() - t1 < 7)
def untar(original_tarball, sdir):
    """
    Here we decide if our file is actually a tarball (sometimes the
    'tarballs' gotten from arXiv aren't actually tarballs.  If they
    'contain' only the TeX file, then they are just that file.), then
    we untar it if so and decide which of its constituents are the
    TeX file and which are the images.

    @param: original_tarball (string): the name of the tar file from arXiv
    @param: sdir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex)
        (([string, ...], [string, ...], [string, ...])): every extracted
        file, the ones that look like images and the ones that look like
        TeX source.  ([], [], None) is returned when the file is not a tar
        archive or when tar wrote to stderr; ([], [], []) is returned when
        no TeX candidate was found.
    """
    # NOTE(review): check_for_gzip presumably returns the name of the
    # uncompressed file, which may differ from original_tarball - confirm.
    tarball = check_for_gzip(original_tarball)
    dummy1, cmd_out, cmd_err = run_shell_command('file %s', (tarball,))
    tarball_output = 'tar archive'
    if re.search(tarball_output, cmd_out) is None:
        run_shell_command('rm %s', (tarball,))
        return ([], [], None)

    # Pass tar its arguments as a list instead of interpolating them into
    # a shell command line, so that paths containing spaces or shell
    # metacharacters reach tar untouched.
    dummy1, cmd_out, cmd_err = run_process_with_timeout(
        ['tar', 'xvf', tarball, '-C', sdir])
    if cmd_err != '':
        return ([], [], None)

    if original_tarball != tarball:
        run_shell_command('rm %s', (tarball,))

    # 'tar xvf' lists one extracted member per line
    extracted_names = cmd_out.split('\n')

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'
    image_file_extensions = ['eps', 'png', 'ps', 'jpg', 'pdf']

    file_list = []
    image_list = []
    might_be_tex = []

    for extracted_file in extracted_names:
        if extracted_file == '':
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        dummy1, file_out, dummy2 = run_shell_command('file %s',
                                                     (extracted_file,))
        file_out_lower = file_out.lower()
        # 'file' prints "filename.ext: description"; only a match located
        # after the colon is in the description and not in the filename
        colon_pos = file_out.find(':')
        # compare extensions case-insensitively so that e.g. 'PAPER.TEX'
        # or 'fig.EPS' are still recognised
        extension = extracted_file.split('.')[-1].lower()

        # is it TeX?
        if file_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_out_lower.find(image_output_contains) > colon_pos \
                or file_out_lower.find(eps_output_contains) > colon_pos \
                or file_out.find(ps_output_contains) > colon_pos:
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        elif extension == tex_file_extension:
            # we might have tex source!
            might_be_tex.append(extracted_file)
        elif extension in image_file_extensions:
            # we might have an image!
            image_list.append(extracted_file)

    if might_be_tex == []:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])
    return (file_list, image_list, might_be_tex)