def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryrun=False):
    '''Unpack an arXiv full-text archive and convert the images it contains.

    :param ft_file: archive handed to ``file_ops.untar``
    :param bibcode: bibcode of the record; used for the database lookup
        and in the diagnostic messages
    :param arx_id: not referenced in this block
    :param category: not referenced in this block
    :param update: when true, look up the existing ``GraphicsModel`` row
        for ``bibcode`` and write a note to stderr if there is none
    :param dryrun: not referenced in this block
    :return: ``None`` on every path in this block
    '''
    # If we're updating, grab the existing database entry
    if update:
        graphic = db.session.query(GraphicsModel).filter(
            GraphicsModel.bibcode == bibcode).first()
        if not graphic:
            sys.stderr.write("Note: update for %s, but no existing record found!\n" % bibcode)
    else:
        graphic = None
    # First get lists of (La)TeX and image files
    tex_files, img_files, xdir = file_ops.untar(ft_file)
    # If we didn't find any image files, skip
    # NOTE(review): xdir is not removed on this path -- confirm whether
    # untar leaves an extraction directory behind when there are no images
    if not img_files:
        return
    # NOTE(review): not used further in this block -- confirm whether
    # later code is expected to fill it
    figures = []
    # Next convert the image files
    # All the original images that cannot be converted will be
    # removed from the list of originals
    try:
        img_files, converted_images = file_ops.convert_images(img_files)
    except Exception:
        sys.stderr.write("Image conversion barfed for %s. Skipping.\n" % bibcode)
        # Remove the temporary directory. This stays best-effort, but
        # KeyboardInterrupt/SystemExit are no longer swallowed and a
        # failed cleanup is reported instead of silently ignored.
        try:
            shutil.rmtree(xdir)
        except Exception as cleanup_error:
            sys.stderr.write("Could not remove temporary directory %s: %s\n" % (xdir, cleanup_error))
        return
def test_extract_captions(self):
    '''Test extracting captions

    Returns True without running anything when the application config
    does not enable GRAPHICS_ENABLE_UPDATES. Otherwise the stub arXiv
    archive is unpacked, its images are converted, and the captions,
    cleaned image data and context extracted from the TeX source are
    compared with the expected values. The extraction directory is
    removed afterwards, also when an assertion fails.
    '''
    if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
        return True
    from file_ops import untar
    from file_ops import convert_images
    from invenio_tools import extract_captions
    from invenio_tools import prepare_image_data
    from invenio_tools import extract_context
    # Assuming the previous tests succeeded, we know we can extract
    # the images from the archive
    archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
    TMP = self.app.config.get('GRAPHICS_TMP_DIR')
    extract_dir = "%s/NN" % TMP
    tex, imgs, sdir = untar(archive)
    try:
        # Now use the update machinery to convert the images to PNGs.
        # convert_images returns a pair (remaining originals, converted
        # images); only the converted images are needed here.
        _, converted_images = convert_images(imgs)
        # Pick the right TeX file
        tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0]
        # Extract captions
        im_data = extract_captions(tex_file, TMP, converted_images)
        # Did we get what we expected
        expected = ('', 'noimgDistance to M~51', '')
        self.assertEqual(im_data[0], expected)
        # Check cleaned data
        cleaned = prepare_image_data(im_data, tex_file, converted_images)
        self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png')
        self.assertEqual(cleaned[-1][1], 'figure05.ps')
        self.assertEqual(cleaned[-1][2], '')
        # Check extracted context
        context = extract_context(tex_file, cleaned)
        expected = ('', 'noimgDistance to M~51', '', [])
        self.assertEqual(context[0], expected)
    finally:
        # Cleanup the extracted data, even when an assertion above failed
        shutil.rmtree(extract_dir)
def test_convert_images(self):
    '''Test converting images to PNG files

    Returns True without running anything when the application config
    does not enable GRAPHICS_ENABLE_UPDATES.
    '''
    if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
        return True
    from file_ops import untar
    from file_ops import convert_images
    import magic

    def basenames(paths):
        # File names only, so the comparison ignores the directories
        return [os.path.basename(path) for path in paths]

    # Assuming the previous tests succeeded, we know we can extract
    # the images from the archive
    archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
    _tex, originals, _sdir = untar(archive)
    # Now use the update machinery to convert the images to PNGs
    remainder, converted_images = convert_images(originals)
    # No files should have failed to convert
    self.assertEqual(basenames(originals), basenames(remainder))
    # Did we get the expected PNGs?
    imgs_expected = [
        'figure01.png',
        'figure02.png',
        'figure03.png',
        'figure04.png',
        'figure05.png',
        'figure06.png',
        'figure07.png',
        'figure08.png',
        'figure09.png',
    ]
    self.assertEqual(basenames(converted_images), imgs_expected)
    # And they really are all PNG files
    png_flags = []
    for png in converted_images:
        png_flags.append('PNG' in magic.from_file(png))
    self.assertTrue(False not in png_flags)
def test_convert_images(self):
    '''Test converting images to PNG files

    When GRAPHICS_ENABLE_UPDATES is not enabled in the application
    config the test returns True immediately.
    '''
    updates_enabled = self.app.config.get('GRAPHICS_ENABLE_UPDATES', False)
    if not updates_enabled:
        return True
    from file_ops import untar
    from file_ops import convert_images
    import magic
    # Assuming the previous tests succeeded, we know we can extract
    # the images from the archive
    archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
    extracted = untar(archive)
    tex, imgs, sdir = extracted
    # Now use the update machinery to convert the images to PNGs
    conversion = convert_images(imgs)
    remainder, converted_images = conversion
    # No files should have failed to convert
    names_before = list(map(os.path.basename, imgs))
    names_after = list(map(os.path.basename, remainder))
    self.assertEqual(names_before, names_after)
    # Did we get the expected PNGs?
    imgs_expected = ['figure01.png', 'figure02.png', 'figure03.png',
                     'figure04.png', 'figure05.png', 'figure06.png',
                     'figure07.png', 'figure08.png', 'figure09.png']
    converted_names = list(map(os.path.basename, converted_images))
    self.assertEqual(converted_names, imgs_expected)
    # And they really are all PNG files; the list is built in full so
    # that every converted file is inspected before the check
    self.assertTrue(all([magic.from_file(i).find('PNG') > -1
                         for i in converted_images]))
def manage_arXiv_graphics(ft_file,
                          bibcode,
                          arx_id,
                          category,
                          update=False,
                          dryrun=False):
    '''Unpack an arXiv full-text archive and convert the images it contains.

    :param ft_file: archive handed to ``file_ops.untar``
    :param bibcode: bibcode of the record; used for the database lookup
        and in the diagnostic messages
    :param arx_id: not referenced in this block
    :param category: not referenced in this block
    :param update: when true, look up the existing ``GraphicsModel`` row
        for ``bibcode`` and write a note to stderr if there is none
    :param dryrun: not referenced in this block
    :return: ``None`` on every path in this block
    '''
    # If we're updating, grab the existing database entry
    if update:
        graphic = db.session.query(GraphicsModel).filter(
            GraphicsModel.bibcode == bibcode).first()
        if not graphic:
            sys.stderr.write(
                'Note: update for %s, but no existing record found!\n' %
                bibcode)
    else:
        graphic = None
    # First get lists of (La)TeX and image files
    tex_files, img_files, xdir = file_ops.untar(ft_file)
    # If we didn't find any image files, skip
    # NOTE(review): xdir is not removed on this path -- confirm whether
    # untar leaves an extraction directory behind when there are no images
    if not img_files:
        return
    # NOTE(review): not used further in this block -- confirm whether
    # later code is expected to fill it
    figures = []
    # Next convert the image files
    # All the original images that cannot be converted will be
    # removed from the list of originals
    try:
        img_files, converted_images = file_ops.convert_images(img_files)
    except Exception:
        sys.stderr.write('Image conversion barfed for %s. Skipping.\n' %
                         bibcode)
        # Remove the temporary directory. This stays best-effort, but
        # KeyboardInterrupt/SystemExit are no longer swallowed and a
        # failed cleanup is reported instead of silently ignored.
        try:
            shutil.rmtree(xdir)
        except Exception as cleanup_error:
            sys.stderr.write(
                'Could not remove temporary directory %s: %s\n' %
                (xdir, cleanup_error))
        return